Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/IPO'): 32 files changed, 3598 insertions, 1470 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index 532599b42e0d..01e724e22dcf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -73,8 +73,8 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, }, ORE); assert(OIC); - emitInlinedInto(ORE, CB->getDebugLoc(), CB->getParent(), F, *Caller, - *OIC, false, DEBUG_TYPE); + emitInlinedIntoBasedOnCost(ORE, CB->getDebugLoc(), CB->getParent(), F, + *Caller, *OIC, false, DEBUG_TYPE); InlineFunctionInfo IFI( /*cg=*/nullptr, GetAssumptionCache, &PSI, @@ -108,8 +108,10 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, // Delete the non-comdat ones from the module and also from our vector. auto NonComdatBegin = partition( InlinedFunctions, [&](Function *F) { return F->hasComdat(); }); - for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end())) + for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end())) { M.getFunctionList().erase(F); + Changed = true; + } InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end()); if (!InlinedFunctions.empty()) { @@ -117,8 +119,10 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, // are not actually dead. filterDeadComdatFunctions(M, InlinedFunctions); // The remaining functions are actually dead. - for (Function *F : InlinedFunctions) + for (Function *F : InlinedFunctions) { M.getFunctionList().erase(F); + Changed = true; + } } return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index f670a101767e..93bb11433775 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -148,7 +148,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, } else if (!ArgsToPromote.count(&*I)) { // Unchanged argument Params.push_back(I->getType()); - ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(PAL.getParamAttrs(ArgNo)); } else if (I->use_empty()) { // Dead argument (which are always marked as promotable) ++NumArgumentsDead; @@ -177,9 +177,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Since loads will only have a single operand, and GEPs only a single // non-index operand, this will record direct loads without any indices, // and gep+loads with the GEP indices. - for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end(); - II != IE; ++II) - Indices.push_back(cast<ConstantInt>(*II)->getSExtValue()); + for (const Use &I : llvm::drop_begin(UI->operands())) + Indices.push_back(cast<ConstantInt>(I)->getSExtValue()); // GEPs with a single 0 index can be merged with direct loads if (Indices.size() == 1 && Indices.front() == 0) Indices.clear(); @@ -231,8 +230,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Recompute the parameter attributes list based on the new arguments for // the function. 
- NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(), - PAL.getRetAttributes(), ArgAttrVec)); + NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(), + PAL.getRetAttrs(), ArgAttrVec)); ArgAttrVec.clear(); F->getParent()->getFunctionList().insert(F->getIterator(), NF); @@ -257,7 +256,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, ++I, ++AI, ++ArgNo) if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { Args.push_back(*AI); // Unmodified argument - ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo)); } else if (ByValArgsToTransform.count(&*I)) { // Emit a GEP and load for each element of the struct. Type *AgTy = I->getParamByValType(); @@ -313,9 +312,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val"); newLoad->setAlignment(OrigLoad->getAlign()); // Transfer the AA info too. - AAMDNodes AAInfo; - OrigLoad->getAAMetadata(AAInfo); - newLoad->setAAMetadata(AAInfo); + newLoad->setAAMetadata(OrigLoad->getAAMetadata()); Args.push_back(newLoad); ArgAttrVec.push_back(AttributeSet()); @@ -325,7 +322,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Push any varargs arguments on the list. for (; AI != CB.arg_end(); ++AI, ++ArgNo) { Args.push_back(*AI); - ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo)); } SmallVector<OperandBundleDef, 1> OpBundles; @@ -341,9 +338,9 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, NewCS = NewCall; } NewCS->setCallingConv(CB.getCallingConv()); - NewCS->setAttributes( - AttributeList::get(F->getContext(), CallPAL.getFnAttributes(), - CallPAL.getRetAttributes(), ArgAttrVec)); + NewCS->setAttributes(AttributeList::get(F->getContext(), + CallPAL.getFnAttrs(), + CallPAL.getRetAttrs(), ArgAttrVec)); NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg}); Args.clear(); ArgAttrVec.clear(); @@ -1018,11 +1015,12 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, do { LocalChange = false; + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); + for (LazyCallGraph::Node &N : C) { Function &OldF = N.getFunction(); - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); // FIXME: This lambda must only be used with this function. We should // skip the lambda and just get the AA results directly. auto AARGetter = [&](Function &F) -> AAResults & { @@ -1045,6 +1043,13 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, C.getOuterRefSCC().replaceNodeFunction(N, *NewF); FAM.clear(OldF, OldF.getName()); OldF.eraseFromParent(); + + PreservedAnalyses FuncPA; + FuncPA.preserveSet<CFGAnalyses>(); + for (auto *U : NewF->users()) { + auto *UserF = cast<CallBase>(U)->getFunction(); + FAM.invalidate(*UserF, FuncPA); + } } Changed |= LocalChange; @@ -1053,7 +1058,12 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, if (!Changed) return PreservedAnalyses::all(); - return PreservedAnalyses::none(); + PreservedAnalyses PA; + // We've cleared out analyses for deleted functions. + PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); + // We've manually invalidated analyses for functions we've modified. 
+ PA.preserveSet<AllAnalysesOn<Function>>(); + return PA; } namespace { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp index 91b16ec66ee3..edadc79e3a9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp @@ -382,30 +382,30 @@ static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, if (Attr.isEnumAttribute()) { Attribute::AttrKind Kind = Attr.getKindAsEnum(); - if (Attrs.hasAttribute(AttrIdx, Kind)) + if (Attrs.hasAttributeAtIndex(AttrIdx, Kind)) if (!ForceReplace && - isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind))) return false; - Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr); return true; } if (Attr.isStringAttribute()) { StringRef Kind = Attr.getKindAsString(); - if (Attrs.hasAttribute(AttrIdx, Kind)) + if (Attrs.hasAttributeAtIndex(AttrIdx, Kind)) if (!ForceReplace && - isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind))) return false; - Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr); return true; } if (Attr.isIntAttribute()) { Attribute::AttrKind Kind = Attr.getKindAsEnum(); - if (Attrs.hasAttribute(AttrIdx, Kind)) + if (Attrs.hasAttributeAtIndex(AttrIdx, Kind)) if (!ForceReplace && - isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind))) return false; - Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind); - Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + Attrs = Attrs.removeAttributeAtIndex(Ctx, AttrIdx, Kind); + Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr); return true; } @@ -658,9 +658,9 @@ bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK, else AttrList = getAssociatedFunction()->getAttributes(); - bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK); + bool HasAttr = AttrList.hasAttributeAtIndex(getAttrIdx(), AK); if (HasAttr) - Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK)); + Attrs.push_back(AttrList.getAttributeAtIndex(getAttrIdx(), AK)); return HasAttr; } @@ -1043,6 +1043,8 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred, if (auto *SI = dyn_cast<StoreInst>(U->getUser())) { if (&SI->getOperandUse(0) == U) { + if (!Visited.insert(U).second) + continue; SmallSetVector<Value *, 4> PotentialCopies; if (AA::getPotentialCopiesOfStoredValue(*this, *SI, PotentialCopies, QueryingAA, @@ -1121,6 +1123,10 @@ bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred, if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) { if (CE->isCast() && CE->getType()->isPointerTy() && CE->getType()->getPointerElementType()->isFunctionTy()) { + LLVM_DEBUG( + dbgs() << "[Attributor] Use, is constant cast expression, add " + << CE->getNumUses() + << " uses of that expression instead!\n"); for (const Use &CEU : CE->uses()) Uses.push_back(&CEU); continue; @@ -1141,9 +1147,13 @@ bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred, const Use *EffectiveUse = ACS.isCallbackCall() ? 
&ACS.getCalleeUseForCallback() : &U; if (!ACS.isCallee(EffectiveUse)) { - if (!RequireAllCallSites) + if (!RequireAllCallSites) { + LLVM_DEBUG(dbgs() << "[Attributor] User " << *EffectiveUse->getUser() + << " is not a call of " << Fn.getName() + << ", skip use\n"); continue; - LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser() + } + LLVM_DEBUG(dbgs() << "[Attributor] User " << *EffectiveUse->getUser() << " is an invalid use of " << Fn.getName() << "\n"); return false; } @@ -1413,6 +1423,16 @@ void Attributor::runTillFixpoint() { } while (!Worklist.empty() && (IterationCounter++ < MaxFixedPointIterations || VerifyMaxFixpointIterations)); + if (IterationCounter > MaxFixedPointIterations && !Worklist.empty()) { + auto Remark = [&](OptimizationRemarkMissed ORM) { + return ORM << "Attributor did not reach a fixpoint after " + << ore::NV("Iterations", MaxFixedPointIterations) + << " iterations."; + }; + Function *F = Worklist.front()->getIRPosition().getAssociatedFunction(); + emitRemark<OptimizationRemarkMissed>(F, "FixedPoint", Remark); + } + LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: " << IterationCounter << "/" << MaxFixpointIterations << " iterations\n"); @@ -1922,7 +1942,7 @@ void Attributor::createShallowWrapper(Function &F) { CallInst *CI = CallInst::Create(&F, Args, "", EntryBB); CI->setTailCall(true); - CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline); + CI->addFnAttr(Attribute::NoInline); ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB); NumFnShallowWrappersCreated++; @@ -2015,7 +2035,8 @@ bool Attributor::isValidFunctionSignatureRewrite( if (!RewriteSignatures) return false; - auto CallSiteCanBeChanged = [](AbstractCallSite ACS) { + Function *Fn = Arg.getParent(); + auto CallSiteCanBeChanged = [Fn](AbstractCallSite ACS) { // Forbid the call site to cast the function return type. If we need to // rewrite these functions we need to re-create a cast for the new call site // (if the old had uses). @@ -2023,11 +2044,12 @@ bool Attributor::isValidFunctionSignatureRewrite( ACS.getInstruction()->getType() != ACS.getCalledFunction()->getReturnType()) return false; + if (ACS.getCalledOperand()->getType() != Fn->getType()) + return false; // Forbid must-tail calls for now. return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall(); }; - Function *Fn = Arg.getParent(); // Avoid var-arg functions for now. if (Fn->isVarArg()) { LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n"); @@ -2157,7 +2179,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } else { NewArgumentTypes.push_back(Arg.getType()); NewArgumentAttributes.push_back( - OldFnAttributeList.getParamAttributes(Arg.getArgNo())); + OldFnAttributeList.getParamAttrs(Arg.getArgNo())); } } @@ -2188,8 +2210,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( // the function. 
LLVMContext &Ctx = OldFn->getContext(); NewFn->setAttributes(AttributeList::get( - Ctx, OldFnAttributeList.getFnAttributes(), - OldFnAttributeList.getRetAttributes(), NewArgumentAttributes)); + Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(), + NewArgumentAttributes)); // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the @@ -2234,7 +2256,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } else { NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum)); NewArgOperandAttributes.push_back( - OldCallAttributeList.getParamAttributes(OldArgNum)); + OldCallAttributeList.getParamAttrs(OldArgNum)); } } @@ -2264,8 +2286,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( NewCB->setCallingConv(OldCB->getCallingConv()); NewCB->takeName(OldCB); NewCB->setAttributes(AttributeList::get( - Ctx, OldCallAttributeList.getFnAttributes(), - OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes)); + Ctx, OldCallAttributeList.getFnAttrs(), + OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes)); CallSitePairs.push_back({OldCB, NewCB}); return true; @@ -2480,6 +2502,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Every function can be "readnone/argmemonly/inaccessiblememonly/...". getOrCreateAAFor<AAMemoryLocation>(FPos); + // Every function can track active assumptions. + getOrCreateAAFor<AAAssumptionInfo>(FPos); + // Every function might be applicable for Heap-To-Stack conversion. if (EnableHeapToStack) getOrCreateAAFor<AAHeapToStack>(FPos); @@ -2565,6 +2590,7 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { auto CallSitePred = [&](Instruction &I) -> bool { auto &CB = cast<CallBase>(I); IRPosition CBRetPos = IRPosition::callsite_returned(CB); + IRPosition CBFnPos = IRPosition::callsite_function(CB); // Call sites might be dead if they do not have side effects and no live // users. The return value might be dead if there are no live users. @@ -2576,6 +2602,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { if (!Callee) return true; + // Every call site can track active assumptions. + getOrCreateAAFor<AAAssumptionInfo>(CBFnPos); + // Skip declarations except if annotations on their call sites were // explicitly requested. 
if (!AnnotateDeclarationCallSites && Callee->isDeclaration() && @@ -2588,7 +2617,7 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { getOrCreateAAFor<AAValueSimplify>(CBRetPos); } - for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) { + for (int I = 0, E = CB.arg_size(); I < E; ++I) { IRPosition CBArgPos = IRPosition::callsite_argument(CB, I); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 3529923a9082..ec08287393de 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -28,6 +29,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Assumptions.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" @@ -146,6 +148,7 @@ PIPE_OPERATOR(AANoUndef) PIPE_OPERATOR(AACallEdges) PIPE_OPERATOR(AAFunctionReachability) PIPE_OPERATOR(AAPointerInfo) +PIPE_OPERATOR(AAAssumptionInfo) #undef PIPE_OPERATOR @@ -203,46 +206,25 @@ static Value *constructPointer(Type *ResTy, Type *PtrElemTy, Value *Ptr, << "-bytes as " << *ResTy << "\n"); if (Offset) { - SmallVector<Value *, 4> Indices; - std::string GEPName = Ptr->getName().str() + ".0"; - - // Add 0 index to look through the pointer. - assert((uint64_t)Offset < DL.getTypeAllocSize(PtrElemTy) && - "Offset out of bounds"); - Indices.push_back(Constant::getNullValue(IRB.getInt32Ty())); - Type *Ty = PtrElemTy; - do { - auto *STy = dyn_cast<StructType>(Ty); - if (!STy) - // Non-aggregate type, we cast and make byte-wise progress now. - break; - - const StructLayout *SL = DL.getStructLayout(STy); - if (int64_t(SL->getSizeInBytes()) < Offset) - break; - - uint64_t Idx = SL->getElementContainingOffset(Offset); - assert(Idx < STy->getNumElements() && "Offset calculation error!"); - uint64_t Rem = Offset - SL->getElementOffset(Idx); - Ty = STy->getElementType(Idx); - - LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset - << " Idx: " << Idx << " Rem: " << Rem << "\n"); + APInt IntOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset); + SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(Ty, IntOffset); - GEPName += "." + std::to_string(Idx); - Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx)); - Offset = Rem; - } while (Offset); + SmallVector<Value *, 4> ValIndices; + std::string GEPName = Ptr->getName().str(); + for (const APInt &Index : IntIndices) { + ValIndices.push_back(IRB.getInt(Index)); + GEPName += "." + std::to_string(Index.getZExtValue()); + } // Create a GEP for the indices collected above. - Ptr = IRB.CreateGEP(PtrElemTy, Ptr, Indices, GEPName); + Ptr = IRB.CreateGEP(PtrElemTy, Ptr, ValIndices, GEPName); // If an offset is left we use byte-wise adjustment. 
- if (Offset) { + if (IntOffset != 0) { Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy()); - Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt32(Offset), - GEPName + ".b" + Twine(Offset)); + Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(IntOffset), + GEPName + ".b" + Twine(IntOffset.getZExtValue())); } } @@ -431,6 +413,7 @@ const Value *stripAndAccumulateMinimalOffsets( }; return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds, + /* AllowInvariant */ false, AttributorAnalysis); } @@ -503,6 +486,7 @@ static void clampReturnedValueStates( S ^= *T; } +namespace { /// Helper class for generic deduction: return value -> returned position. template <typename AAType, typename BaseType, typename StateType = typename BaseType::StateType, @@ -661,6 +645,7 @@ struct AACallSiteReturnedFromReturned : public BaseType { return clampStateAndIndicateChange(S, AA.getState()); } }; +} // namespace /// Helper function to accumulate uses. template <class AAType, typename StateType = typename AAType::StateType> @@ -1051,6 +1036,7 @@ private: BooleanState BS; }; +namespace { struct AAPointerInfoImpl : public StateWrapper<AA::PointerInfo::State, AAPointerInfo> { using BaseTy = StateWrapper<AA::PointerInfo::State, AAPointerInfo>; @@ -1207,7 +1193,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { } SmallVector<Value *, 8> Indices; - for (Use &Idx : llvm::make_range(GEP->idx_begin(), GEP->idx_end())) { + for (Use &Idx : GEP->indices()) { if (auto *CIdx = dyn_cast<ConstantInt>(Idx)) { Indices.push_back(CIdx); continue; @@ -1244,7 +1230,11 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { } // Check if the PHI operand is not dependent on the PHI itself. - APInt Offset(DL.getIndexTypeSizeInBits(AssociatedValue.getType()), 0); + // TODO: This is not great as we look at the pointer type. However, it + // is unclear where the Offset size comes from with typeless pointers. + APInt Offset( + DL.getIndexSizeInBits(CurPtr->getType()->getPointerAddressSpace()), + 0); if (&AssociatedValue == CurPtr->stripAndAccumulateConstantOffsets( DL, Offset, /* AllowNonInbounds */ true)) { if (Offset != PtrOI.Offset) { @@ -2432,6 +2422,10 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { const size_t NoUBPrevSize = AssumedNoUBInsts.size(); auto InspectMemAccessInstForUB = [&](Instruction &I) { + // Lang ref now states volatile store is not UB, let's skip them. + if (I.isVolatile() && I.mayWriteToMemory()) + return true; + // Skip instructions that are already saved. if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) return true; @@ -2511,7 +2505,7 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { Function *Callee = CB.getCalledFunction(); if (!Callee) return true; - for (unsigned idx = 0; idx < CB.getNumArgOperands(); idx++) { + for (unsigned idx = 0; idx < CB.arg_size(); idx++) { // If current argument is known to be simplified to null pointer and the // corresponding argument position is known to have nonnull attribute, // the argument is poison. Furthermore, if the argument is poison and @@ -3179,8 +3173,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // value passed at this call site. 
// TODO: AbstractCallSite const auto &CB = cast<CallBase>(getAnchorValue()); - for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands(); - OtherArgNo++) + for (unsigned OtherArgNo = 0; OtherArgNo < CB.arg_size(); OtherArgNo++) if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo)) return false; @@ -3398,6 +3391,10 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl { } bool isDeadStore(Attributor &A, StoreInst &SI) { + // Lang ref now states volatile store is not UB/dead, let's skip them. + if (SI.isVolatile()) + return false; + bool UsedAssumedInformation = false; SmallSetVector<Value *, 4> PotentialCopies; if (!AA::getPotentialCopiesOfStoredValue(A, SI, PotentialCopies, *this, @@ -5083,6 +5080,7 @@ struct AANoCaptureCallSiteReturned final : AANoCaptureImpl { STATS_DECLTRACK_CSRET_ATTR(nocapture) } }; +} // namespace /// ------------------ Value Simplify Attribute ---------------------------- @@ -5103,6 +5101,7 @@ bool ValueSimplifyStateType::unionAssumed(Optional<Value *> Other) { return true; } +namespace { struct AAValueSimplifyImpl : AAValueSimplify { AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A) : AAValueSimplify(IRP, A) {} @@ -6508,7 +6507,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) { CallBase *DC = cast<CallBase>(ACS.getInstruction()); int DCArgNo = ACS.getCallArgOperandNo(ArgNo); - assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() && + assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->arg_size() && "Expected a direct call operand for callback call operand"); LLVM_DEBUG({ @@ -7331,10 +7330,12 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use &U, case Instruction::Store: // Stores cause the NO_WRITES property to disappear if the use is the - // pointer operand. Note that we do assume that capturing was taken care of - // somewhere else. + // pointer operand. Note that while capturing was taken care of somewhere + // else we need to deal with stores of the value that is not looked through. if (cast<StoreInst>(UserI)->getPointerOperand() == U.get()) removeAssumedBits(NO_WRITES); + else + indicatePessimisticFixpoint(); return; case Instruction::Call: @@ -7380,6 +7381,7 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use &U, if (UserI->mayWriteToMemory()) removeAssumedBits(NO_WRITES); } +} // namespace /// -------------------- Memory Locations Attributes --------------------------- /// Includes read-none, argmemonly, inaccessiblememonly, @@ -7672,11 +7674,14 @@ void AAMemoryLocationImpl::categorizePtrValue( assert(!isa<GEPOperator>(Obj) && "GEPs should have been stripped."); if (isa<UndefValue>(Obj)) continue; - if (auto *Arg = dyn_cast<Argument>(Obj)) { - if (Arg->hasByValAttr()) - MLK = NO_LOCAL_MEM; - else - MLK = NO_ARGUMENT_MEM; + if (isa<Argument>(Obj)) { + // TODO: For now we do not treat byval arguments as local copies performed + // on the call edge, though, we should. To make that happen we need to + // teach various passes, e.g., DSE, about the copy effect of a byval. That + // would also allow us to mark functions only accessing byval arguments as + // readnone again, atguably their acceses have no effect outside of the + // function, like accesses to allocas. + MLK = NO_ARGUMENT_MEM; } else if (auto *GV = dyn_cast<GlobalValue>(Obj)) { // Reading constant memory is not treated as a read "effect" by the // function attr pass so we won't neither. 
Constants defined by TBAA are @@ -7722,7 +7727,7 @@ void AAMemoryLocationImpl::categorizePtrValue( void AAMemoryLocationImpl::categorizeArgumentPointerLocations( Attributor &A, CallBase &CB, AAMemoryLocation::StateType &AccessedLocs, bool &Changed) { - for (unsigned ArgNo = 0, E = CB.getNumArgOperands(); ArgNo < E; ++ArgNo) { + for (unsigned ArgNo = 0, E = CB.arg_size(); ArgNo < E; ++ArgNo) { // Skip non-pointer arguments. const Value *ArgOp = CB.getArgOperand(ArgNo); @@ -8655,31 +8660,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { static bool calculateICmpInst(const ICmpInst *ICI, const APInt &LHS, const APInt &RHS) { - ICmpInst::Predicate Pred = ICI->getPredicate(); - switch (Pred) { - case ICmpInst::ICMP_UGT: - return LHS.ugt(RHS); - case ICmpInst::ICMP_SGT: - return LHS.sgt(RHS); - case ICmpInst::ICMP_EQ: - return LHS.eq(RHS); - case ICmpInst::ICMP_UGE: - return LHS.uge(RHS); - case ICmpInst::ICMP_SGE: - return LHS.sge(RHS); - case ICmpInst::ICMP_ULT: - return LHS.ult(RHS); - case ICmpInst::ICMP_SLT: - return LHS.slt(RHS); - case ICmpInst::ICMP_NE: - return LHS.ne(RHS); - case ICmpInst::ICMP_ULE: - return LHS.ule(RHS); - case ICmpInst::ICMP_SLE: - return LHS.sle(RHS); - default: - llvm_unreachable("Invalid ICmp predicate!"); - } + return ICmpInst::compare(LHS, RHS, ICI->getPredicate()); } static APInt calculateCastInst(const CastInst *CI, const APInt &Src, @@ -8719,25 +8700,25 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { case Instruction::Mul: return LHS * RHS; case Instruction::UDiv: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } return LHS.udiv(RHS); case Instruction::SDiv: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } return LHS.sdiv(RHS); case Instruction::URem: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } return LHS.urem(RHS); case Instruction::SRem: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } @@ -9336,32 +9317,69 @@ struct AANoUndefCallSiteReturned final void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noundef) } }; -struct AACallEdgesFunction : public AACallEdges { - AACallEdgesFunction(const IRPosition &IRP, Attributor &A) - : AACallEdges(IRP, A) {} +struct AACallEdgesImpl : public AACallEdges { + AACallEdgesImpl(const IRPosition &IRP, Attributor &A) : AACallEdges(IRP, A) {} + virtual const SetVector<Function *> &getOptimisticEdges() const override { + return CalledFunctions; + } + + virtual bool hasUnknownCallee() const override { return HasUnknownCallee; } + + virtual bool hasNonAsmUnknownCallee() const override { + return HasUnknownCalleeNonAsm; + } + + const std::string getAsStr() const override { + return "CallEdges[" + std::to_string(HasUnknownCallee) + "," + + std::to_string(CalledFunctions.size()) + "]"; + } + + void trackStatistics() const override {} + +protected: + void addCalledFunction(Function *Fn, ChangeStatus &Change) { + if (CalledFunctions.insert(Fn)) { + Change = ChangeStatus::CHANGED; + LLVM_DEBUG(dbgs() << "[AACallEdges] New call edge: " << Fn->getName() + << "\n"); + } + } + + void setHasUnknownCallee(bool NonAsm, ChangeStatus &Change) { + if (!HasUnknownCallee) + Change = ChangeStatus::CHANGED; + if (NonAsm && !HasUnknownCalleeNonAsm) + Change = ChangeStatus::CHANGED; + HasUnknownCalleeNonAsm |= NonAsm; + HasUnknownCallee = true; + } + +private: + /// Optimistic set of functions that might be called by this position. 
+ SetVector<Function *> CalledFunctions; + + /// Is there any call with a unknown callee. + bool HasUnknownCallee = false; + + /// Is there any call with a unknown callee, excluding any inline asm. + bool HasUnknownCalleeNonAsm = false; +}; + +struct AACallEdgesCallSite : public AACallEdgesImpl { + AACallEdgesCallSite(const IRPosition &IRP, Attributor &A) + : AACallEdgesImpl(IRP, A) {} /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Change = ChangeStatus::UNCHANGED; - bool OldHasUnknownCallee = HasUnknownCallee; - bool OldHasUnknownCalleeNonAsm = HasUnknownCalleeNonAsm; - - auto AddCalledFunction = [&](Function *Fn) { - if (CalledFunctions.insert(Fn)) { - Change = ChangeStatus::CHANGED; - LLVM_DEBUG(dbgs() << "[AACallEdges] New call edge: " << Fn->getName() - << "\n"); - } - }; auto VisitValue = [&](Value &V, const Instruction *CtxI, bool &HasUnknown, bool Stripped) -> bool { if (Function *Fn = dyn_cast<Function>(&V)) { - AddCalledFunction(Fn); + addCalledFunction(Fn, Change); } else { LLVM_DEBUG(dbgs() << "[AACallEdges] Unrecognized value: " << V << "\n"); - HasUnknown = true; - HasUnknownCalleeNonAsm = true; + setHasUnknownCallee(true, Change); } // Explore all values. @@ -9369,44 +9387,67 @@ struct AACallEdgesFunction : public AACallEdges { }; // Process any value that we might call. - auto ProcessCalledOperand = [&](Value *V, Instruction *Ctx) { + auto ProcessCalledOperand = [&](Value *V) { + bool DummyValue = false; if (!genericValueTraversal<bool>(A, IRPosition::value(*V), *this, - HasUnknownCallee, VisitValue, nullptr, + DummyValue, VisitValue, nullptr, false)) { // If we haven't gone through all values, assume that there are unknown // callees. - HasUnknownCallee = true; - HasUnknownCalleeNonAsm = true; + setHasUnknownCallee(true, Change); } }; - auto ProcessCallInst = [&](Instruction &Inst) { - CallBase &CB = static_cast<CallBase &>(Inst); - if (CB.isInlineAsm()) { - HasUnknownCallee = true; - return true; - } + CallBase *CB = static_cast<CallBase *>(getCtxI()); - // Process callee metadata if available. - if (auto *MD = Inst.getMetadata(LLVMContext::MD_callees)) { - for (auto &Op : MD->operands()) { - Function *Callee = mdconst::extract_or_null<Function>(Op); - if (Callee) - AddCalledFunction(Callee); - } - // Callees metadata grantees that the called function is one of its - // operands, So we are done. - return true; + if (CB->isInlineAsm()) { + setHasUnknownCallee(false, Change); + return Change; + } + + // Process callee metadata if available. + if (auto *MD = getCtxI()->getMetadata(LLVMContext::MD_callees)) { + for (auto &Op : MD->operands()) { + Function *Callee = mdconst::dyn_extract_or_null<Function>(Op); + if (Callee) + addCalledFunction(Callee, Change); } + return Change; + } - // The most simple case. - ProcessCalledOperand(CB.getCalledOperand(), &Inst); + // The most simple case. + ProcessCalledOperand(CB->getCalledOperand()); - // Process callback functions. - SmallVector<const Use *, 4u> CallbackUses; - AbstractCallSite::getCallbackUses(CB, CallbackUses); - for (const Use *U : CallbackUses) - ProcessCalledOperand(U->get(), &Inst); + // Process callback functions. 
+ SmallVector<const Use *, 4u> CallbackUses; + AbstractCallSite::getCallbackUses(*CB, CallbackUses); + for (const Use *U : CallbackUses) + ProcessCalledOperand(U->get()); + + return Change; + } +}; + +struct AACallEdgesFunction : public AACallEdgesImpl { + AACallEdgesFunction(const IRPosition &IRP, Attributor &A) + : AACallEdgesImpl(IRP, A) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + auto ProcessCallInst = [&](Instruction &Inst) { + CallBase &CB = static_cast<CallBase &>(Inst); + + auto &CBEdges = A.getAAFor<AACallEdges>( + *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED); + if (CBEdges.hasNonAsmUnknownCallee()) + setHasUnknownCallee(true, Change); + if (CBEdges.hasUnknownCallee()) + setHasUnknownCallee(false, Change); + + for (Function *F : CBEdges.getOptimisticEdges()) + addCalledFunction(F, Change); return true; }; @@ -9417,155 +9458,323 @@ struct AACallEdgesFunction : public AACallEdges { UsedAssumedInformation)) { // If we haven't looked at all call like instructions, assume that there // are unknown callees. - HasUnknownCallee = true; - HasUnknownCalleeNonAsm = true; + setHasUnknownCallee(true, Change); } - // Track changes. - if (OldHasUnknownCallee != HasUnknownCallee || - OldHasUnknownCalleeNonAsm != HasUnknownCalleeNonAsm) - Change = ChangeStatus::CHANGED; - return Change; } +}; - virtual const SetVector<Function *> &getOptimisticEdges() const override { - return CalledFunctions; - }; +struct AAFunctionReachabilityFunction : public AAFunctionReachability { +private: + struct QuerySet { + void markReachable(Function *Fn) { + Reachable.insert(Fn); + Unreachable.erase(Fn); + } + + ChangeStatus update(Attributor &A, const AAFunctionReachability &AA, + ArrayRef<const AACallEdges *> AAEdgesList) { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + for (auto *AAEdges : AAEdgesList) { + if (AAEdges->hasUnknownCallee()) { + if (!CanReachUnknownCallee) + Change = ChangeStatus::CHANGED; + CanReachUnknownCallee = true; + return Change; + } + } - virtual bool hasUnknownCallee() const override { return HasUnknownCallee; } + for (Function *Fn : make_early_inc_range(Unreachable)) { + if (checkIfReachable(A, AA, AAEdgesList, Fn)) { + Change = ChangeStatus::CHANGED; + markReachable(Fn); + } + } + return Change; + } - virtual bool hasNonAsmUnknownCallee() const override { - return HasUnknownCalleeNonAsm; - } + bool isReachable(Attributor &A, const AAFunctionReachability &AA, + ArrayRef<const AACallEdges *> AAEdgesList, Function *Fn) { + // Assume that we can reach the function. + // TODO: Be more specific with the unknown callee. + if (CanReachUnknownCallee) + return true; - const std::string getAsStr() const override { - return "CallEdges[" + std::to_string(HasUnknownCallee) + "," + - std::to_string(CalledFunctions.size()) + "]"; - } + if (Reachable.count(Fn)) + return true; - void trackStatistics() const override {} + if (Unreachable.count(Fn)) + return false; - /// Optimistic set of functions that might be called by this function. - SetVector<Function *> CalledFunctions; + // We need to assume that this function can't reach Fn to prevent + // an infinite loop if this function is recursive. + Unreachable.insert(Fn); - /// Is there any call with a unknown callee. 
- bool HasUnknownCallee = false; + bool Result = checkIfReachable(A, AA, AAEdgesList, Fn); + if (Result) + markReachable(Fn); + return Result; + } - /// Is there any call with a unknown callee, excluding any inline asm. - bool HasUnknownCalleeNonAsm = false; -}; + bool checkIfReachable(Attributor &A, const AAFunctionReachability &AA, + ArrayRef<const AACallEdges *> AAEdgesList, + Function *Fn) const { -struct AAFunctionReachabilityFunction : public AAFunctionReachability { - AAFunctionReachabilityFunction(const IRPosition &IRP, Attributor &A) - : AAFunctionReachability(IRP, A) {} + // Handle the most trivial case first. + for (auto *AAEdges : AAEdgesList) { + const SetVector<Function *> &Edges = AAEdges->getOptimisticEdges(); - bool canReach(Attributor &A, Function *Fn) const override { - // Assume that we can reach any function if we can reach a call with - // unknown callee. - if (CanReachUnknownCallee) - return true; + if (Edges.count(Fn)) + return true; + } - if (ReachableQueries.count(Fn)) - return true; + SmallVector<const AAFunctionReachability *, 8> Deps; + for (auto &AAEdges : AAEdgesList) { + const SetVector<Function *> &Edges = AAEdges->getOptimisticEdges(); + + for (Function *Edge : Edges) { + // We don't need a dependency if the result is reachable. + const AAFunctionReachability &EdgeReachability = + A.getAAFor<AAFunctionReachability>( + AA, IRPosition::function(*Edge), DepClassTy::NONE); + Deps.push_back(&EdgeReachability); + + if (EdgeReachability.canReach(A, Fn)) + return true; + } + } + + // The result is false for now, set dependencies and leave. + for (auto Dep : Deps) + A.recordDependence(AA, *Dep, DepClassTy::REQUIRED); - if (UnreachableQueries.count(Fn)) return false; + } + + /// Set of functions that we know for sure is reachable. + DenseSet<Function *> Reachable; + + /// Set of functions that are unreachable, but might become reachable. + DenseSet<Function *> Unreachable; + + /// If we can reach a function with a call to a unknown function we assume + /// that we can reach any function. + bool CanReachUnknownCallee = false; + }; +public: + AAFunctionReachabilityFunction(const IRPosition &IRP, Attributor &A) + : AAFunctionReachability(IRP, A) {} + + bool canReach(Attributor &A, Function *Fn) const override { const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::REQUIRED); - const SetVector<Function *> &Edges = AAEdges.getOptimisticEdges(); - bool Result = checkIfReachable(A, Edges, Fn); + // Attributor returns attributes as const, so this function has to be + // const for users of this attribute to use it without having to do + // a const_cast. + // This is a hack for us to be able to cache queries. + auto *NonConstThis = const_cast<AAFunctionReachabilityFunction *>(this); + bool Result = + NonConstThis->WholeFunction.isReachable(A, *this, {&AAEdges}, Fn); + + return Result; + } + + /// Can \p CB reach \p Fn + bool canReach(Attributor &A, CallBase &CB, Function *Fn) const override { + const AACallEdges &AAEdges = A.getAAFor<AACallEdges>( + *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED); // Attributor returns attributes as const, so this function has to be // const for users of this attribute to use it without having to do // a const_cast. // This is a hack for us to be able to cache queries. 
auto *NonConstThis = const_cast<AAFunctionReachabilityFunction *>(this); + QuerySet &CBQuery = NonConstThis->CBQueries[&CB]; - if (Result) - NonConstThis->ReachableQueries.insert(Fn); - else - NonConstThis->UnreachableQueries.insert(Fn); + bool Result = CBQuery.isReachable(A, *this, {&AAEdges}, Fn); return Result; } /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { - if (CanReachUnknownCallee) - return ChangeStatus::UNCHANGED; - const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::REQUIRED); - const SetVector<Function *> &Edges = AAEdges.getOptimisticEdges(); ChangeStatus Change = ChangeStatus::UNCHANGED; - if (AAEdges.hasUnknownCallee()) { - bool OldCanReachUnknown = CanReachUnknownCallee; - CanReachUnknownCallee = true; - return OldCanReachUnknown ? ChangeStatus::UNCHANGED - : ChangeStatus::CHANGED; - } + Change |= WholeFunction.update(A, *this, {&AAEdges}); - // Check if any of the unreachable functions become reachable. - for (auto Current = UnreachableQueries.begin(); - Current != UnreachableQueries.end();) { - if (!checkIfReachable(A, Edges, *Current)) { - Current++; - continue; - } - ReachableQueries.insert(*Current); - UnreachableQueries.erase(*Current++); - Change = ChangeStatus::CHANGED; + for (auto CBPair : CBQueries) { + const AACallEdges &AAEdges = A.getAAFor<AACallEdges>( + *this, IRPosition::callsite_function(*CBPair.first), + DepClassTy::REQUIRED); + + Change |= CBPair.second.update(A, *this, {&AAEdges}); } return Change; } const std::string getAsStr() const override { - size_t QueryCount = ReachableQueries.size() + UnreachableQueries.size(); + size_t QueryCount = + WholeFunction.Reachable.size() + WholeFunction.Unreachable.size(); - return "FunctionReachability [" + std::to_string(ReachableQueries.size()) + - "," + std::to_string(QueryCount) + "]"; + return "FunctionReachability [" + + std::to_string(WholeFunction.Reachable.size()) + "," + + std::to_string(QueryCount) + "]"; } void trackStatistics() const override {} private: - bool canReachUnknownCallee() const override { return CanReachUnknownCallee; } + bool canReachUnknownCallee() const override { + return WholeFunction.CanReachUnknownCallee; + } - bool checkIfReachable(Attributor &A, const SetVector<Function *> &Edges, - Function *Fn) const { - if (Edges.count(Fn)) - return true; + /// Used to answer if a the whole function can reacha a specific function. + QuerySet WholeFunction; - for (Function *Edge : Edges) { - // We don't need a dependency if the result is reachable. - const AAFunctionReachability &EdgeReachability = - A.getAAFor<AAFunctionReachability>(*this, IRPosition::function(*Edge), - DepClassTy::NONE); + /// Used to answer if a call base inside this function can reach a specific + /// function. + DenseMap<CallBase *, QuerySet> CBQueries; +}; - if (EdgeReachability.canReach(A, Fn)) - return true; - } - for (Function *Fn : Edges) - A.getAAFor<AAFunctionReachability>(*this, IRPosition::function(*Fn), - DepClassTy::REQUIRED); +/// ---------------------- Assumption Propagation ------------------------------ +struct AAAssumptionInfoImpl : public AAAssumptionInfo { + AAAssumptionInfoImpl(const IRPosition &IRP, Attributor &A, + const DenseSet<StringRef> &Known) + : AAAssumptionInfo(IRP, A, Known) {} - return false; + bool hasAssumption(const StringRef Assumption) const override { + return isValidState() && setContains(Assumption); } - /// Set of functions that we know for sure is reachable. 
- SmallPtrSet<Function *, 8> ReachableQueries; + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + const SetContents &Known = getKnown(); + const SetContents &Assumed = getAssumed(); + + const std::string KnownStr = + llvm::join(Known.getSet().begin(), Known.getSet().end(), ","); + const std::string AssumedStr = + (Assumed.isUniversal()) + ? "Universal" + : llvm::join(Assumed.getSet().begin(), Assumed.getSet().end(), ","); + + return "Known [" + KnownStr + "]," + " Assumed [" + AssumedStr + "]"; + } +}; + +/// Propagates assumption information from parent functions to all of their +/// successors. An assumption can be propagated if the containing function +/// dominates the called function. +/// +/// We start with a "known" set of assumptions already valid for the associated +/// function and an "assumed" set that initially contains all possible +/// assumptions. The assumed set is inter-procedurally updated by narrowing its +/// contents as concrete values are known. The concrete values are seeded by the +/// first nodes that are either entries into the call graph, or contains no +/// assumptions. Each node is updated as the intersection of the assumed state +/// with all of its predecessors. +struct AAAssumptionInfoFunction final : AAAssumptionInfoImpl { + AAAssumptionInfoFunction(const IRPosition &IRP, Attributor &A) + : AAAssumptionInfoImpl(IRP, A, + getAssumptions(*IRP.getAssociatedFunction())) {} + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + const auto &Assumptions = getKnown(); + + // Don't manifest a universal set if it somehow made it here. + if (Assumptions.isUniversal()) + return ChangeStatus::UNCHANGED; + + Function *AssociatedFunction = getAssociatedFunction(); + + bool Changed = addAssumptions(*AssociatedFunction, Assumptions.getSet()); + + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool Changed = false; + + auto CallSitePred = [&](AbstractCallSite ACS) { + const auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>( + *this, IRPosition::callsite_function(*ACS.getInstruction()), + DepClassTy::REQUIRED); + // Get the set of assumptions shared by all of this function's callers. + Changed |= getIntersection(AssumptionAA.getAssumed()); + return !getAssumed().empty() || !getKnown().empty(); + }; + + bool AllCallSitesKnown; + // Get the intersection of all assumptions held by this node's predecessors. + // If we don't know all the call sites then this is either an entry into the + // call graph or an empty node. This node is known to only contain its own + // assumptions and can be propagated to its successors. + if (!A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) + return indicatePessimisticFixpoint(); - /// Set of functions that are unreachable, but might become reachable. - SmallPtrSet<Function *, 8> UnreachableQueries; + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + void trackStatistics() const override {} +}; + +/// Assumption Info defined for call sites. +struct AAAssumptionInfoCallSite final : AAAssumptionInfoImpl { + + AAAssumptionInfoCallSite(const IRPosition &IRP, Attributor &A) + : AAAssumptionInfoImpl(IRP, A, getInitialAssumptions(IRP)) {} + + /// See AbstractAttribute::initialize(...). 
+ void initialize(Attributor &A) override { + const IRPosition &FnPos = IRPosition::function(*getAnchorScope()); + A.getAAFor<AAAssumptionInfo>(*this, FnPos, DepClassTy::REQUIRED); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // Don't manifest a universal set if it somehow made it here. + if (getKnown().isUniversal()) + return ChangeStatus::UNCHANGED; - /// If we can reach a function with a call to a unknown function we assume - /// that we can reach any function. - bool CanReachUnknownCallee = false; + CallBase &AssociatedCall = cast<CallBase>(getAssociatedValue()); + bool Changed = addAssumptions(AssociatedCall, getAssumed().getSet()); + + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + const IRPosition &FnPos = IRPosition::function(*getAnchorScope()); + auto &AssumptionAA = + A.getAAFor<AAAssumptionInfo>(*this, FnPos, DepClassTy::REQUIRED); + bool Changed = getIntersection(AssumptionAA.getAssumed()); + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + +private: + /// Helper to initialized the known set as all the assumptions this call and + /// the callee contain. + DenseSet<StringRef> getInitialAssumptions(const IRPosition &IRP) { + const CallBase &CB = cast<CallBase>(IRP.getAssociatedValue()); + auto Assumptions = getAssumptions(CB); + if (Function *F = IRP.getAssociatedFunction()) + set_union(Assumptions, getAssumptions(*F)); + if (Function *F = IRP.getAssociatedFunction()) + set_union(Assumptions, getAssumptions(*F)); + return Assumptions; + } }; } // namespace @@ -9603,6 +9812,7 @@ const char AANoUndef::ID = 0; const char AACallEdges::ID = 0; const char AAFunctionReachability::ID = 0; const char AAPointerInfo::ID = 0; +const char AAAssumptionInfo::ID = 0; // Macro magic to create the static generator function for attributes that // follow the naming scheme. @@ -9704,6 +9914,8 @@ CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AACallEdges) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAssumptionInfo) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias) @@ -9723,7 +9935,6 @@ CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior) -CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AACallEdges) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAFunctionReachability) CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 8e81f4bad4af..178d3f41963e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -153,33 +153,30 @@ static bool mergeConstants(Module &M) { // were just merged. 
while (true) { // Find the canonical constants others will be merged with. - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ) { - GlobalVariable *GV = &*GVI++; - + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { // If this GV is dead, remove it. - GV->removeDeadConstantUsers(); - if (GV->use_empty() && GV->hasLocalLinkage()) { - GV->eraseFromParent(); + GV.removeDeadConstantUsers(); + if (GV.use_empty() && GV.hasLocalLinkage()) { + GV.eraseFromParent(); ++ChangesMade; continue; } - if (isUnmergeableGlobal(GV, UsedGlobals)) + if (isUnmergeableGlobal(&GV, UsedGlobals)) continue; // This transformation is legal for weak ODR globals in the sense it // doesn't change semantics, but we really don't want to perform it // anyway; it's likely to pessimize code generation, and some tools // (like the Darwin linker in cases involving CFString) don't expect it. - if (GV->isWeakForLinker()) + if (GV.isWeakForLinker()) continue; // Don't touch globals with metadata other then !dbg. - if (hasMetadataOtherThanDebugLoc(GV)) + if (hasMetadataOtherThanDebugLoc(&GV)) continue; - Constant *Init = GV->getInitializer(); + Constant *Init = GV.getInitializer(); // Check to see if the initializer is already known. GlobalVariable *&Slot = CMap[Init]; @@ -188,9 +185,9 @@ static bool mergeConstants(Module &M) { // replace with the current one. If the current is externally visible // it cannot be replace, but can be the canonical constant we merge with. bool FirstConstantFound = !Slot; - if (FirstConstantFound || IsBetterCanonical(*GV, *Slot)) { - Slot = GV; - LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV->getName() + if (FirstConstantFound || IsBetterCanonical(GV, *Slot)) { + Slot = &GV; + LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV.getName() << (FirstConstantFound ? "\n" : " (updated)\n")); } } @@ -199,18 +196,15 @@ static bool mergeConstants(Module &M) { // SameContentReplacements vector. We cannot do the replacement in this pass // because doing so may cause initializers of other globals to be rewritten, // invalidating the Constant* pointers in CMap. - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ) { - GlobalVariable *GV = &*GVI++; - - if (isUnmergeableGlobal(GV, UsedGlobals)) + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { + if (isUnmergeableGlobal(&GV, UsedGlobals)) continue; // We can only replace constant with local linkage. - if (!GV->hasLocalLinkage()) + if (!GV.hasLocalLinkage()) continue; - Constant *Init = GV->getInitializer(); + Constant *Init = GV.getInitializer(); // Check to see if the initializer is already known. auto Found = CMap.find(Init); @@ -218,16 +212,16 @@ static bool mergeConstants(Module &M) { continue; GlobalVariable *Slot = Found->second; - if (Slot == GV) + if (Slot == &GV) continue; - if (makeMergeable(GV, Slot) == CanMerge::No) + if (makeMergeable(&GV, Slot) == CanMerge::No) continue; // Make all uses of the duplicate constant use the canonical version. 
- LLVM_DEBUG(dbgs() << "Will replace: @" << GV->getName() << " -> @" + LLVM_DEBUG(dbgs() << "Will replace: @" << GV.getName() << " -> @" << Slot->getName() << "\n"); - SameContentReplacements.push_back(std::make_pair(GV, Slot)); + SameContentReplacements.push_back(std::make_pair(&GV, Slot)); } // Now that we have figured out which replacements must be made, do them all diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d95fd55870f8..fb9ab7954e36 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -175,8 +175,8 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { // to pass in a smaller number of arguments into the new function. // std::vector<Value *> Args; - for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) { - CallBase *CB = dyn_cast<CallBase>(*I++); + for (User *U : llvm::make_early_inc_range(Fn.users())) { + CallBase *CB = dyn_cast<CallBase>(U); if (!CB) continue; @@ -188,9 +188,9 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { if (!PAL.isEmpty()) { SmallVector<AttributeSet, 8> ArgAttrs; for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo) - ArgAttrs.push_back(PAL.getParamAttributes(ArgNo)); - PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(), - PAL.getRetAttributes(), ArgAttrs); + ArgAttrs.push_back(PAL.getParamAttrs(ArgNo)); + PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttrs(), + PAL.getRetAttrs(), ArgAttrs); } SmallVector<OperandBundleDef, 1> OpBundles; @@ -762,8 +762,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (LiveValues.erase(Arg)) { Params.push_back(I->getType()); ArgAlive[ArgI] = true; - ArgAttrVec.push_back(PAL.getParamAttributes(ArgI)); - HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned); + ArgAttrVec.push_back(PAL.getParamAttrs(ArgI)); + HasLiveReturnedArg |= PAL.hasParamAttr(ArgI, Attribute::Returned); } else { ++NumArgumentsEliminated; LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " @@ -838,7 +838,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); // The existing function return attributes. - AttrBuilder RAttrs(PAL.getRetAttributes()); + AttrBuilder RAttrs(PAL.getRetAttrs()); // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes @@ -853,8 +853,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Strip allocsize attributes. They might refer to the deleted arguments. - AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute( - F->getContext(), Attribute::AllocSize); + AttributeSet FnAttrs = + PAL.getFnAttrs().removeAttribute(F->getContext(), Attribute::AllocSize); // Reconstruct the AttributesList based on the vector we constructed. assert(ArgAttrVec.size() == Params.size()); @@ -889,7 +889,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Adjust the call return attributes in case the function was changed to // return void. 
- AttrBuilder RAttrs(CallPAL.getRetAttributes()); + AttrBuilder RAttrs(CallPAL.getRetAttrs()); RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); @@ -903,7 +903,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[Pi]) { Args.push_back(*I); // Get original parameter attributes, but skip return attributes. - AttributeSet Attrs = CallPAL.getParamAttributes(Pi); + AttributeSet Attrs = CallPAL.getParamAttrs(Pi); if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) { // If the return type has changed, then get rid of 'returned' on the // call site. The alternative is to make all 'returned' attributes on @@ -922,7 +922,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Push any varargs arguments on the list. Don't forget their attributes. for (auto E = CB.arg_end(); I != E; ++I, ++Pi) { Args.push_back(*I); - ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(Pi)); } // Reconstruct the AttributesList based on the vector we constructed. @@ -930,7 +930,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Again, be sure to remove any allocsize attributes, since their indices // may now be incorrect. - AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute( + AttributeSet FnAttrs = CallPAL.getFnAttrs().removeAttribute( F->getContext(), Attribute::AllocSize); AttributeList NewCallPAL = AttributeList::get( @@ -1094,11 +1094,9 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M, // fused with the next loop, because deleting a function invalidates // information computed while surveying other functions. LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n"); - for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - Function &F = *I++; + for (Function &F : llvm::make_early_inc_range(M)) if (F.getFunctionType()->isVarArg()) Changed |= DeleteDeadVarargs(F); - } // Second phase:loop through the module, determining which arguments are live. // We assume all arguments are dead unless proven otherwise (allowing us to @@ -1109,13 +1107,10 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M, SurveyFunction(F); // Now, remove all dead arguments and return values from each function in - // turn. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - // Increment now, because the function will probably get removed (ie. - // replaced by a new one). - Function *F = &*I++; - Changed |= RemoveDeadStuffFromFunction(F); - } + // turn. We use make_early_inc_range here because functions will probably get + // removed (i.e. replaced by new ones). + for (Function &F : llvm::make_early_inc_range(M)) + Changed |= RemoveDeadStuffFromFunction(&F); // Finally, look for any unused parameters in functions with non-local // linkage and replace the passed in parameters with undef. diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp index ba0efd46af16..387f114f6ffa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -121,32 +121,27 @@ namespace { } // Visit the Aliases. 
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E;) { - Module::alias_iterator CurI = I; - ++I; - - bool Delete = deleteStuff == (bool)Named.count(&*CurI); - makeVisible(*CurI, Delete); + for (GlobalAlias &GA : llvm::make_early_inc_range(M.aliases())) { + bool Delete = deleteStuff == (bool)Named.count(&GA); + makeVisible(GA, Delete); if (Delete) { - Type *Ty = CurI->getValueType(); + Type *Ty = GA.getValueType(); - CurI->removeFromParent(); + GA.removeFromParent(); llvm::Value *Declaration; if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) { - Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage, - CurI->getAddressSpace(), - CurI->getName(), &M); + Declaration = + Function::Create(FTy, GlobalValue::ExternalLinkage, + GA.getAddressSpace(), GA.getName(), &M); } else { Declaration = - new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage, - nullptr, CurI->getName()); - + new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage, + nullptr, GA.getName()); } - CurI->replaceAllUsesWith(Declaration); - delete &*CurI; + GA.replaceAllUsesWith(Declaration); + delete &GA; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp index 47fdf042f9d4..16d00a0c89e1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -50,14 +50,14 @@ static void forceAttributes(Function &F) { return Kind; }; - for (auto &S : ForceAttributes) { + for (const auto &S : ForceAttributes) { auto Kind = ParseFunctionAndAttr(S); if (Kind == Attribute::None || F.hasFnAttribute(Kind)) continue; F.addFnAttr(Kind); } - for (auto &S : ForceRemoveAttributes) { + for (const auto &S : ForceRemoveAttributes) { auto Kind = ParseFunctionAndAttr(S); if (Kind == Attribute::None || !F.hasFnAttribute(Kind)) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index ca8660a98ded..cde78713b554 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -14,10 +14,12 @@ #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" @@ -82,6 +84,11 @@ STATISTIC(NumNoFree, "Number of functions marked as nofree"); STATISTIC(NumWillReturn, "Number of functions marked as willreturn"); STATISTIC(NumNoSync, "Number of functions marked as nosync"); +STATISTIC(NumThinLinkNoRecurse, + "Number of functions marked as norecurse during thinlink"); +STATISTIC(NumThinLinkNoUnwind, + "Number of functions marked as nounwind during thinlink"); + static cl::opt<bool> EnableNonnullArgPropagation( "enable-nonnull-arg-prop", cl::init(true), cl::Hidden, cl::desc("Try to propagate nonnull argument attributes from callsites to " @@ -95,6 +102,10 @@ static cl::opt<bool> DisableNoFreeInference( "disable-nofree-inference", cl::Hidden, cl::desc("Stop inferring nofree attribute during function-attrs pass")); +static cl::opt<bool> DisableThinLTOPropagation( + "disable-thinlto-funcattrs", cl::init(true), cl::Hidden, + 
cl::desc("Don't propagate function-attrs in thinLTO")); + namespace { using SCCNodeSet = SmallSetVector<Function *, 8>; @@ -131,12 +142,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, // Scan the function body for instructions that may read or write memory. bool ReadsMemory = false; bool WritesMemory = false; - for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { - Instruction *I = &*II; - + for (Instruction &I : instructions(F)) { // Some instructions can be ignored even if they read or write memory. // Detect these now, skipping to the next instruction if one is found. - if (auto *Call = dyn_cast<CallBase>(I)) { + if (auto *Call = dyn_cast<CallBase>(&I)) { // Ignore calls to functions in the same SCC, as long as the call sites // don't have operand bundles. Calls with operand bundles are allowed to // have memory effects not described by the memory effects of the call @@ -170,14 +179,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, // Check whether all pointer arguments point to local memory, and // ignore calls that only access local memory. - for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) { - Value *Arg = *CI; + for (const Use &U : Call->args()) { + const Value *Arg = U; if (!Arg->getType()->isPtrOrPtrVectorTy()) continue; - AAMDNodes AAInfo; - I->getAAMetadata(AAInfo); - MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(Arg, AAInfo); + MemoryLocation Loc = + MemoryLocation::getBeforeOrAfter(Arg, I.getAAMetadata()); // Skip accesses to local or constant memory as they don't impact the // externally visible mod/ref behavior. @@ -192,21 +200,21 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, ReadsMemory = true; } continue; - } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + } else if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { // Ignore non-volatile loads from local memory. (Atomic is okay here.) if (!LI->isVolatile()) { MemoryLocation Loc = MemoryLocation::get(LI); if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) continue; } - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { // Ignore non-volatile stores to local memory. (Atomic is okay here.) if (!SI->isVolatile()) { MemoryLocation Loc = MemoryLocation::get(SI); if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) continue; } - } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { + } else if (VAArgInst *VI = dyn_cast<VAArgInst>(&I)) { // Ignore vaargs on local memory. MemoryLocation Loc = MemoryLocation::get(VI); if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) @@ -217,10 +225,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, // read or write memory. // // Writes memory, remember that. - WritesMemory |= I->mayWriteToMemory(); + WritesMemory |= I.mayWriteToMemory(); // If this instruction may read memory, remember that. - ReadsMemory |= I->mayReadFromMemory(); + ReadsMemory |= I.mayReadFromMemory(); } if (WritesMemory) { @@ -240,7 +248,8 @@ MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F, /// Deduce readonly/readnone attributes for the SCC. template <typename AARGetterT> -static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { +static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter, + SmallSet<Function *, 8> &Changed) { // Check if any of the functions in the SCC read or write memory. 
If they // write memory then they can't be marked readnone or readonly. bool ReadsMemory = false; @@ -255,7 +264,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(), AAR, SCCNodes)) { case MAK_MayWrite: - return false; + return; case MAK_ReadOnly: ReadsMemory = true; break; @@ -271,11 +280,10 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { // If the SCC contains both functions that read and functions that write, then // we cannot add readonly attributes. if (ReadsMemory && WritesMemory) - return false; + return; // Success! Functions in this SCC do not access memory, or only read memory. // Give them the appropriate attribute. - bool MadeChange = false; for (Function *F : SCCNodes) { if (F->doesNotAccessMemory()) @@ -289,7 +297,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { if (F->doesNotReadMemory() && WritesMemory) continue; - MadeChange = true; + Changed.insert(F); // Clear out any existing attributes. AttrBuilder AttrsToRemove; @@ -303,7 +311,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { AttrsToRemove.addAttribute(Attribute::InaccessibleMemOnly); AttrsToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); } - F->removeAttributes(AttributeList::FunctionIndex, AttrsToRemove); + F->removeFnAttrs(AttrsToRemove); // Add in the new attribute. if (WritesMemory && !ReadsMemory) @@ -318,8 +326,195 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { else ++NumReadNone; } +} + +// Compute definitive function attributes for a function taking into account +// prevailing definitions and linkage types +static FunctionSummary *calculatePrevailingSummary( + ValueInfo VI, + DenseMap<ValueInfo, FunctionSummary *> &CachedPrevailingSummary, + function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> + IsPrevailing) { + + if (CachedPrevailingSummary.count(VI)) + return CachedPrevailingSummary[VI]; + + /// At this point, prevailing symbols have been resolved. The following leads + /// to returning a conservative result: + /// - Multiple instances with local linkage. Normally local linkage would be + /// unique per module + /// as the GUID includes the module path. We could have a guid alias if + /// there wasn't any distinguishing path when each file was compiled, but + /// that should be rare so we'll punt on those. + + /// These next 2 cases should not happen and will assert: + /// - Multiple instances with external linkage. This should be caught in + /// symbol resolution + /// - Non-existent FunctionSummary for Aliasee. This presents a hole in our + /// knowledge meaning we have to go conservative. + + /// Otherwise, we calculate attributes for a function as: + /// 1. If we have a local linkage, take its attributes. If there's somehow + /// multiple, bail and go conservative. + /// 2. If we have an external/WeakODR/LinkOnceODR linkage check that it is + /// prevailing, take its attributes. + /// 3. If we have a Weak/LinkOnce linkage the copies can have semantic + /// differences. However, if the prevailing copy is known it will be used + /// so take its attributes. If the prevailing copy is in a native file + /// all IR copies will be dead and propagation will go conservative. + /// 4. AvailableExternally summaries without a prevailing copy are known to + /// occur in a couple of circumstances: + /// a. 
An internal function gets imported due to its caller getting + /// imported, it becomes AvailableExternally but no prevailing + /// definition exists. Because it has to get imported along with its + /// caller the attributes will be captured by propagating on its + /// caller. + /// b. C++11 [temp.explicit]p10 can generate AvailableExternally + /// definitions of explicitly instanced template declarations + /// for inlining which are ultimately dropped from the TU. Since this + /// is localized to the TU the attributes will have already made it to + /// the callers. + /// These are edge cases and already captured by their callers so we + /// ignore these for now. If they become relevant to optimize in the + /// future this can be revisited. + /// 5. Otherwise, go conservative. + + CachedPrevailingSummary[VI] = nullptr; + FunctionSummary *Local = nullptr; + FunctionSummary *Prevailing = nullptr; + + for (const auto &GVS : VI.getSummaryList()) { + if (!GVS->isLive()) + continue; + + FunctionSummary *FS = dyn_cast<FunctionSummary>(GVS->getBaseObject()); + // Virtual and Unknown (e.g. indirect) calls require going conservative + if (!FS || FS->fflags().HasUnknownCall) + return nullptr; + + const auto &Linkage = GVS->linkage(); + if (GlobalValue::isLocalLinkage(Linkage)) { + if (Local) { + LLVM_DEBUG( + dbgs() + << "ThinLTO FunctionAttrs: Multiple Local Linkage, bailing on " + "function " + << VI.name() << " from " << FS->modulePath() << ". Previous module " + << Local->modulePath() << "\n"); + return nullptr; + } + Local = FS; + } else if (GlobalValue::isExternalLinkage(Linkage)) { + assert(IsPrevailing(VI.getGUID(), GVS.get())); + Prevailing = FS; + break; + } else if (GlobalValue::isWeakODRLinkage(Linkage) || + GlobalValue::isLinkOnceODRLinkage(Linkage) || + GlobalValue::isWeakAnyLinkage(Linkage) || + GlobalValue::isLinkOnceAnyLinkage(Linkage)) { + if (IsPrevailing(VI.getGUID(), GVS.get())) { + Prevailing = FS; + break; + } + } else if (GlobalValue::isAvailableExternallyLinkage(Linkage)) { + // TODO: Handle these cases if they become meaningful + continue; + } + } + + if (Local) { + assert(!Prevailing); + CachedPrevailingSummary[VI] = Local; + } else if (Prevailing) { + assert(!Local); + CachedPrevailingSummary[VI] = Prevailing; + } - return MadeChange; + return CachedPrevailingSummary[VI]; +} + +bool llvm::thinLTOPropagateFunctionAttrs( + ModuleSummaryIndex &Index, + function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> + IsPrevailing) { + // TODO: implement addNoAliasAttrs once + // there's more information about the return type in the summary + if (DisableThinLTOPropagation) + return false; + + DenseMap<ValueInfo, FunctionSummary *> CachedPrevailingSummary; + bool Changed = false; + + auto PropagateAttributes = [&](std::vector<ValueInfo> &SCCNodes) { + // Assume we can propagate unless we discover otherwise + FunctionSummary::FFlags InferredFlags; + InferredFlags.NoRecurse = (SCCNodes.size() == 1); + InferredFlags.NoUnwind = true; + + for (auto &V : SCCNodes) { + FunctionSummary *CallerSummary = + calculatePrevailingSummary(V, CachedPrevailingSummary, IsPrevailing); + + // Function summaries can fail to contain information such as declarations + if (!CallerSummary) + return; + + if (CallerSummary->fflags().MayThrow) + InferredFlags.NoUnwind = false; + + for (const auto &Callee : CallerSummary->calls()) { + FunctionSummary *CalleeSummary = calculatePrevailingSummary( + Callee.first, CachedPrevailingSummary, IsPrevailing); + + if (!CalleeSummary) + return; + + if 
(!CalleeSummary->fflags().NoRecurse) + InferredFlags.NoRecurse = false; + + if (!CalleeSummary->fflags().NoUnwind) + InferredFlags.NoUnwind = false; + + if (!InferredFlags.NoUnwind && !InferredFlags.NoRecurse) + break; + } + } + + if (InferredFlags.NoUnwind || InferredFlags.NoRecurse) { + Changed = true; + for (auto &V : SCCNodes) { + if (InferredFlags.NoRecurse) { + LLVM_DEBUG(dbgs() << "ThinLTO FunctionAttrs: Propagated NoRecurse to " + << V.name() << "\n"); + ++NumThinLinkNoRecurse; + } + + if (InferredFlags.NoUnwind) { + LLVM_DEBUG(dbgs() << "ThinLTO FunctionAttrs: Propagated NoUnwind to " + << V.name() << "\n"); + ++NumThinLinkNoUnwind; + } + + for (auto &S : V.getSummaryList()) { + if (auto *FS = dyn_cast<FunctionSummary>(S.get())) { + if (InferredFlags.NoRecurse) + FS->setNoRecurse(); + + if (InferredFlags.NoUnwind) + FS->setNoUnwind(); + } + } + } + } + }; + + // Call propagation functions on each SCC in the Index + for (scc_iterator<ModuleSummaryIndex *> I = scc_begin(&Index); !I.isAtEnd(); + ++I) { + std::vector<ValueInfo> Nodes(*I); + PropagateAttributes(Nodes); + } + return Changed; } namespace { @@ -395,7 +590,7 @@ struct ArgumentUsesTracker : public CaptureTracker { assert(UseIndex < CB->data_operands_size() && "Indirect function calls should have been filtered above!"); - if (UseIndex >= CB->getNumArgOperands()) { + if (UseIndex >= CB->arg_size()) { // Data operand, but not a argument operand -- must be a bundle operand assert(CB->hasOperandBundles() && "Must be!"); @@ -530,7 +725,7 @@ determinePointerReadAttrs(Argument *A, assert(UseIndex < CB.data_operands_size() && "Data operand use expected!"); - bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands(); + bool IsOperandBundleUse = UseIndex >= CB.arg_size(); if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { assert(F->isVarArg() && "More params than args in non-varargs call"); @@ -581,9 +776,8 @@ determinePointerReadAttrs(Argument *A, } /// Deduce returned attributes for the SCC. -static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Check each function in turn, determining if an argument is always returned. for (Function *F : SCCNodes) { // We can infer and propagate function attributes only when we know that the @@ -623,11 +817,9 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { auto *A = cast<Argument>(RetArg); A->addAttr(Attribute::Returned); ++NumReturned; - Changed = true; + Changed.insert(F); } } - - return Changed; } /// If a callsite has arguments that are also arguments to the parent function, @@ -693,9 +885,8 @@ static bool addReadAttr(Argument *A, Attribute::AttrKind R) { } /// Deduce nocapture attributes for the SCC. -static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addArgumentAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { ArgumentGraph AG; // Check each function in turn, determining which pointer arguments are not @@ -707,7 +898,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (!F->hasExactDefinition()) continue; - Changed |= addArgumentAttrsFromCallsites(*F); + if (addArgumentAttrsFromCallsites(*F)) + Changed.insert(F); // Functions that are readonly (or readnone) and nounwind and don't return // a value can't capture arguments. Don't analyze them. 
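// [Editor's illustration, plain C++ rather than patch content] "Capturing" an
// argument means a copy of the pointer may outlive the call. The shortcut in
// the hunk that follows relies on that: a readonly, nounwind function that
// returns no value has no way to publish a pointer, so every pointer argument
// can be marked nocapture without further analysis. All names below are
// hypothetical.
int *SavedPtr;                              // hypothetical global
void stash(int *P) { SavedPtr = P; }        // captures P: the pointer escapes
int  readValue(const int *P) { return *P; } // uses the pointee only; P does
                                            // not escape the call
// [End of editor's illustration.]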
@@ -718,7 +910,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(F); } } continue; @@ -737,7 +929,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { // If it's trivially not captured, mark it nocapture now. A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(F); } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save @@ -761,7 +953,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { Self.insert(&*A); Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); if (R != Attribute::None) - Changed = addReadAttr(A, R); + if (addReadAttr(A, R)) + Changed.insert(F); } } } @@ -785,7 +978,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { Argument *A = ArgumentSCC[0]->Definition; A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(A->getParent()); } continue; } @@ -827,7 +1020,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { Argument *A = ArgumentSCC[i]->Definition; A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(A->getParent()); } // We also want to compute readonly/readnone. With a small number of false @@ -858,12 +1051,11 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (ReadAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - Changed = addReadAttr(A, ReadAttr); + if (addReadAttr(A, ReadAttr)) + Changed.insert(A->getParent()); } } } - - return Changed; } /// Tests whether a function is "malloc-like". @@ -934,7 +1126,8 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { } /// Deduce noalias attributes for the SCC. -static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { +static void addNoAliasAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Check each function in turn, determining which functions return noalias // pointers. for (Function *F : SCCNodes) { @@ -946,7 +1139,7 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { // definition we'll get at link time is *exactly* the definition we see now. // For more details, see GlobalValue::mayBeDerefined. if (!F->hasExactDefinition()) - return false; + return; // We annotate noalias return values, which are only applicable to // pointer types. @@ -954,10 +1147,9 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { continue; if (!isFunctionMallocLike(F, SCCNodes)) - return false; + return; } - bool MadeChange = false; for (Function *F : SCCNodes) { if (F->returnDoesNotAlias() || !F->getReturnType()->isPointerTy()) @@ -965,10 +1157,8 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { F->setReturnDoesNotAlias(); ++NumNoAlias; - MadeChange = true; + Changed.insert(F); } - - return MadeChange; } /// Tests whether this function is known to not return null. @@ -1044,26 +1234,24 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, } /// Deduce nonnull attributes for the SCC. -static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { +static void addNonNullAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Speculative that all functions in the SCC return only nonnull // pointers. We may refute this as we analyze functions. 
bool SCCReturnsNonNull = true; - bool MadeChange = false; - // Check each function in turn, determining which functions return nonnull // pointers. for (Function *F : SCCNodes) { // Already nonnull. - if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull)) + if (F->getAttributes().hasRetAttr(Attribute::NonNull)) continue; // We can infer and propagate function attributes only when we know that the // definition we'll get at link time is *exactly* the definition we see now. // For more details, see GlobalValue::mayBeDerefined. if (!F->hasExactDefinition()) - return false; + return; // We annotate nonnull return values, which are only applicable to // pointer types. @@ -1077,9 +1265,9 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { // which prevents us from speculating about the entire SCC LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + F->addRetAttr(Attribute::NonNull); ++NumNonNullReturn; - MadeChange = true; + Changed.insert(F); } continue; } @@ -1090,19 +1278,16 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { if (SCCReturnsNonNull) { for (Function *F : SCCNodes) { - if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull) || + if (F->getAttributes().hasRetAttr(Attribute::NonNull) || !F->getReturnType()->isPointerTy()) continue; LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + F->addRetAttr(Attribute::NonNull); ++NumNonNullReturn; - MadeChange = true; + Changed.insert(F); } } - - return MadeChange; } namespace { @@ -1155,12 +1340,13 @@ public: InferenceDescriptors.push_back(AttrInference); } - bool run(const SCCNodeSet &SCCNodes); + void run(const SCCNodeSet &SCCNodes, SmallSet<Function *, 8> &Changed); }; /// Perform all the requested attribute inference actions according to the /// attribute predicates stored before. -bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { +void AttributeInferer::run(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors; // Go through all the functions in SCC and check corresponding attribute // assumptions for each of them. Attributes that are invalid for this SCC @@ -1169,7 +1355,7 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { // No attributes whose assumptions are still valid - done. if (InferInSCC.empty()) - return false; + return; // Check if our attributes ever need scanning/can be scanned. llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) { @@ -1212,9 +1398,8 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { } if (InferInSCC.empty()) - return false; + return; - bool Changed = false; for (Function *F : SCCNodes) // At this point InferInSCC contains only functions that were either: // - explicitly skipped from scan/inference, or @@ -1223,10 +1408,9 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { for (auto &ID : InferInSCC) { if (ID.SkipFunction(*F)) continue; - Changed = true; + Changed.insert(F); ID.SetAttribute(*F); } - return Changed; } struct SCCNodesResult { @@ -1243,7 +1427,7 @@ static bool InstrBreaksNonConvergent(Instruction &I, // Breaks non-convergent assumption if CS is a convergent call to a function // not in the SCC. 
return CB && CB->isConvergent() && - SCCNodes.count(CB->getCalledFunction()) == 0; + !SCCNodes.contains(CB->getCalledFunction()); } /// Helper for NoUnwind inference predicate InstrBreaksAttribute. @@ -1282,7 +1466,8 @@ static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) { /// Attempt to remove convergent function attribute when possible. /// /// Returns true if any changes to function attributes were made. -static bool inferConvergent(const SCCNodeSet &SCCNodes) { +static void inferConvergent(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { AttributeInferer AI; // Request to remove the convergent attribute from all functions in the SCC @@ -1305,7 +1490,7 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) { }, /* RequiresExactDefinition= */ false}); // Perform all the requested attribute inference actions. - return AI.run(SCCNodes); + AI.run(SCCNodes, Changed); } /// Infer attributes from all functions in the SCC by scanning every @@ -1314,7 +1499,8 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) { /// - addition of NoUnwind attribute /// /// Returns true if any changes to function attributes were made. -static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) { +static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { AttributeInferer AI; if (!DisableNoUnwindInference) @@ -1363,19 +1549,20 @@ static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) { /* RequiresExactDefinition= */ true}); // Perform all the requested attribute inference actions. - return AI.run(SCCNodes); + AI.run(SCCNodes, Changed); } -static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) { +static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Try and identify functions that do not recurse. // If the SCC contains multiple nodes we know for sure there is recursion. if (SCCNodes.size() != 1) - return false; + return; Function *F = *SCCNodes.begin(); if (!F || !F->hasExactDefinition() || F->doesNotRecurse()) - return false; + return; // If all of the calls in F are identifiable and are to norecurse functions, F // is norecurse. This check also detects self-recursion as F is not currently @@ -1386,7 +1573,7 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) { Function *Callee = CB->getCalledFunction(); if (!Callee || Callee == F || !Callee->doesNotRecurse()) // Function calls a potentially recursive function. - return false; + return; } // Every call was to a non-recursive function other than this function, and @@ -1394,7 +1581,7 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) { // recurse. F->setDoesNotRecurse(); ++NumNoRecurse; - return true; + Changed.insert(F); } static bool instructionDoesNotReturn(Instruction &I) { @@ -1412,9 +1599,8 @@ static bool basicBlockCanReturn(BasicBlock &BB) { } // Set the noreturn function attribute if possible. -static bool addNoReturnAttrs(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->doesNotReturn()) @@ -1424,11 +1610,9 @@ static bool addNoReturnAttrs(const SCCNodeSet &SCCNodes) { // FIXME: this doesn't handle recursion or unreachable blocks. 
if (none_of(*F, basicBlockCanReturn)) { F->setDoesNotReturn(); - Changed = true; + Changed.insert(F); } } - - return Changed; } static bool functionWillReturn(const Function &F) { @@ -1461,19 +1645,16 @@ static bool functionWillReturn(const Function &F) { } // Set the willreturn function attribute if possible. -static bool addWillReturn(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addWillReturn(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || F->willReturn() || !functionWillReturn(*F)) continue; F->setWillReturn(); NumWillReturn++; - Changed = true; + Changed.insert(F); } - - return Changed; } // Return true if this is an atomic which has an ordering stronger than @@ -1532,7 +1713,8 @@ static bool InstrBreaksNoSync(Instruction &I, const SCCNodeSet &SCCNodes) { } // Infer the nosync attribute. -static bool addNoSyncAttr(const SCCNodeSet &SCCNodes) { +static void addNoSyncAttr(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { AttributeInferer AI; AI.registerAttrInference(AttributeInferer::InferenceDescriptor{ Attribute::NoSync, @@ -1549,14 +1731,15 @@ static bool addNoSyncAttr(const SCCNodeSet &SCCNodes) { ++NumNoSync; }, /* RequiresExactDefinition= */ true}); - return AI.run(SCCNodes); + AI.run(SCCNodes, Changed); } static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { SCCNodesResult Res; Res.HasUnknownCall = false; for (Function *F : Functions) { - if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked)) { + if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) || + F->isPresplitCoroutine()) { // Treat any function we're trying not to optimize as if it were an // indirect call and omit it from the node set used below. Res.HasUnknownCall = true; @@ -1582,32 +1765,33 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { } template <typename AARGetterT> -static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions, - AARGetterT &&AARGetter) { +static SmallSet<Function *, 8> +deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter) { SCCNodesResult Nodes = createSCCNodeSet(Functions); - bool Changed = false; // Bail if the SCC only contains optnone functions. if (Nodes.SCCNodes.empty()) - return Changed; + return {}; + + SmallSet<Function *, 8> Changed; - Changed |= addArgumentReturnedAttrs(Nodes.SCCNodes); - Changed |= addReadAttrs(Nodes.SCCNodes, AARGetter); - Changed |= addArgumentAttrs(Nodes.SCCNodes); - Changed |= inferConvergent(Nodes.SCCNodes); - Changed |= addNoReturnAttrs(Nodes.SCCNodes); - Changed |= addWillReturn(Nodes.SCCNodes); + addArgumentReturnedAttrs(Nodes.SCCNodes, Changed); + addReadAttrs(Nodes.SCCNodes, AARGetter, Changed); + addArgumentAttrs(Nodes.SCCNodes, Changed); + inferConvergent(Nodes.SCCNodes, Changed); + addNoReturnAttrs(Nodes.SCCNodes, Changed); + addWillReturn(Nodes.SCCNodes, Changed); // If we have no external nodes participating in the SCC, we can deduce some // more precise attributes as well. 
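// [Editor's illustration, not from the patch] The inferences collected above
// act on simple shapes like the following plain C++ with hypothetical names.
// A call-free leaf that only reads through its argument is the typical
// candidate for readonly, nounwind, willreturn, nosync and, being alone in
// its SCC with no calls, norecurse; its caller can then inherit norecurse
// because its only callee is norecurse and is not the caller itself.
int leaf(const int *P) { return *P; }
int caller(const int *P) { return leaf(P); }
// [End of editor's illustration.]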
if (!Nodes.HasUnknownCall) { - Changed |= addNoAliasAttrs(Nodes.SCCNodes); - Changed |= addNonNullAttrs(Nodes.SCCNodes); - Changed |= inferAttrsFromFunctionBodies(Nodes.SCCNodes); - Changed |= addNoRecurseAttrs(Nodes.SCCNodes); + addNoAliasAttrs(Nodes.SCCNodes, Changed); + addNonNullAttrs(Nodes.SCCNodes, Changed); + inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); + addNoRecurseAttrs(Nodes.SCCNodes, Changed); } - Changed |= addNoSyncAttr(Nodes.SCCNodes); + addNoSyncAttr(Nodes.SCCNodes, Changed); // Finally, infer the maximal set of attributes from the ones we've inferred // above. This is handling the cases where one attribute on a signature @@ -1615,7 +1799,8 @@ static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions, // the later is missing (or simply less sophisticated). for (Function *F : Nodes.SCCNodes) if (F) - Changed |= inferAttributesFromOthers(*F); + if (inferAttributesFromOthers(*F)) + Changed.insert(F); return Changed; } @@ -1638,14 +1823,35 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C, Functions.push_back(&N.getFunction()); } - if (deriveAttrsInPostOrder(Functions, AARGetter)) { - // We have not changed the call graph or removed/added functions. - PreservedAnalyses PA; - PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); - return PA; + auto ChangedFunctions = deriveAttrsInPostOrder(Functions, AARGetter); + if (ChangedFunctions.empty()) + return PreservedAnalyses::all(); + + // Invalidate analyses for modified functions so that we don't have to + // invalidate all analyses for all functions in this SCC. + PreservedAnalyses FuncPA; + // We haven't changed the CFG for modified functions. + FuncPA.preserveSet<CFGAnalyses>(); + for (Function *Changed : ChangedFunctions) { + FAM.invalidate(*Changed, FuncPA); + // Also invalidate any direct callers of changed functions since analyses + // may care about attributes of direct callees. For example, MemorySSA cares + // about whether or not a call's callee modifies memory and queries that + // through function attributes. + for (auto *U : Changed->users()) { + if (auto *Call = dyn_cast<CallBase>(U)) { + if (Call->getCalledFunction() == Changed) + FAM.invalidate(*Call->getFunction(), FuncPA); + } + } } - return PreservedAnalyses::all(); + PreservedAnalyses PA; + // We have not added or removed functions. + PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); + // We already invalidated all relevant function analyses above. 
+ PA.preserveSet<AllAnalysesOn<Function>>(); + return PA; } namespace { @@ -1690,7 +1896,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) { Functions.push_back(I->getFunction()); } - return deriveAttrsInPostOrder(Functions, AARGetter); + return !deriveAttrsInPostOrder(Functions, AARGetter).empty(); } bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp index 2f6cf0ca7087..d9b43109f629 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" @@ -187,23 +188,6 @@ selectCallee(const ModuleSummaryIndex &Index, return false; } - // For SamplePGO, in computeImportForFunction the OriginalId - // may have been used to locate the callee summary list (See - // comment there). - // The mapping from OriginalId to GUID may return a GUID - // that corresponds to a static variable. Filter it out here. - // This can happen when - // 1) There is a call to a library function which is not defined - // in the index. - // 2) There is a static variable with the OriginalGUID identical - // to the GUID of the library function in 1); - // When this happens, the logic for SamplePGO kicks in and - // the static variable in 2) will be found, which needs to be - // filtered out. - if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) { - Reason = FunctionImporter::ImportFailureReason::GlobalVar; - return false; - } if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) { Reason = FunctionImporter::ImportFailureReason::InterposableLinkage; // There is no point in importing these, we can't inline them @@ -264,21 +248,6 @@ using EdgeInfo = } // anonymous namespace -static ValueInfo -updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) { - if (!VI.getSummaryList().empty()) - return VI; - // For SamplePGO, the indirect call targets for local functions will - // have its original name annotated in profile. We try to find the - // corresponding PGOFuncName as the GUID. - // FIXME: Consider updating the edges in the graph after building - // it, rather than needing to perform this mapping on each walk. - auto GUID = Index.getGUIDFromOriginalID(VI.getGUID()); - if (GUID == 0) - return ValueInfo(); - return Index.getValueInfo(GUID); -} - static bool shouldImportGlobal(const ValueInfo &VI, const GVSummaryMapTy &DefinedGVSummaries) { const auto &GVS = DefinedGVSummaries.find(VI.getGUID()); @@ -400,10 +369,6 @@ static void computeImportForFunction( continue; } - VI = updateValueInfoForIndirectCalls(Index, VI); - if (!VI) - continue; - if (DefinedGVSummaries.count(VI.getGUID())) { // FIXME: Consider not skipping import if the module contains // a non-prevailing def with interposable linkage. 
The prevailing copy @@ -496,7 +461,7 @@ static void computeImportForFunction( VI.name().str() + " due to " + getFailureName(Reason); auto Error = make_error<StringError>( - Msg, std::make_error_code(std::errc::operation_not_supported)); + Msg, make_error_code(errc::not_supported)); logAllUnhandledErrors(std::move(Error), errs(), "Error importing module: "); break; @@ -839,16 +804,61 @@ void llvm::ComputeCrossModuleImportForModuleFromIndex( #endif } -void llvm::computeDeadSymbols( +// For SamplePGO, the indirect call targets for local functions will +// have its original name annotated in profile. We try to find the +// corresponding PGOFuncName as the GUID, and fix up the edges +// accordingly. +void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index, + FunctionSummary *FS) { + for (auto &EI : FS->mutableCalls()) { + if (!EI.first.getSummaryList().empty()) + continue; + auto GUID = Index.getGUIDFromOriginalID(EI.first.getGUID()); + if (GUID == 0) + continue; + // Update the edge to point directly to the correct GUID. + auto VI = Index.getValueInfo(GUID); + if (llvm::any_of( + VI.getSummaryList(), + [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) { + // The mapping from OriginalId to GUID may return a GUID + // that corresponds to a static variable. Filter it out here. + // This can happen when + // 1) There is a call to a library function which is not defined + // in the index. + // 2) There is a static variable with the OriginalGUID identical + // to the GUID of the library function in 1); + // When this happens the static variable in 2) will be found, + // which needs to be filtered out. + return SummaryPtr->getSummaryKind() == + GlobalValueSummary::GlobalVarKind; + })) + continue; + EI.first = VI; + } +} + +void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) { + for (const auto &Entry : Index) { + for (auto &S : Entry.second.SummaryList) { + if (auto *FS = dyn_cast<FunctionSummary>(S.get())) + updateValueInfoForIndirectCalls(Index, FS); + } + } +} + +void llvm::computeDeadSymbolsAndUpdateIndirectCalls( ModuleSummaryIndex &Index, const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols, function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) { assert(!Index.withGlobalValueDeadStripping()); - if (!ComputeDead) - return; - if (GUIDPreservedSymbols.empty()) - // Don't do anything when nothing is live, this is friendly with tests. + if (!ComputeDead || + // Don't do anything when nothing is live, this is friendly with tests. + GUIDPreservedSymbols.empty()) { + // Still need to update indirect calls. + updateIndirectCalls(Index); return; + } unsigned LiveSymbols = 0; SmallVector<ValueInfo, 128> Worklist; Worklist.reserve(GUIDPreservedSymbols.size() * 2); @@ -863,13 +873,16 @@ void llvm::computeDeadSymbols( // Add values flagged in the index as live roots to the worklist. for (const auto &Entry : Index) { auto VI = Index.getValueInfo(Entry); - for (auto &S : Entry.second.SummaryList) + for (auto &S : Entry.second.SummaryList) { + if (auto *FS = dyn_cast<FunctionSummary>(S.get())) + updateValueInfoForIndirectCalls(Index, FS); if (S->isLive()) { LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n"); Worklist.push_back(VI); ++LiveSymbols; break; } + } } // Make value live and add it to the worklist if it was not live before. @@ -882,9 +895,6 @@ void llvm::computeDeadSymbols( // binary, which increases the binary size unnecessarily. Note that // if this code changes, the importer needs to change so that edges // to functions marked dead are skipped. 
- VI = updateValueInfoForIndirectCalls(Index, VI); - if (!VI) - return; if (llvm::any_of(VI.getSummaryList(), [](const std::unique_ptr<llvm::GlobalValueSummary> &S) { @@ -958,7 +968,8 @@ void llvm::computeDeadSymbolsWithConstProp( const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols, function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing, bool ImportEnabled) { - computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing); + computeDeadSymbolsAndUpdateIndirectCalls(Index, GUIDPreservedSymbols, + isPrevailing); if (ImportEnabled) Index.propagateAttributes(GUIDPreservedSymbols); } @@ -1040,13 +1051,33 @@ bool llvm::convertToDeclaration(GlobalValue &GV) { return true; } -void llvm::thinLTOResolvePrevailingInModule( - Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { - auto updateLinkage = [&](GlobalValue &GV) { +void llvm::thinLTOFinalizeInModule(Module &TheModule, + const GVSummaryMapTy &DefinedGlobals, + bool PropagateAttrs) { + auto FinalizeInModule = [&](GlobalValue &GV, bool Propagate = false) { // See if the global summary analysis computed a new resolved linkage. const auto &GS = DefinedGlobals.find(GV.getGUID()); if (GS == DefinedGlobals.end()) return; + + if (Propagate) + if (FunctionSummary *FS = dyn_cast<FunctionSummary>(GS->second)) { + if (Function *F = dyn_cast<Function>(&GV)) { + // TODO: propagate ReadNone and ReadOnly. + if (FS->fflags().ReadNone && !F->doesNotAccessMemory()) + F->setDoesNotAccessMemory(); + + if (FS->fflags().ReadOnly && !F->onlyReadsMemory()) + F->setOnlyReadsMemory(); + + if (FS->fflags().NoRecurse && !F->doesNotRecurse()) + F->setDoesNotRecurse(); + + if (FS->fflags().NoUnwind && !F->doesNotThrow()) + F->setDoesNotThrow(); + } + } + auto NewLinkage = GS->second->linkage(); if (GlobalValue::isLocalLinkage(GV.getLinkage()) || // Don't internalize anything here, because the code below @@ -1105,11 +1136,11 @@ void llvm::thinLTOResolvePrevailingInModule( // Process functions and global now for (auto &GV : TheModule) - updateLinkage(GV); + FinalizeInModule(GV, PropagateAttrs); for (auto &GV : TheModule.globals()) - updateLinkage(GV); + FinalizeInModule(GV); for (auto &GV : TheModule.aliases()) - updateLinkage(GV); + FinalizeInModule(GV); } /// Run internalization on \p TheModule based on symmary analysis. @@ -1153,7 +1184,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, /// Make alias a clone of its aliasee. static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) { - Function *Fn = cast<Function>(GA->getBaseObject()); + Function *Fn = cast<Function>(GA->getAliaseeObject()); ValueToValueMapTy VMap; Function *NewFn = CloneFunction(Fn, VMap); @@ -1259,12 +1290,12 @@ Expected<bool> FunctionImporter::importFunctions( if (Error Err = GA.materialize()) return std::move(Err); // Import alias as a copy of its aliasee. - GlobalObject *Base = GA.getBaseObject(); - if (Error Err = Base->materialize()) + GlobalObject *GO = GA.getAliaseeObject(); + if (Error Err = GO->materialize()) return std::move(Err); auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA); - LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID() - << " " << Base->getName() << " from " + LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " " + << GO->getName() << " from " << SrcModule->getSourceFileName() << "\n"); if (EnableImportMetadata) { // Add 'thinlto_src_module' metadata for statistics and debugging. 
@@ -1303,7 +1334,7 @@ Expected<bool> FunctionImporter::importFunctions( std::move(SrcModule), GlobalsToImport.getArrayRef(), [](GlobalValue &, IRMover::ValueAdder) {}, /*IsPerformingImport=*/true)) - report_fatal_error("Function Import: link error: " + + report_fatal_error(Twine("Function Import: link error: ") + toString(std::move(Err))); ImportedCount += GlobalsToImport.size(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index f61f4312b777..fbd083bb9bbf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -11,7 +11,6 @@ // are propagated to the callee by specializing the function. // // Current limitations: -// - It does not handle specialization of recursive functions, // - It does not yet handle integer ranges. // - Only 1 argument per function is specialised, // - The cost-model could be further looked into, @@ -22,6 +21,18 @@ // a direct way to steer function specialization, avoiding the cost-model, // and thus control compile-times / code-size. // +// Todos: +// - Specializing recursive functions relies on running the transformation a +// number of times, which is controlled by option +// `func-specialization-max-iters`. Thus, increasing this value and the +// number of iterations, will linearly increase the number of times recursive +// functions get specialized, see also the discussion in +// https://reviews.llvm.org/D106426 for details. Perhaps there is a +// compile-time friendlier way to control/limit the number of specialisations +// for recursive functions. +// - Don't transform the function if there is no function specialization +// happens. +// //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" @@ -59,20 +70,166 @@ static cl::opt<unsigned> MaxConstantsThreshold( "specialization"), cl::init(3)); +static cl::opt<unsigned> SmallFunctionThreshold( + "func-specialization-size-threshold", cl::Hidden, + cl::desc("Don't specialize functions that have less than this theshold " + "number of instructions"), + cl::init(100)); + static cl::opt<unsigned> AvgLoopIterationCount("func-specialization-avg-iters-cost", cl::Hidden, cl::desc("Average loop iteration count cost"), cl::init(10)); +static cl::opt<bool> SpecializeOnAddresses( + "func-specialization-on-address", cl::init(false), cl::Hidden, + cl::desc("Enable function specialization on the address of global values")); + +// TODO: This needs checking to see the impact on compile-times, which is why +// this is off by default for now. static cl::opt<bool> EnableSpecializationForLiteralConstant( "function-specialization-for-literal-constant", cl::init(false), cl::Hidden, - cl::desc("Make function specialization available for literal constant.")); + cl::desc("Enable specialization of functions that take a literal constant " + "as an argument.")); + +// Helper to check if \p LV is either a constant or a constant +// range with a single element. This should cover exactly the same cases as the +// old ValueLatticeElement::isConstant() and is intended to be used in the +// transition to ValueLatticeElement. +static bool isConstant(const ValueLatticeElement &LV) { + return LV.isConstant() || + (LV.isConstantRange() && LV.getConstantRange().isSingleElement()); +} // Helper to check if \p LV is either overdefined or a constant int. 
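// [Editor's sketch, not part of the patch] The isConstant() helper above also
// accepts a constant range that holds exactly one value. With LLVM's
// ConstantRange, a half-open range whose bounds differ by one is such a
// singleton; this small demo assumes current LLVM headers:
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"

static bool singletonRangeDemo() {
  llvm::ConstantRange R(llvm::APInt(32, 7), llvm::APInt(32, 8)); // [7, 8)
  return R.isSingleElement() &&                  // true: only the value 7
         R.getSingleElement()->getZExtValue() == 7;
}
// [End of editor's sketch.]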
static bool isOverdefined(const ValueLatticeElement &LV) { - return !LV.isUnknownOrUndef() && !LV.isConstant(); + return !LV.isUnknownOrUndef() && !isConstant(LV); +} + +static Constant *getPromotableAlloca(AllocaInst *Alloca, CallInst *Call) { + Value *StoreValue = nullptr; + for (auto *User : Alloca->users()) { + // We can't use llvm::isAllocaPromotable() as that would fail because of + // the usage in the CallInst, which is what we check here. + if (User == Call) + continue; + if (auto *Bitcast = dyn_cast<BitCastInst>(User)) { + if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call) + return nullptr; + continue; + } + + if (auto *Store = dyn_cast<StoreInst>(User)) { + // This is a duplicate store, bail out. + if (StoreValue || Store->isVolatile()) + return nullptr; + StoreValue = Store->getValueOperand(); + continue; + } + // Bail if there is any other unknown usage. + return nullptr; + } + return dyn_cast_or_null<Constant>(StoreValue); } +// A constant stack value is an AllocaInst that has a single constant +// value stored to it. Return this constant if such an alloca stack value +// is a function argument. +static Constant *getConstantStackValue(CallInst *Call, Value *Val, + SCCPSolver &Solver) { + if (!Val) + return nullptr; + Val = Val->stripPointerCasts(); + if (auto *ConstVal = dyn_cast<ConstantInt>(Val)) + return ConstVal; + auto *Alloca = dyn_cast<AllocaInst>(Val); + if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy()) + return nullptr; + return getPromotableAlloca(Alloca, Call); +} + +// To support specializing recursive functions, it is important to propagate +// constant arguments because after a first iteration of specialisation, a +// reduced example may look like this: +// +// define internal void @RecursiveFn(i32* arg1) { +// %temp = alloca i32, align 4 +// store i32 2 i32* %temp, align 4 +// call void @RecursiveFn.1(i32* nonnull %temp) +// ret void +// } +// +// Before a next iteration, we need to propagate the constant like so +// which allows further specialization in next iterations. +// +// @funcspec.arg = internal constant i32 2 +// +// define internal void @someFunc(i32* arg1) { +// call void @otherFunc(i32* nonnull @funcspec.arg) +// ret void +// } +// +static void constantArgPropagation(SmallVectorImpl<Function *> &WorkList, + Module &M, SCCPSolver &Solver) { + // Iterate over the argument tracked functions see if there + // are any new constant values for the call instruction via + // stack variables. + for (auto *F : WorkList) { + // TODO: Generalize for any read only arguments. + if (F->arg_size() != 1) + continue; + + auto &Arg = *F->arg_begin(); + if (!Arg.onlyReadsMemory() || !Arg.getType()->isPointerTy()) + continue; + + for (auto *User : F->users()) { + auto *Call = dyn_cast<CallInst>(User); + if (!Call) + break; + auto *ArgOp = Call->getArgOperand(0); + auto *ArgOpType = ArgOp->getType(); + auto *ConstVal = getConstantStackValue(Call, ArgOp, Solver); + if (!ConstVal) + break; + + Value *GV = new GlobalVariable(M, ConstVal->getType(), true, + GlobalValue::InternalLinkage, ConstVal, + "funcspec.arg"); + + if (ArgOpType != ConstVal->getType()) + GV = ConstantExpr::getBitCast(cast<Constant>(GV), ArgOp->getType()); + + Call->setArgOperand(0, GV); + + // Add the changed CallInst to Solver Worklist + Solver.visitCall(*Call); + } + } +} + +// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics +// interfere with the constantArgPropagation optimization. 
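// [Editor's illustration, hypothetical source rather than patch content] The
// pattern getConstantStackValue() above recognizes corresponds to source like
// the following: a local with a single constant store whose address is the
// call argument, so the pointee value reaching the callee is the known
// constant 2 (the same shape as the IR example in the comment further up).
void callee(int *P);
void demoCaller() {
  int Tmp = 2;   // the alloca receives exactly one constant store
  callee(&Tmp);  // its address is passed, so the tracked constant is 2
}
// [End of editor's illustration.]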
+static void removeSSACopy(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &Inst : llvm::make_early_inc_range(BB)) { + auto *II = dyn_cast<IntrinsicInst>(&Inst); + if (!II) + continue; + if (II->getIntrinsicID() != Intrinsic::ssa_copy) + continue; + Inst.replaceAllUsesWith(II->getOperand(0)); + Inst.eraseFromParent(); + } + } +} + +static void removeSSACopy(Module &M) { + for (Function &F : M) + removeSSACopy(F); +} + +namespace { class FunctionSpecializer { /// The IPSCCP Solver. @@ -115,9 +272,14 @@ public: for (auto *SpecializedFunc : CurrentSpecializations) { SpecializedFuncs.insert(SpecializedFunc); - // TODO: If we want to support specializing specialized functions, - // initialize here the state of the newly created functions, marking - // them argument-tracked and executable. + // Initialize the state of the newly created functions, marking them + // argument-tracked and executable. + if (SpecializedFunc->hasExactDefinition() && + !SpecializedFunc->hasFnAttribute(Attribute::Naked)) + Solver.addTrackedFunction(SpecializedFunc); + Solver.addArgumentTrackedFunction(SpecializedFunc); + FuncDecls.push_back(SpecializedFunc); + Solver.markBlockExecutable(&SpecializedFunc->front()); // Replace the function arguments for the specialized functions. for (Argument &Arg : SpecializedFunc->args()) @@ -138,12 +300,22 @@ public: const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); if (isOverdefined(IV)) return false; - auto *Const = IV.isConstant() ? Solver.getConstant(IV) - : UndefValue::get(V->getType()); + auto *Const = + isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType()); V->replaceAllUsesWith(Const); - // TODO: Update the solver here if we want to specialize specialized - // functions. + for (auto *U : Const->users()) + if (auto *I = dyn_cast<Instruction>(U)) + if (Solver.isBlockExecutable(I->getParent())) + Solver.visit(I); + + // Remove the instruction from Block and Solver. + if (auto *I = dyn_cast<Instruction>(V)) { + if (I->isSafeToRemove()) { + I->eraseFromParent(); + Solver.removeLatticeValueFor(I); + } + } return true; } @@ -152,6 +324,15 @@ private: // also in the cost model. unsigned NbFunctionsSpecialized = 0; + /// Clone the function \p F and remove the ssa_copy intrinsics added by + /// the SCCPSolver in the cloned version. + Function *cloneCandidateFunction(Function *F) { + ValueToValueMapTy EmptyMap; + Function *Clone = CloneFunction(F, EmptyMap); + removeSSACopy(*Clone); + return Clone; + } + /// This function decides whether to specialize function \p F based on the /// known constant values its arguments can take on. Specialization is /// performed on the first interesting argument. Specializations based on @@ -162,9 +343,8 @@ private: SmallVectorImpl<Function *> &Specializations) { // Do not specialize the cloned function again. - if (SpecializedFuncs.contains(F)) { + if (SpecializedFuncs.contains(F)) return false; - } // If we're optimizing the function for size, we shouldn't specialize it. if (F->hasOptSize() || @@ -176,8 +356,25 @@ private: if (!Solver.isBlockExecutable(&F->getEntryBlock())) return false; + // It wastes time to specialize a function which would get inlined finally. + if (F->hasFnAttribute(Attribute::AlwaysInline)) + return false; + LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName() << "\n"); + + // Determine if it would be profitable to create a specialization of the + // function where the argument takes on the given constant value. If so, + // add the constant to Constants. 
+ auto FnSpecCost = getSpecializationCost(F); + if (!FnSpecCost.isValid()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: "; + FnSpecCost.print(dbgs()); dbgs() << "\n"); + // Determine if we should specialize the function based on the values the // argument can take on. If specialization is not profitable, we continue // on to the next argument. @@ -195,7 +392,7 @@ private: // be set to false by isArgumentInteresting (that function only adds // values to the Constants list that are deemed profitable). SmallVector<Constant *, 4> Constants; - if (!isArgumentInteresting(&A, Constants, IsPartial)) { + if (!isArgumentInteresting(&A, Constants, FnSpecCost, IsPartial)) { LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); continue; } @@ -214,8 +411,7 @@ private: for (auto *C : Constants) { // Clone the function. We leave the ValueToValueMap empty to allow // IPSCCP to propagate the constant arguments. - ValueToValueMapTy EmptyMap; - Function *Clone = CloneFunction(F, EmptyMap); + Function *Clone = cloneCandidateFunction(F); Argument *ClonedArg = Clone->arg_begin() + A.getArgNo(); // Rewrite calls to the function so that they call the clone instead. @@ -231,9 +427,10 @@ private: NbFunctionsSpecialized++; } - // TODO: if we want to support specialize specialized functions, and if - // the function has been completely specialized, the original function is - // no longer needed, so we would need to mark it unreachable here. + // If the function has been completely specialized, the original function + // is no longer needed. Mark it unreachable. + if (!IsPartial) + Solver.markFunctionUnreachable(F); // FIXME: Only one argument per function. return true; @@ -253,7 +450,11 @@ private: // If the code metrics reveal that we shouldn't duplicate the function, we // shouldn't specialize it. Set the specialization cost to Invalid. - if (Metrics.notDuplicatable) { + // Or if the lines of codes implies that this function is easy to get + // inlined so that we shouldn't specialize it. + if (Metrics.notDuplicatable || + (!ForceFunctionSpecialization && + Metrics.NumInsts < SmallFunctionThreshold)) { InstructionCost C{}; C.setInvalid(); return C; @@ -379,9 +580,8 @@ private: /// argument. bool isArgumentInteresting(Argument *A, SmallVectorImpl<Constant *> &Constants, + const InstructionCost &FnSpecCost, bool &IsPartial) { - Function *F = A->getParent(); - // For now, don't attempt to specialize functions based on the values of // composite types. if (!A->getType()->isSingleValueType() || A->user_empty()) @@ -420,18 +620,6 @@ private: return false; } - // Determine if it would be profitable to create a specialization of the - // function where the argument takes on the given constant value. If so, - // add the constant to Constants. - auto FnSpecCost = getSpecializationCost(F); - if (!FnSpecCost.isValid()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: "; - FnSpecCost.print(dbgs()); dbgs() << "\n"); - for (auto *C : PossibleConstants) { LLVM_DEBUG(dbgs() << "FnSpecialization: Constant: " << *C << "\n"); if (ForceFunctionSpecialization) { @@ -475,6 +663,12 @@ private: if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) continue; auto &CS = *cast<CallBase>(U); + // If the call site has attribute minsize set, that callsite won't be + // specialized. 
+ if (CS.hasFnAttr(Attribute::MinSize)) { + AllConstant = false; + continue; + } // If the parent of the call site will never be executed, we don't need // to worry about the passed value. @@ -482,11 +676,25 @@ private: continue; auto *V = CS.getArgOperand(A->getArgNo()); + if (isa<PoisonValue>(V)) + return false; + + // For now, constant expressions are fine but only if they are function + // calls. + if (auto *CE = dyn_cast<ConstantExpr>(V)) + if (!isa<Function>(CE->getOperand(0))) + return false; + // TrackValueOfGlobalVariable only tracks scalar global variables. if (auto *GV = dyn_cast<GlobalVariable>(V)) { - if (!GV->getValueType()->isSingleValueType()) { + // Check if we want to specialize on the address of non-constant + // global values. + if (!GV->isConstant()) + if (!SpecializeOnAddresses) + return false; + + if (!GV->getValueType()->isSingleValueType()) return false; - } } if (isa<Constant>(V) && (Solver.getLatticeValueFor(V).isConstant() || @@ -506,6 +714,9 @@ private: /// This function modifies calls to function \p F whose argument at index \p /// ArgNo is equal to constant \p C. The calls are rewritten to call function /// \p Clone instead. + /// + /// Callsites that have been marked with the MinSize function attribute won't + /// be specialized and rewritten. void rewriteCallSites(Function *F, Function *Clone, Argument &Arg, Constant *C) { unsigned ArgNo = Arg.getArgNo(); @@ -527,24 +738,7 @@ private: } } }; - -/// Function to clean up the left over intrinsics from SCCP util. -static void cleanup(Module &M) { - for (Function &F : M) { - for (BasicBlock &BB : F) { - for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { - Instruction *Inst = &*BI++; - if (auto *II = dyn_cast<IntrinsicInst>(Inst)) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) { - Value *Op = II->getOperand(0); - Inst->replaceAllUsesWith(Op); - Inst->eraseFromParent(); - } - } - } - } - } -} +} // namespace bool llvm::runFunctionSpecialization( Module &M, const DataLayout &DL, @@ -597,12 +791,27 @@ bool llvm::runFunctionSpecialization( Solver.trackValueOfGlobalVariable(&G); } + auto &TrackedFuncs = Solver.getArgumentTrackedFunctions(); + SmallVector<Function *, 16> FuncDecls(TrackedFuncs.begin(), + TrackedFuncs.end()); + + // No tracked functions, so nothing to do: don't run the solver and remove + // the ssa_copy intrinsics that may have been introduced. + if (TrackedFuncs.empty()) { + removeSSACopy(M); + return false; + } + // Solve for constants. auto RunSCCPSolver = [&](auto &WorkList) { bool ResolvedUndefs = true; while (ResolvedUndefs) { + // Not running the solver unnecessary is checked in regression test + // nothing-to-do.ll, so if this debug message is changed, this regression + // test needs updating too. LLVM_DEBUG(dbgs() << "FnSpecialization: Running solver\n"); + Solver.solve(); LLVM_DEBUG(dbgs() << "FnSpecialization: Resolving undefs\n"); ResolvedUndefs = false; @@ -615,15 +824,14 @@ bool llvm::runFunctionSpecialization( for (BasicBlock &BB : *F) { if (!Solver.isBlockExecutable(&BB)) continue; + // FIXME: The solver may make changes to the function here, so set + // Changed, even if later function specialization does not trigger. 
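
The RunSCCPSolver helper defined just above alternates a solve step with an attempt to resolve leftover undefs, and stops only when an undef-resolution pass makes no further progress. A simplified, self-contained illustration of that fixed-point driver (toy solver type, not the LLVM SCCPSolver interface):

#include <cstdio>

// Toy stand-in for a dataflow solver: counts down "unresolved" facts.
struct ToySolver {
  int Unresolved = 3;
  void solve() { /* propagate known facts; no-op in this toy */ }
  // Returns true if it forced at least one unresolved fact to a default,
  // which may enable further propagation on the next iteration.
  bool resolvedUndefs() {
    if (Unresolved == 0)
      return false;
    --Unresolved;
    return true;
  }
};

// Same control flow as the RunSCCPSolver lambda: keep alternating the solve
// step and undef resolution until the latter makes no progress.
static void runToFixedPoint(ToySolver &S) {
  bool ResolvedUndefs = true;
  int Iterations = 0;
  while (ResolvedUndefs) {
    S.solve();
    ResolvedUndefs = S.resolvedUndefs();
    ++Iterations;
  }
  std::printf("reached fixed point after %d iterations\n", Iterations);
}

int main() {
  ToySolver S;
  runToFixedPoint(S); // prints: reached fixed point after 4 iterations
  return 0;
}
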
for (auto &I : make_early_inc_range(BB)) - FS.tryToReplaceWithConstant(&I); + Changed |= FS.tryToReplaceWithConstant(&I); } } }; - auto &TrackedFuncs = Solver.getArgumentTrackedFunctions(); - SmallVector<Function *, 16> FuncDecls(TrackedFuncs.begin(), - TrackedFuncs.end()); #ifndef NDEBUG LLVM_DEBUG(dbgs() << "FnSpecialization: Worklist fn decls:\n"); for (auto *F : FuncDecls) @@ -637,14 +845,18 @@ bool llvm::runFunctionSpecialization( unsigned I = 0; while (FuncSpecializationMaxIters != I++ && FS.specializeFunctions(FuncDecls, CurrentSpecializations)) { - // TODO: run the solver here for the specialized functions only if we want - // to specialize recursively. + + // Run the solver for the specialized functions. + RunSCCPSolver(CurrentSpecializations); + + // Replace some unresolved constant arguments. + constantArgPropagation(FuncDecls, M, Solver); CurrentSpecializations.clear(); Changed = true; } // Clean up the IR by removing ssa_copy intrinsics. - cleanup(M); + removeSSACopy(M); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp index fb4cb23b837e..5e5d2086adc2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -88,7 +88,7 @@ ModulePass *llvm::createGlobalDCEPass() { static bool isEmptyFunction(Function *F) { BasicBlock &Entry = F->getEntryBlock(); for (auto &I : Entry) { - if (isa<DbgInfoIntrinsic>(I)) + if (I.isDebugOrPseudoInst()) continue; if (auto *RI = dyn_cast<ReturnInst>(&I)) return !RI->getReturnValue(); @@ -210,7 +210,7 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, Constant *Ptr = getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset, - *Caller->getParent()); + *Caller->getParent(), VTable); if (!Ptr) { LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n"); VFESafeVTables.erase(VTable); @@ -416,6 +416,16 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // virtual function pointers with null, allowing us to remove the // function itself. ++NumVFuncs; + + // Detect vfuncs that are referenced as "relative pointers" which are used + // in Swift vtables, i.e. entries in the form of: + // + // i32 trunc (i64 sub (i64 ptrtoint @f, i64 ptrtoint ...)) to i32) + // + // In this case, replace the whole "sub" expression with constant 0 to + // avoid leaving a weird sub(0, symbol) expression behind. + replaceRelativePointerUsersWithZero(F); + F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType())); } EraseUnusedGlobalValue(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 8750eb9ecc4e..b2c2efed7db8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -208,9 +208,7 @@ CleanupPointerRootUsers(GlobalVariable *GV, SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead; // Constants can't be pointers to dynamically allocated memory. 
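
The GlobalDCE change above handles vtable entries stored as 32-bit relative pointers, the i32 trunc(sub(ptrtoint @f, ptrtoint anchor)) form used by Swift vtables, by zeroing them before the pointed-to function is erased. For readers unfamiliar with the encoding, here is a standalone sketch of how a relative pointer is formed and followed; it is plain C++ and does not touch the LLVM constant-expression APIs.

#include <cstdint>
#include <cstdio>

// A relative pointer stores a signed 32-bit offset from the slot that holds
// it to the thing it points at; following it means "address of slot + offset".

static int32_t encodeRelative(const void *Slot, const void *Target) {
  // Assumes the two symbols lie within +/-2GB of each other, which is also
  // what the i32 truncation in the IR relies on.
  return static_cast<int32_t>(reinterpret_cast<intptr_t>(Target) -
                              reinterpret_cast<intptr_t>(Slot));
}

static const void *decodeRelative(const int32_t *Slot) {
  return reinterpret_cast<const char *>(Slot) + *Slot;
}

static int TargetValue = 7;        // stand-in for a virtual function symbol
static int32_t Slot;               // one relative "vtable" entry

int main() {
  Slot = encodeRelative(&Slot, &TargetValue);
  const int *Resolved = static_cast<const int *>(decodeRelative(&Slot));
  std::printf("%d\n", *Resolved);  // prints 7
  // Erasing the target is only safe once the slot is rewritten (e.g. to 0),
  // which is what the replaceRelativePointerUsersWithZero call arranges.
  return 0;
}

Because the offset is measured from the slot itself, the entry stays valid wherever the containing object is placed, which is exactly why a dangling entry cannot simply be left behind when the callee is deleted.
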
- for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end(); - UI != E;) { - User *U = *UI++; + for (User *U : llvm::make_early_inc_range(GV->users())) { if (StoreInst *SI = dyn_cast<StoreInst>(U)) { Value *V = SI->getValueOperand(); if (isa<Constant>(V)) { @@ -703,8 +701,9 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, !ICmpInst::isSigned(cast<ICmpInst>(U)->getPredicate()) && isa<LoadInst>(U->getOperand(0)) && isa<ConstantPointerNull>(U->getOperand(1))) { - assert(isa<GlobalValue>( - cast<LoadInst>(U->getOperand(0))->getPointerOperand()) && + assert(isa<GlobalValue>(cast<LoadInst>(U->getOperand(0)) + ->getPointerOperand() + ->stripPointerCasts()) && "Should be GlobalVariable"); // This and only this kind of non-signed ICmpInst is to be replaced with // the comparing of the value of the created global init bool later in @@ -720,22 +719,55 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, /// Return true if all uses of any loads from GV will trap if the loaded value /// is null. Note that this also permits comparisons of the loaded value /// against null, as a special case. -static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { - for (const User *U : GV->users()) - if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { - SmallPtrSet<const PHINode*, 8> PHIs; - if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) +static bool allUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(GV); + while (!Worklist.empty()) { + const Value *P = Worklist.pop_back_val(); + for (auto *U : P->users()) { + if (auto *LI = dyn_cast<LoadInst>(U)) { + SmallPtrSet<const PHINode *, 8> PHIs; + if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) + return false; + } else if (auto *SI = dyn_cast<StoreInst>(U)) { + // Ignore stores to the global. + if (SI->getPointerOperand() != P) + return false; + } else if (auto *CE = dyn_cast<ConstantExpr>(U)) { + if (CE->stripPointerCasts() != GV) + return false; + // Check further the ConstantExpr. + Worklist.push_back(CE); + } else { + // We don't know or understand this user, bail out. return false; - } else if (isa<StoreInst>(U)) { - // Ignore stores to the global. - } else { - // We don't know or understand this user, bail out. - //cerr << "UNKNOWN USER OF GLOBAL!: " << *U; - return false; + } } + } + return true; } +/// Get all the loads/store uses for global variable \p GV. +static void allUsesOfLoadAndStores(GlobalVariable *GV, + SmallVector<Value *, 4> &Uses) { + SmallVector<Value *, 4> Worklist; + Worklist.push_back(GV); + while (!Worklist.empty()) { + auto *P = Worklist.pop_back_val(); + for (auto *U : P->users()) { + if (auto *CE = dyn_cast<ConstantExpr>(U)) { + Worklist.push_back(CE); + continue; + } + + assert((isa<LoadInst>(U) || isa<StoreInst>(U)) && + "Expect only load or store instructions"); + Uses.push_back(U); + } + } +} + static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { bool Changed = false; for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) { @@ -817,8 +849,7 @@ static bool OptimizeAwayTrappingUsesOfLoads( bool AllNonStoreUsesGone = true; // Replace all uses of loads with uses of uses of the stored value. 
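
Both new helpers above, allUsesOfLoadedValueWillTrapIfNull and allUsesOfLoadAndStores, replace a direct scan of GV->users() with an explicit worklist so that uses reached through ConstantExpr casts and GEPs are visited as if they used the global directly. A stripped-down version of that traversal over a toy use graph (plain C++, hypothetical node kinds, no LLVM types; the visited set is included here for generality):

#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

// Toy use graph: a node is either a "leaf" use (load/store-like) or a
// "transparent" wrapper (constant-expression cast/GEP-like) whose own users
// must be visited as if they used the original value directly.
struct UseNode {
  bool Transparent = false;
  std::string Name;
  std::vector<const UseNode *> Users;
};

// Collect every leaf use reachable from Root, looking through transparent
// wrappers. The visited set guards against revisiting shared nodes.
static std::vector<std::string> collectLeafUses(const UseNode &Root) {
  std::vector<std::string> Leaves;
  std::vector<const UseNode *> Worklist{&Root};
  std::unordered_set<const UseNode *> Visited;
  while (!Worklist.empty()) {
    const UseNode *N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue;
    for (const UseNode *U : N->Users) {
      if (U->Transparent)
        Worklist.push_back(U);   // step through the wrapper
      else
        Leaves.push_back(U->Name);
    }
  }
  return Leaves;
}

int main() {
  UseNode Load{false, "load", {}};
  UseNode Store{false, "store", {}};
  UseNode Cast{true, "bitcast", {&Store}};
  UseNode GV{false, "gv", {&Load, &Cast}};
  for (const std::string &S : collectLeafUses(GV))
    std::printf("%s\n", S.c_str());   // prints "load" then "store"
  return 0;
}
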
- for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){ - User *GlobalUser = *GUI++; + for (User *GlobalUser : llvm::make_early_inc_range(GV->users())) { if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) { Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV); // If we were able to delete all uses of the loads @@ -934,9 +965,8 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, } } - Constant *RepValue = NewGV; - if (NewGV->getType() != GV->getValueType()) - RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType()); + SmallPtrSet<Constant *, 1> RepValues; + RepValues.insert(NewGV); // If there is a comparison against null, we will insert a global bool to // keep track of whether the global was initialized yet or not. @@ -947,9 +977,11 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, GV->getName()+".init", GV->getThreadLocalMode()); bool InitBoolUsed = false; - // Loop over all uses of GV, processing them in turn. - while (!GV->use_empty()) { - if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) { + // Loop over all instruction uses of GV, processing them in turn. + SmallVector<Value *, 4> Guses; + allUsesOfLoadAndStores(GV, Guses); + for (auto *U : Guses) { + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // The global is initialized when the store to it occurs. If the stored // value is null value, the global bool is set to false, otherwise true. new StoreInst(ConstantInt::getBool( @@ -961,12 +993,14 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, continue; } - LoadInst *LI = cast<LoadInst>(GV->user_back()); + LoadInst *LI = cast<LoadInst>(U); while (!LI->use_empty()) { Use &LoadUse = *LI->use_begin(); ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser()); if (!ICI) { - LoadUse = RepValue; + auto *CE = ConstantExpr::getBitCast(NewGV, LI->getType()); + RepValues.insert(CE); + LoadUse.set(CE); continue; } @@ -1012,40 +1046,53 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // To further other optimizations, loop over all users of NewGV and try to // constant prop them. This will promote GEP instructions with constant // indices into GEP constant-exprs, which will allow global-opt to hack on it. - ConstantPropUsersOf(NewGV, DL, TLI); - if (RepValue != NewGV) - ConstantPropUsersOf(RepValue, DL, TLI); + for (auto *CE : RepValues) + ConstantPropUsersOf(CE, DL, TLI); return NewGV; } -/// Scan the use-list of V checking to make sure that there are no complex uses -/// of V. We permit simple things like dereferencing the pointer, but not +/// Scan the use-list of GV checking to make sure that there are no complex uses +/// of GV. We permit simple things like dereferencing the pointer, but not /// storing through the address, unless it is to the specified global. static bool -valueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, +valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI, const GlobalVariable *GV) { - for (const User *U : V->users()) { - const Instruction *Inst = cast<Instruction>(U); + SmallPtrSet<const Value *, 4> Visited; + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(CI); - if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) { - continue; // Fine, ignore. 
- } + while (!Worklist.empty()) { + const Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; - if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (SI->getOperand(0) == V && SI->getOperand(1) != GV) - return false; // Storing the pointer itself... bad. - continue; // Otherwise, storing through it, or storing into GV... fine. - } + for (const Use &VUse : V->uses()) { + const User *U = VUse.getUser(); + if (isa<LoadInst>(U) || isa<CmpInst>(U)) + continue; // Fine, ignore. - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) { - if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV)) - return false; - continue; - } + if (auto *SI = dyn_cast<StoreInst>(U)) { + if (SI->getValueOperand() == V && + SI->getPointerOperand()->stripPointerCasts() != GV) + return false; // Storing the pointer not into GV... bad. + continue; // Otherwise, storing through it, or storing into GV... fine. + } - return false; + if (auto *BCI = dyn_cast<BitCastInst>(U)) { + Worklist.push_back(BCI); + continue; + } + + if (auto *GEPI = dyn_cast<GetElementPtrInst>(U)) { + Worklist.push_back(GEPI); + continue; + } + + return false; + } } + return true; } @@ -1066,12 +1113,12 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, // been reached). To do this, we check to see if all uses of the global // would trap if the global were null: this proves that they must all // happen after the malloc. - if (!AllUsesOfLoadedValueWillTrapIfNull(GV)) + if (!allUsesOfLoadedValueWillTrapIfNull(GV)) return false; // We can't optimize this if the malloc itself is used in a complex way, // for example, being stored into multiple globals. This allows the - // malloc to be stored into the specified global, loaded icmp'd. + // malloc to be stored into the specified global, loaded, gep, icmp'd. // These are all things we could transform to using the global for. if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV)) return false; @@ -1112,6 +1159,7 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, // value was null. if (GV->getInitializer()->getType()->isPointerTy() && GV->getInitializer()->isNullValue() && + StoredOnceVal->getType()->isPointerTy() && !NullPointerIsDefined( nullptr /* F */, GV->getInitializer()->getType()->getPointerAddressSpace())) { @@ -1442,8 +1490,7 @@ static void makeAllConstantUsesInstructions(Constant *C) { append_range(UUsers, U->users()); for (auto *UU : UUsers) { Instruction *UI = cast<Instruction>(UU); - Instruction *NewU = U->getAsInstruction(); - NewU->insertBefore(UI); + Instruction *NewU = U->getAsInstruction(UI); UI->replaceUsesOfWith(U, NewU); } // We've replaced all the uses, so destroy the constant. (destroyConstant @@ -1456,6 +1503,7 @@ static void makeAllConstantUsesInstructions(Constant *C) { /// it if possible. If we make a change, return true. 
static bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, + function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree) { auto &DL = GV->getParent()->getDataLayout(); @@ -1554,43 +1602,57 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, if (SRAGlobal(GV, DL)) return true; } - if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) { + Value *StoredOnceValue = GS.getStoredOnceValue(); + if (GS.StoredType == GlobalStatus::StoredOnce && StoredOnceValue) { + // Avoid speculating constant expressions that might trap (div/rem). + auto *SOVConstant = dyn_cast<Constant>(StoredOnceValue); + if (SOVConstant && SOVConstant->canTrap()) + return Changed; + + Function &StoreFn = + const_cast<Function &>(*GS.StoredOnceStore->getFunction()); + bool CanHaveNonUndefGlobalInitializer = + GetTTI(StoreFn).canHaveNonUndefGlobalInitializerInAddressSpace( + GV->getType()->getAddressSpace()); // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the // global. This allows us to mark it constant. - if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) - if (isa<UndefValue>(GV->getInitializer())) { - // Change the initial value here. - GV->setInitializer(SOVConstant); - - // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); - - if (GV->use_empty()) { - LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " - << "simplify all users and delete global!\n"); - GV->eraseFromParent(); - ++NumDeleted; - } - ++NumSubstitute; - return true; + // This is restricted to address spaces that allow globals to have + // initializers. NVPTX, for example, does not support initializers for + // shared memory (AS 3). + if (SOVConstant && SOVConstant->getType() == GV->getValueType() && + isa<UndefValue>(GV->getInitializer()) && + CanHaveNonUndefGlobalInitializer) { + // Change the initial value here. + GV->setInitializer(SOVConstant); + + // Clean up any obviously simplifiable users now. + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + + if (GV->use_empty()) { + LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " + << "simplify all users and delete global!\n"); + GV->eraseFromParent(); + ++NumDeleted; } + ++NumSubstitute; + return true; + } // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, - GetTLI)) + if (optimizeOnceStoredGlobal(GV, StoredOnceValue, GS.Ordering, DL, GetTLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a - // boolean. - if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) { - if (GS.Ordering == AtomicOrdering::NotAtomic) { - if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { - ++NumShrunkToBool; - return true; - } + // boolean. Skip this optimization for AS that doesn't allow an initializer. 
+ if (SOVConstant && GS.Ordering == AtomicOrdering::NotAtomic && + (!isa<UndefValue>(GV->getInitializer()) || + CanHaveNonUndefGlobalInitializer)) { + if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { + ++NumShrunkToBool; + return true; } } } @@ -1602,6 +1664,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, /// make a change, return true. static bool processGlobal(GlobalValue &GV, + function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree) { if (GV.getName().startswith("llvm.")) @@ -1634,7 +1697,8 @@ processGlobal(GlobalValue &GV, if (GVar->isConstant() || !GVar->hasInitializer()) return Changed; - return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed; + return processInternalGlobal(GVar, GS, GetTTI, GetTLI, LookupDomTree) || + Changed; } /// Walk all of the direct calls of the specified function, changing them to @@ -1651,7 +1715,7 @@ static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs, Attribute::AttrKind A) { unsigned AttrIndex; if (Attrs.hasAttrSomewhere(A, &AttrIndex)) - return Attrs.removeAttribute(C, AttrIndex, A); + return Attrs.removeAttributeAtIndex(C, AttrIndex, A); return Attrs; } @@ -1864,10 +1928,8 @@ static void RemovePreallocated(Function *F) { Value *AllocaReplacement = ArgAllocas[AllocArgIndex]; if (!AllocaReplacement) { auto AddressSpace = UseCall->getType()->getPointerAddressSpace(); - auto *ArgType = UseCall - ->getAttribute(AttributeList::FunctionIndex, - Attribute::Preallocated) - .getValueAsType(); + auto *ArgType = + UseCall->getFnAttr(Attribute::Preallocated).getValueAsType(); auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction(); Builder.SetInsertPoint(InsertBefore); auto *Alloca = @@ -1897,26 +1959,22 @@ OptimizeFunctions(Module &M, bool Changed = false; std::vector<Function *> AllCallsCold; - for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) { - Function *F = &*FI++; - if (hasOnlyColdCalls(*F, GetBFI)) - AllCallsCold.push_back(F); - } + for (Function &F : llvm::make_early_inc_range(M)) + if (hasOnlyColdCalls(F, GetBFI)) + AllCallsCold.push_back(&F); // Optimize functions. - for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { - Function *F = &*FI++; - + for (Function &F : llvm::make_early_inc_range(M)) { // Don't perform global opt pass on naked functions; we don't want fast // calling conventions for naked functions. - if (F->hasFnAttribute(Attribute::Naked)) + if (F.hasFnAttribute(Attribute::Naked)) continue; // Functions without names cannot be referenced outside this module. - if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) - F->setLinkage(GlobalValue::InternalLinkage); + if (!F.hasName() && !F.isDeclaration() && !F.hasLocalLinkage()) + F.setLinkage(GlobalValue::InternalLinkage); - if (deleteIfDead(*F, NotDiscardableComdats)) { + if (deleteIfDead(F, NotDiscardableComdats)) { Changed = true; continue; } @@ -1931,17 +1989,17 @@ OptimizeFunctions(Module &M, // some more complicated logic to break these cycles. // Removing unreachable blocks might invalidate the dominator so we // recalculate it. 
- if (!F->isDeclaration()) { - if (removeUnreachableBlocks(*F)) { - auto &DT = LookupDomTree(*F); - DT.recalculate(*F); + if (!F.isDeclaration()) { + if (removeUnreachableBlocks(F)) { + auto &DT = LookupDomTree(F); + DT.recalculate(F); Changed = true; } } - Changed |= processGlobal(*F, GetTLI, LookupDomTree); + Changed |= processGlobal(F, GetTTI, GetTLI, LookupDomTree); - if (!F->hasLocalLinkage()) + if (!F.hasLocalLinkage()) continue; // If we have an inalloca parameter that we can safely remove the @@ -1949,56 +2007,55 @@ OptimizeFunctions(Module &M, // wouldn't be safe in the presence of inalloca. // FIXME: We should also hoist alloca affected by this to the entry // block if possible. - if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) && - !F->hasAddressTaken() && !hasMustTailCallers(F)) { - RemoveAttribute(F, Attribute::InAlloca); + if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) && + !F.hasAddressTaken() && !hasMustTailCallers(&F)) { + RemoveAttribute(&F, Attribute::InAlloca); Changed = true; } // FIXME: handle invokes // FIXME: handle musttail - if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) { - if (!F->hasAddressTaken() && !hasMustTailCallers(F) && - !hasInvokeCallers(F)) { - RemovePreallocated(F); + if (F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) { + if (!F.hasAddressTaken() && !hasMustTailCallers(&F) && + !hasInvokeCallers(&F)) { + RemovePreallocated(&F); Changed = true; } continue; } - if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) { + if (hasChangeableCC(&F) && !F.isVarArg() && !F.hasAddressTaken()) { NumInternalFunc++; - TargetTransformInfo &TTI = GetTTI(*F); + TargetTransformInfo &TTI = GetTTI(F); // Change the calling convention to coldcc if either stress testing is // enabled or the target would like to use coldcc on functions which are // cold at all call sites and the callers contain no other non coldcc // calls. if (EnableColdCCStressTest || - (TTI.useColdCCForColdCall(*F) && - isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) { - F->setCallingConv(CallingConv::Cold); - changeCallSitesToColdCC(F); + (TTI.useColdCCForColdCall(F) && + isValidCandidateForColdCC(F, GetBFI, AllCallsCold))) { + F.setCallingConv(CallingConv::Cold); + changeCallSitesToColdCC(&F); Changed = true; NumColdCC++; } } - if (hasChangeableCC(F) && !F->isVarArg() && - !F->hasAddressTaken()) { + if (hasChangeableCC(&F) && !F.isVarArg() && !F.hasAddressTaken()) { // If this function has a calling convention worth changing, is not a // varargs function, and is only called directly, promote it to use the // Fast calling convention. - F->setCallingConv(CallingConv::Fast); - ChangeCalleesToFastCall(F); + F.setCallingConv(CallingConv::Fast); + ChangeCalleesToFastCall(&F); ++NumFastCallFns; Changed = true; } - if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && - !F->hasAddressTaken()) { + if (F.getAttributes().hasAttrSomewhere(Attribute::Nest) && + !F.hasAddressTaken()) { // The function is not used by a trampoline intrinsic, so it is safe // to remove the 'nest' attribute. 
- RemoveAttribute(F, Attribute::Nest); + RemoveAttribute(&F, Attribute::Nest); ++NumNestRemoved; Changed = true; } @@ -2008,35 +2065,34 @@ OptimizeFunctions(Module &M, static bool OptimizeGlobalVars(Module &M, + function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) { bool Changed = false; - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ) { - GlobalVariable *GV = &*GVI++; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { // Global variables without names cannot be referenced outside this module. - if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) - GV->setLinkage(GlobalValue::InternalLinkage); + if (!GV.hasName() && !GV.isDeclaration() && !GV.hasLocalLinkage()) + GV.setLinkage(GlobalValue::InternalLinkage); // Simplify the initializer. - if (GV->hasInitializer()) - if (auto *C = dyn_cast<Constant>(GV->getInitializer())) { + if (GV.hasInitializer()) + if (auto *C = dyn_cast<Constant>(GV.getInitializer())) { auto &DL = M.getDataLayout(); // TLI is not used in the case of a Constant, so use default nullptr // for that optional parameter, since we don't have a Function to // provide GetTLI anyway. Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr); if (New != C) - GV->setInitializer(New); + GV.setInitializer(New); } - if (deleteIfDead(*GV, NotDiscardableComdats)) { + if (deleteIfDead(GV, NotDiscardableComdats)) { Changed = true; continue; } - Changed |= processGlobal(*GV, GetTLI, LookupDomTree); + Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree); } return Changed; } @@ -2425,24 +2481,21 @@ OptimizeGlobalAliases(Module &M, for (GlobalValue *GV : Used.used()) Used.compilerUsedErase(GV); - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E;) { - GlobalAlias *J = &*I++; - + for (GlobalAlias &J : llvm::make_early_inc_range(M.aliases())) { // Aliases without names cannot be referenced outside this module. - if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) - J->setLinkage(GlobalValue::InternalLinkage); + if (!J.hasName() && !J.isDeclaration() && !J.hasLocalLinkage()) + J.setLinkage(GlobalValue::InternalLinkage); - if (deleteIfDead(*J, NotDiscardableComdats)) { + if (deleteIfDead(J, NotDiscardableComdats)) { Changed = true; continue; } // If the alias can change at link time, nothing can be done - bail out. - if (J->isInterposable()) + if (J.isInterposable()) continue; - Constant *Aliasee = J->getAliasee(); + Constant *Aliasee = J.getAliasee(); GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts()); // We can't trivially replace the alias with the aliasee if the aliasee is // non-trivial in some way. We also can't replace the alias with the aliasee @@ -2455,31 +2508,31 @@ OptimizeGlobalAliases(Module &M, // Make all users of the alias use the aliasee instead. bool RenameTarget; - if (!hasUsesToReplace(*J, Used, RenameTarget)) + if (!hasUsesToReplace(J, Used, RenameTarget)) continue; - J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType())); + J.replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J.getType())); ++NumAliasesResolved; Changed = true; if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. 
- Target->takeName(&*J); - Target->setLinkage(J->getLinkage()); - Target->setDSOLocal(J->isDSOLocal()); - Target->setVisibility(J->getVisibility()); - Target->setDLLStorageClass(J->getDLLStorageClass()); + Target->takeName(&J); + Target->setLinkage(J.getLinkage()); + Target->setDSOLocal(J.isDSOLocal()); + Target->setVisibility(J.getVisibility()); + Target->setDLLStorageClass(J.getDLLStorageClass()); - if (Used.usedErase(&*J)) + if (Used.usedErase(&J)) Used.usedInsert(Target); - if (Used.compilerUsedErase(&*J)) + if (Used.compilerUsedErase(&J)) Used.compilerUsedInsert(Target); - } else if (mayHaveOtherReferences(*J, Used)) + } else if (mayHaveOtherReferences(J, Used)) continue; // Delete the alias. - M.getAliasList().erase(J); + M.getAliasList().erase(&J); ++NumAliasesRemoved; Changed = true; } @@ -2526,7 +2579,7 @@ static bool cxxDtorIsEmpty(const Function &Fn) { return false; for (auto &I : Fn.getEntryBlock()) { - if (isa<DbgInfoIntrinsic>(I)) + if (I.isDebugOrPseudoInst()) continue; if (isa<ReturnInst>(I)) return true; @@ -2552,12 +2605,11 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { // and remove them. bool Changed = false; - for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end(); - I != E;) { + for (User *U : llvm::make_early_inc_range(CXAAtExitFn->users())) { // We're only interested in calls. Theoretically, we could handle invoke // instructions as well, but neither llvm-gcc nor clang generate invokes // to __cxa_atexit. - CallInst *CI = dyn_cast<CallInst>(*I++); + CallInst *CI = dyn_cast<CallInst>(U); if (!CI) continue; @@ -2614,8 +2666,8 @@ static bool optimizeGlobalsInModule( }); // Optimize non-address-taken globals. - LocalChange |= - OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats); + LocalChange |= OptimizeGlobalVars(M, GetTTI, GetTLI, LookupDomTree, + NotDiscardableComdats); // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp index 365b269dc3bf..e7d698c42fcf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -154,11 +154,8 @@ static bool splitGlobals(Module &M) { return false; bool Changed = false; - for (auto I = M.global_begin(); I != M.global_end();) { - GlobalVariable &GV = *I; - ++I; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) Changed |= splitGlobal(GV); - } return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp index adf9ffba5780..b8a314c54f18 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DIBuilder.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" @@ -33,6 +34,10 @@ using namespace llvm; using namespace IRSimilarity; +// A command flag to be used for debugging to exclude branches from similarity +// matching and outlining. +extern cl::opt<bool> DisableBranches; + // Set to true if the user wants the ir outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr // functions. 
Since the outliner is confined to a single module (modulo LTO), @@ -71,8 +76,12 @@ struct OutlinableGroup { /// for extraction. bool IgnoreGroup = false; - /// The return block for the overall function. - BasicBlock *EndBB = nullptr; + /// The return blocks for the overall function. + DenseMap<Value *, BasicBlock *> EndBBs; + + /// The PHIBlocks with their corresponding return block based on the return + /// value as the key. + DenseMap<Value *, BasicBlock *> PHIBlocks; /// A set containing the different GVN store sets needed. Each array contains /// a sorted list of the different values that need to be stored into output @@ -87,6 +96,14 @@ struct OutlinableGroup { /// index in ArgumentTypes is an output argument. unsigned NumAggregateInputs = 0; + /// The mapping of the canonical numbering of the values in outlined sections + /// to specific arguments. + DenseMap<unsigned, unsigned> CanonicalNumberToAggArg; + + /// The number of branches in the region target a basic block that is outside + /// of the region. + unsigned BranchesToOutside = 0; + /// The number of instructions that will be outlined by extracting \ref /// Regions. InstructionCost Benefit = 0; @@ -118,20 +135,67 @@ struct OutlinableGroup { /// \param SourceBB - the BasicBlock to pull Instructions from. /// \param TargetBB - the BasicBlock to put Instruction into. static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB) { - BasicBlock::iterator BBCurr, BBEnd, BBNext; - for (BBCurr = SourceBB.begin(), BBEnd = SourceBB.end(); BBCurr != BBEnd; - BBCurr = BBNext) { - BBNext = std::next(BBCurr); - BBCurr->moveBefore(TargetBB, TargetBB.end()); - } + for (Instruction &I : llvm::make_early_inc_range(SourceBB)) + I.moveBefore(TargetBB, TargetBB.end()); +} + +/// A function to sort the keys of \p Map, which must be a mapping of constant +/// values to basic blocks and return it in \p SortedKeys +/// +/// \param SortedKeys - The vector the keys will be return in and sorted. +/// \param Map - The DenseMap containing keys to sort. +static void getSortedConstantKeys(std::vector<Value *> &SortedKeys, + DenseMap<Value *, BasicBlock *> &Map) { + for (auto &VtoBB : Map) + SortedKeys.push_back(VtoBB.first); + + stable_sort(SortedKeys, [](const Value *LHS, const Value *RHS) { + const ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS); + const ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS); + assert(RHSC && "Not a constant integer in return value?"); + assert(LHSC && "Not a constant integer in return value?"); + + return LHSC->getLimitedValue() < RHSC->getLimitedValue(); + }); +} + +Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other, + Value *V) { + Optional<unsigned> GVN = Candidate->getGVN(V); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional<unsigned> CanonNum = Candidate->getCanonicalNum(*GVN); + Optional<unsigned> FirstGVN = Other.Candidate->fromCanonicalNum(*CanonNum); + Optional<Value *> FoundValueOpt = Other.Candidate->fromGVN(*FirstGVN); + return FoundValueOpt.getValueOr(nullptr); } void OutlinableRegion::splitCandidate() { assert(!CandidateSplit && "Candidate already split!"); + Instruction *BackInst = Candidate->backInstruction(); + + Instruction *EndInst = nullptr; + // Check whether the last instruction is a terminator, if it is, we do + // not split on the following instruction. We leave the block as it is. 
We + // also check that this is not the last instruction in the Module, otherwise + // the check for whether the current following instruction matches the + // previously recorded instruction will be incorrect. + if (!BackInst->isTerminator() || + BackInst->getParent() != &BackInst->getFunction()->back()) { + EndInst = Candidate->end()->Inst; + assert(EndInst && "Expected an end instruction?"); + } + + // We check if the current instruction following the last instruction in the + // region is the same as the recorded instruction following the last + // instruction. If they do not match, there could be problems in rewriting + // the program after outlining, so we ignore it. + if (!BackInst->isTerminator() && + EndInst != BackInst->getNextNonDebugInstruction()) + return; + Instruction *StartInst = (*Candidate->begin()).Inst; - Instruction *EndInst = (*Candidate->end()).Inst; - assert(StartInst && EndInst && "Expected a start and end instruction?"); + assert(StartInst && "Expected a start instruction?"); StartBB = StartInst->getParent(); PrevBB = StartBB; @@ -153,13 +217,20 @@ void OutlinableRegion::splitCandidate() { std::string OriginalName = PrevBB->getName().str(); StartBB = PrevBB->splitBasicBlock(StartInst, OriginalName + "_to_outline"); - - // This is the case for the inner block since we do not have to include - // multiple blocks. - EndBB = StartBB; - FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + PrevBB->replaceSuccessorsPhiUsesWith(PrevBB, StartBB); CandidateSplit = true; + if (!BackInst->isTerminator()) { + EndBB = EndInst->getParent(); + FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + EndBB->replaceSuccessorsPhiUsesWith(EndBB, FollowBB); + FollowBB->replaceSuccessorsPhiUsesWith(PrevBB, FollowBB); + return; + } + + EndBB = BackInst->getParent(); + EndsInBranch = true; + FollowBB = nullptr; } void OutlinableRegion::reattachCandidate() { @@ -180,7 +251,6 @@ void OutlinableRegion::reattachCandidate() { // inst3 // inst4 assert(StartBB != nullptr && "StartBB for Candidate is not defined!"); - assert(FollowBB != nullptr && "StartBB for Candidate is not defined!"); // StartBB should only have one predecessor since we put an unconditional // branch at the end of PrevBB when we split the BasicBlock. @@ -189,21 +259,24 @@ void OutlinableRegion::reattachCandidate() { "No Predecessor for the region start basic block!"); assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!"); - assert(EndBB->getTerminator() && "Terminator removed from EndBB!"); PrevBB->getTerminator()->eraseFromParent(); - EndBB->getTerminator()->eraseFromParent(); moveBBContents(*StartBB, *PrevBB); BasicBlock *PlacementBB = PrevBB; if (StartBB != EndBB) PlacementBB = EndBB; - moveBBContents(*FollowBB, *PlacementBB); + if (!EndsInBranch && PlacementBB->getUniqueSuccessor() != nullptr) { + assert(FollowBB != nullptr && "FollowBB for Candidate is not defined!"); + assert(PlacementBB->getTerminator() && "Terminator removed from EndBB!"); + PlacementBB->getTerminator()->eraseFromParent(); + moveBBContents(*FollowBB, *PlacementBB); + PlacementBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); + FollowBB->eraseFromParent(); + } PrevBB->replaceSuccessorsPhiUsesWith(StartBB, PrevBB); - PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); StartBB->eraseFromParent(); - FollowBB->eraseFromParent(); // Make sure to save changes back to the StartBB. 
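
getSortedConstantKeys, added near the top of this IROutliner diff, exists to give the map of return values to blocks a deterministic ordering before blocks, names, or indices are generated from it, since hash-map iteration order is not stable. The same idea with standard containers (hypothetical key type, not the ConstantInt-based version above):

#include <algorithm>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Hash-map iteration order is unspecified, so derive a sorted key vector
// before doing anything order-sensitive (naming blocks, assigning slots).
static std::vector<int>
sortedKeys(const std::unordered_map<int, std::string> &Map) {
  std::vector<int> Keys;
  Keys.reserve(Map.size());
  for (const auto &KV : Map)
    Keys.push_back(KV.first);
  std::sort(Keys.begin(), Keys.end());
  return Keys;
}

int main() {
  std::unordered_map<int, std::string> RetValToBlock = {
      {2, "output_block_2"}, {0, "output_block_0"}, {1, "output_block_1"}};
  unsigned Idx = 0;
  for (int Key : sortedKeys(RetValToBlock))
    std::printf("slot %u -> %s\n", Idx++, RetValToBlock.at(Key).c_str());
  // Always prints slots 0, 1, 2 in the same order, regardless of hashing.
  return 0;
}
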
StartBB = PrevBB; @@ -261,8 +334,9 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) { // division instruction for targets that have a native division instruction. // To be overly conservative, we only add 1 to the number of instructions for // each division instruction. - for (Instruction &I : *StartBB) { - switch (I.getOpcode()) { + for (IRInstructionData &ID : *Candidate) { + Instruction *I = ID.Inst; + switch (I->getOpcode()) { case Instruction::FDiv: case Instruction::FRem: case Instruction::SDiv: @@ -272,7 +346,7 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) { Benefit += 1; break; default: - Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); + Benefit += TTI.getInstructionCost(I, TargetTransformInfo::TCK_CodeSize); break; } } @@ -373,8 +447,24 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, unsigned FunctionNameSuffix) { assert(!Group.OutlinedFunction && "Function is already defined!"); + Type *RetTy = Type::getVoidTy(M.getContext()); + // All extracted functions _should_ have the same return type at this point + // since the similarity identifier ensures that all branches outside of the + // region occur in the same place. + + // NOTE: Should we ever move to the model that uses a switch at every point + // needed, meaning that we could branch within the region or out, it is + // possible that we will need to switch to using the most general case all of + // the time. + for (OutlinableRegion *R : Group.Regions) { + Type *ExtractedFuncType = R->ExtractedFunction->getReturnType(); + if ((RetTy->isVoidTy() && !ExtractedFuncType->isVoidTy()) || + (RetTy->isIntegerTy(1) && ExtractedFuncType->isIntegerTy(16))) + RetTy = ExtractedFuncType; + } + Group.OutlinedFunctionType = FunctionType::get( - Type::getVoidTy(M.getContext()), Group.ArgumentTypes, false); + RetTy, Group.ArgumentTypes, false); // These functions will only be called from within the same module, so // we can set an internal linkage. @@ -430,21 +520,23 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, /// /// \param [in] Old - The function to move the basic blocks from. /// \param [in] New - The function to move the basic blocks to. -/// \returns the first return block for the function in New. -static BasicBlock *moveFunctionData(Function &Old, Function &New) { - Function::iterator CurrBB, NextBB, FinalBB; - BasicBlock *NewEnd = nullptr; - std::vector<Instruction *> DebugInsts; - for (CurrBB = Old.begin(), FinalBB = Old.end(); CurrBB != FinalBB; - CurrBB = NextBB) { - NextBB = std::next(CurrBB); - CurrBB->removeFromParent(); - CurrBB->insertInto(&New); - Instruction *I = CurrBB->getTerminator(); - if (isa<ReturnInst>(I)) - NewEnd = &(*CurrBB); - - for (Instruction &Val : *CurrBB) { +/// \param [out] NewEnds - The return blocks of the new overall function. +static void moveFunctionData(Function &Old, Function &New, + DenseMap<Value *, BasicBlock *> &NewEnds) { + for (BasicBlock &CurrBB : llvm::make_early_inc_range(Old)) { + CurrBB.removeFromParent(); + CurrBB.insertInto(&New); + Instruction *I = CurrBB.getTerminator(); + + // For each block we find a return instruction is, it is a potential exit + // path for the function. We keep track of each block based on the return + // value here. 
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) + NewEnds.insert(std::make_pair(RI->getReturnValue(), &CurrBB)); + + std::vector<Instruction *> DebugInsts; + + for (Instruction &Val : CurrBB) { // We must handle the scoping of called functions differently than // other outlined instructions. if (!isa<CallInst>(&Val)) { @@ -476,8 +568,7 @@ static BasicBlock *moveFunctionData(Function &Old, Function &New) { I->eraseFromParent(); } - assert(NewEnd && "No return instruction for new function?"); - return NewEnd; + assert(NewEnds.size() > 0 && "No return instruction for new function?"); } /// Find the the constants that will need to be lifted into arguments @@ -664,11 +755,22 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, // function to account for the extracted constants, we have two different // counters as we find extracted arguments, and as we come across overall // arguments. + + // Additionally, in our first pass, for the first extracted function, + // we find argument locations for the canonical value numbering. This + // numbering overrides any discovered location for the extracted code. for (unsigned InputVal : InputGVNs) { + Optional<unsigned> CanonicalNumberOpt = C.getCanonicalNum(InputVal); + assert(CanonicalNumberOpt.hasValue() && "Canonical number not found?"); + unsigned CanonicalNumber = CanonicalNumberOpt.getValue(); + Optional<Value *> InputOpt = C.fromGVN(InputVal); assert(InputOpt.hasValue() && "Global value number not found?"); Value *Input = InputOpt.getValue(); + DenseMap<unsigned, unsigned>::iterator AggArgIt = + Group.CanonicalNumberToAggArg.find(CanonicalNumber); + if (!Group.InputTypesSet) { Group.ArgumentTypes.push_back(Input->getType()); // If the input value has a swifterr attribute, make sure to mark the @@ -684,17 +786,34 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, // Check if we have a constant. If we do add it to the overall argument // number to Constant map for the region, and continue to the next input. if (Constant *CST = dyn_cast<Constant>(Input)) { - Region.AggArgToConstant.insert(std::make_pair(TypeIndex, CST)); + if (AggArgIt != Group.CanonicalNumberToAggArg.end()) + Region.AggArgToConstant.insert(std::make_pair(AggArgIt->second, CST)); + else { + Group.CanonicalNumberToAggArg.insert( + std::make_pair(CanonicalNumber, TypeIndex)); + Region.AggArgToConstant.insert(std::make_pair(TypeIndex, CST)); + } TypeIndex++; continue; } // It is not a constant, we create the mapping from extracted argument list - // to the overall argument list. + // to the overall argument list, using the canonical location, if it exists. 
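
The input-mapping logic above gives each canonical value number a fixed slot in the aggregate argument list the first time it is seen; later regions must reuse that slot and record, via ChangedArgOrder, when their own extraction order disagrees. A compact standalone model of that first-assignment-wins slot allocation (plain C++, hypothetical names):

#include <cstdio>
#include <unordered_map>
#include <vector>

// Assign each canonical number a stable aggregate-argument slot. The first
// region to mention a number fixes its slot; later regions just look it up.
struct SlotAssigner {
  std::unordered_map<unsigned, unsigned> CanonToSlot;

  // Returns the slot for this canonical number and reports (via Reordered)
  // whether it differs from where the caller's own extraction put the value.
  unsigned slotFor(unsigned CanonicalNumber, unsigned LocalIndex,
                   bool &Reordered) {
    auto It = CanonToSlot.find(CanonicalNumber);
    if (It == CanonToSlot.end())
      It = CanonToSlot.emplace(CanonicalNumber, CanonToSlot.size()).first;
    Reordered |= (It->second != LocalIndex);
    return It->second;
  }
};

int main() {
  SlotAssigner Slots;
  bool R1 = false, R2 = false;
  // Region 1 sees canonical numbers 7 and 9 in that order: slots 0 and 1.
  std::vector<unsigned> Region1 = {Slots.slotFor(7, 0, R1),
                                   Slots.slotFor(9, 1, R1)};
  // Region 2 extracted them in the opposite order, so it is "reordered".
  std::vector<unsigned> Region2 = {Slots.slotFor(9, 0, R2),
                                   Slots.slotFor(7, 1, R2)};
  std::printf("region1 reordered: %d, region2 reordered: %d\n", R1, R2);
  std::printf("region2 arg0 -> slot %u, arg1 -> slot %u\n", Region2[0],
              Region2[1]);
  return 0;
}
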
assert(ArgInputs.count(Input) && "Input cannot be found!"); - Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, TypeIndex)); - Region.AggArgToExtracted.insert(std::make_pair(TypeIndex, OriginalIndex)); + if (AggArgIt != Group.CanonicalNumberToAggArg.end()) { + if (OriginalIndex != AggArgIt->second) + Region.ChangedArgOrder = true; + Region.ExtractedArgToAgg.insert( + std::make_pair(OriginalIndex, AggArgIt->second)); + Region.AggArgToExtracted.insert( + std::make_pair(AggArgIt->second, OriginalIndex)); + } else { + Group.CanonicalNumberToAggArg.insert( + std::make_pair(CanonicalNumber, TypeIndex)); + Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, TypeIndex)); + Region.AggArgToExtracted.insert(std::make_pair(TypeIndex, OriginalIndex)); + } OriginalIndex++; TypeIndex++; } @@ -718,10 +837,41 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, /// \param [in] Outputs - The values found by the code extractor. static void findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, - ArrayRef<Value *> Outputs) { + SetVector<Value *> &Outputs) { OutlinableGroup &Group = *Region.Parent; IRSimilarityCandidate &C = *Region.Candidate; + SmallVector<BasicBlock *> BE; + DenseSet<BasicBlock *> BBSet; + C.getBasicBlocks(BBSet, BE); + + // Find the exits to the region. + SmallPtrSet<BasicBlock *, 1> Exits; + for (BasicBlock *Block : BE) + for (BasicBlock *Succ : successors(Block)) + if (!BBSet.contains(Succ)) + Exits.insert(Succ); + + // After determining which blocks exit to PHINodes, we add these PHINodes to + // the set of outputs to be processed. We also check the incoming values of + // the PHINodes for whether they should no longer be considered outputs. + for (BasicBlock *ExitBB : Exits) { + for (PHINode &PN : ExitBB->phis()) { + // Find all incoming values from the outlining region. + SmallVector<unsigned, 2> IncomingVals; + for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx) + if (BBSet.contains(PN.getIncomingBlock(Idx))) + IncomingVals.push_back(Idx); + + // Do not process PHI if there is one (or fewer) predecessor from region. + if (IncomingVals.size() <= 1) + continue; + + Region.IgnoreRegion = true; + return; + } + } + // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -797,7 +947,7 @@ void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region, // Map the outputs found by the CodeExtractor to the arguments found for // the overall function. - findExtractedOutputToOverallOutputMapping(Region, Outputs.getArrayRef()); + findExtractedOutputToOverallOutputMapping(Region, Outputs); } /// Replace the extracted function in the Region with a call to the overall @@ -820,9 +970,10 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { assert(AggFunc && "Function to replace with is nullptr?"); // If the arguments are the same size, there are not values that need to be - // made argument, or different output registers to handle. We can simply - // replace the called function in this case. - if (AggFunc->arg_size() == Call->arg_size()) { + // made into an argument, the argument ordering has not been change, or + // different output registers to handle. We can simply replace the called + // function in this case. 
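
findExtractedOutputToOverallOutputMapping, earlier in this hunk, starts by computing the exits of the outlined region: successors of region blocks that are not themselves in the region, whose PHI nodes are then inspected. The exit computation itself is a small generic graph step; a standalone sketch over an adjacency list (plain C++, hypothetical block IDs):

#include <cstdio>
#include <set>
#include <vector>

// Successor lists for a tiny CFG: block i branches to the blocks in Succ[i].
static const std::vector<std::vector<int>> Succ = {
    /*0*/ {1, 2}, /*1*/ {3}, /*2*/ {3}, /*3*/ {}};

// Exits of a region = successors of region blocks that lie outside it.
static std::set<int> regionExits(const std::set<int> &Region) {
  std::set<int> Exits;
  for (int B : Region)
    for (int S : Succ[B])
      if (!Region.count(S))
        Exits.insert(S);
  return Exits;
}

int main() {
  std::set<int> Region = {0, 1, 2}; // block 3 is outside the region
  for (int E : regionExits(Region))
    std::printf("exit block: %d\n", E); // prints: exit block: 3
  return 0;
}
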
+ if (!Region.ChangedArgOrder && AggFunc->arg_size() == Call->arg_size()) { LLVM_DEBUG(dbgs() << "Replace call to " << *Call << " with call to " << *AggFunc << " with same number of arguments\n"); Call->setCalledFunction(AggFunc); @@ -895,6 +1046,9 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { // Transfer any debug information. Call->setDebugLoc(Region.Call->getDebugLoc()); + // Since our output may determine which branch we go to, we make sure to + // propogate this new call value through the module. + OldCall->replaceAllUsesWith(Call); // Remove the old instruction. OldCall->eraseFromParent(); @@ -913,13 +1067,23 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { // region with the arguments of the function for an OutlinableGroup. // /// \param [in] Region - The region of extracted code to be changed. -/// \param [in,out] OutputBB - The BasicBlock for the output stores for this +/// \param [in,out] OutputBBs - The BasicBlock for the output stores for this /// region. -static void replaceArgumentUses(OutlinableRegion &Region, - BasicBlock *OutputBB) { +/// \param [in] FirstFunction - A flag to indicate whether we are using this +/// function to define the overall outlined function for all the regions, or +/// if we are operating on one of the following regions. +static void +replaceArgumentUses(OutlinableRegion &Region, + DenseMap<Value *, BasicBlock *> &OutputBBs, + bool FirstFunction = false) { OutlinableGroup &Group = *Region.Parent; assert(Region.ExtractedFunction && "Region has no extracted function?"); + Function *DominatingFunction = Region.ExtractedFunction; + if (FirstFunction) + DominatingFunction = Group.OutlinedFunction; + DominatorTree DT(*DominatingFunction); + for (unsigned ArgIdx = 0; ArgIdx < Region.ExtractedFunction->arg_size(); ArgIdx++) { assert(Region.ExtractedArgToAgg.find(ArgIdx) != @@ -946,11 +1110,53 @@ static void replaceArgumentUses(OutlinableRegion &Region, assert(InstAsUser && "User is nullptr!"); Instruction *I = cast<Instruction>(InstAsUser); - I->setDebugLoc(DebugLoc()); - LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " - << *OutputBB << "\n"); + BasicBlock *BB = I->getParent(); + SmallVector<BasicBlock *, 4> Descendants; + DT.getDescendants(BB, Descendants); + bool EdgeAdded = false; + if (Descendants.size() == 0) { + EdgeAdded = true; + DT.insertEdge(&DominatingFunction->getEntryBlock(), BB); + DT.getDescendants(BB, Descendants); + } + + // Iterate over the following blocks, looking for return instructions, + // if we find one, find the corresponding output block for the return value + // and move our store instruction there. + for (BasicBlock *DescendBB : Descendants) { + ReturnInst *RI = dyn_cast<ReturnInst>(DescendBB->getTerminator()); + if (!RI) + continue; + Value *RetVal = RI->getReturnValue(); + auto VBBIt = OutputBBs.find(RetVal); + assert(VBBIt != OutputBBs.end() && "Could not find output value!"); + + // If this is storing a PHINode, we must make sure it is included in the + // overall function. 
+ StoreInst *SI = cast<StoreInst>(I); + + Value *ValueOperand = SI->getValueOperand(); + + StoreInst *NewI = cast<StoreInst>(I->clone()); + NewI->setDebugLoc(DebugLoc()); + BasicBlock *OutputBB = VBBIt->second; + OutputBB->getInstList().push_back(NewI); + LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " + << *OutputBB << "\n"); - I->moveBefore(*OutputBB, OutputBB->end()); + if (FirstFunction) + continue; + Value *CorrVal = + Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand); + assert(CorrVal && "Value is nullptr?"); + NewI->setOperand(0, CorrVal); + } + + // If we added an edge for basic blocks without a predecessor, we remove it + // here. + if (EdgeAdded) + DT.deleteEdge(&DominatingFunction->getEntryBlock(), BB); + I->eraseFromParent(); LLVM_DEBUG(dbgs() << "Replacing uses of output " << *Arg << " in function " << *Region.ExtractedFunction << " with " << *AggArg @@ -990,69 +1196,53 @@ void replaceConstants(OutlinableRegion &Region) { } } -/// For the given function, find all the nondebug or lifetime instructions, -/// and return them as a vector. Exclude any blocks in \p ExludeBlocks. -/// -/// \param [in] F - The function we collect the instructions from. -/// \param [in] ExcludeBlocks - BasicBlocks to ignore. -/// \returns the list of instructions extracted. -static std::vector<Instruction *> -collectRelevantInstructions(Function &F, - DenseSet<BasicBlock *> &ExcludeBlocks) { - std::vector<Instruction *> RelevantInstructions; - - for (BasicBlock &BB : F) { - if (ExcludeBlocks.contains(&BB)) - continue; - - for (Instruction &Inst : BB) { - if (Inst.isLifetimeStartOrEnd()) - continue; - if (isa<DbgInfoIntrinsic>(Inst)) - continue; - - RelevantInstructions.push_back(&Inst); - } - } - - return RelevantInstructions; -} - /// It is possible that there is a basic block that already performs the same /// stores. This returns a duplicate block, if it exists /// -/// \param OutputBB [in] the block we are looking for a duplicate of. +/// \param OutputBBs [in] the blocks we are looking for a duplicate of. /// \param OutputStoreBBs [in] The existing output blocks. /// \returns an optional value with the number output block if there is a match. -Optional<unsigned> -findDuplicateOutputBlock(BasicBlock *OutputBB, - ArrayRef<BasicBlock *> OutputStoreBBs) { +Optional<unsigned> findDuplicateOutputBlock( + DenseMap<Value *, BasicBlock *> &OutputBBs, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) { - bool WrongInst = false; - bool WrongSize = false; + bool Mismatch = false; unsigned MatchingNum = 0; - for (BasicBlock *CompBB : OutputStoreBBs) { - WrongInst = false; - if (CompBB->size() - 1 != OutputBB->size()) { - WrongSize = true; - MatchingNum++; - continue; - } - - WrongSize = false; - BasicBlock::iterator NIt = OutputBB->begin(); - for (Instruction &I : *CompBB) { - if (isa<BranchInst>(&I)) - continue; + // We compare the new set output blocks to the other sets of output blocks. + // If they are the same number, and have identical instructions, they are + // considered to be the same. 
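
As the comment just above says, findDuplicateOutputBlock now compares whole sets of output blocks keyed by return value: two schemes match only when every key appears in both and the store sequences behind matching keys are identical. A reduced standalone model of that comparison with standard containers (hypothetical instruction strings, not the isIdenticalTo walk):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

using OutputScheme = std::map<int, std::vector<std::string>>;

// Two schemes are duplicates when they have the same keys and, per key,
// identical store sequences.
static bool sameScheme(const OutputScheme &A, const OutputScheme &B) {
  if (A.size() != B.size())
    return false;
  for (const auto &KV : A) {
    auto It = B.find(KV.first);
    if (It == B.end() || It->second != KV.second)
      return false;
  }
  return true;
}

// Return the index of an existing matching scheme, or -1 to signal "new".
static int findDuplicateScheme(const OutputScheme &New,
                               const std::vector<OutputScheme> &Existing) {
  for (size_t I = 0; I < Existing.size(); ++I)
    if (sameScheme(New, Existing[I]))
      return static_cast<int>(I);
  return -1;
}

int main() {
  std::vector<OutputScheme> Existing = {
      {{0, {"store a"}}, {1, {"store b"}}}};
  OutputScheme Duplicate = {{0, {"store a"}}, {1, {"store b"}}};
  OutputScheme Fresh = {{0, {"store c"}}};
  std::printf("%d %d\n", findDuplicateScheme(Duplicate, Existing),
              findDuplicateScheme(Fresh, Existing)); // prints: 0 -1
  return 0;
}
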
+ for (DenseMap<Value *, BasicBlock *> &CompBBs : OutputStoreBBs) { + Mismatch = false; + for (std::pair<Value *, BasicBlock *> &VToB : CompBBs) { + DenseMap<Value *, BasicBlock *>::iterator OutputBBIt = + OutputBBs.find(VToB.first); + if (OutputBBIt == OutputBBs.end()) { + Mismatch = true; + break; + } - if (!I.isIdenticalTo(&(*NIt))) { - WrongInst = true; + BasicBlock *CompBB = VToB.second; + BasicBlock *OutputBB = OutputBBIt->second; + if (CompBB->size() - 1 != OutputBB->size()) { + Mismatch = true; break; } - NIt++; + BasicBlock::iterator NIt = OutputBB->begin(); + for (Instruction &I : *CompBB) { + if (isa<BranchInst>(&I)) + continue; + + if (!I.isIdenticalTo(&(*NIt))) { + Mismatch = true; + break; + } + + NIt++; + } } - if (!WrongInst && !WrongSize) + + if (!Mismatch) return MatchingNum; MatchingNum++; @@ -1061,95 +1251,130 @@ findDuplicateOutputBlock(BasicBlock *OutputBB, return None; } +/// Remove empty output blocks from the outlined region. +/// +/// \param BlocksToPrune - Mapping of return values output blocks for the \p +/// Region. +/// \param Region - The OutlinableRegion we are analyzing. +static bool +analyzeAndPruneOutputBlocks(DenseMap<Value *, BasicBlock *> &BlocksToPrune, + OutlinableRegion &Region) { + bool AllRemoved = true; + Value *RetValueForBB; + BasicBlock *NewBB; + SmallVector<Value *, 4> ToRemove; + // Iterate over the output blocks created in the outlined section. + for (std::pair<Value *, BasicBlock *> &VtoBB : BlocksToPrune) { + RetValueForBB = VtoBB.first; + NewBB = VtoBB.second; + + // If there are no instructions, we remove it from the module, and also + // mark the value for removal from the return value to output block mapping. + if (NewBB->size() == 0) { + NewBB->eraseFromParent(); + ToRemove.push_back(RetValueForBB); + continue; + } + + // Mark that we could not remove all the blocks since they were not all + // empty. + AllRemoved = false; + } + + // Remove the return value from the mapping. + for (Value *V : ToRemove) + BlocksToPrune.erase(V); + + // Mark the region as having the no output scheme. + if (AllRemoved) + Region.OutputBlockNum = -1; + + return AllRemoved; +} + /// For the outlined section, move needed the StoreInsts for the output /// registers into their own block. Then, determine if there is a duplicate /// output block already created. /// /// \param [in] OG - The OutlinableGroup of regions to be outlined. /// \param [in] Region - The OutlinableRegion that is being analyzed. -/// \param [in,out] OutputBB - the block that stores for this region will be +/// \param [in,out] OutputBBs - the blocks that stores for this region will be /// placed in. -/// \param [in] EndBB - the final block of the extracted function. +/// \param [in] EndBBs - the final blocks of the extracted function. /// \param [in] OutputMappings - OutputMappings the mapping of values that have /// been replaced by a new output value. /// \param [in,out] OutputStoreBBs - The existing output blocks. -static void -alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region, - BasicBlock *OutputBB, BasicBlock *EndBB, - const DenseMap<Value *, Value *> &OutputMappings, - std::vector<BasicBlock *> &OutputStoreBBs) { - DenseSet<unsigned> ValuesToFind(Region.GVNStores.begin(), - Region.GVNStores.end()); - - // We iterate over the instructions in the extracted function, and find the - // global value number of the instructions. 
If we find a value that should - // be contained in a store, we replace the uses of the value with the value - // from the overall function, so that the store is storing the correct - // value from the overall function. - DenseSet<BasicBlock *> ExcludeBBs(OutputStoreBBs.begin(), - OutputStoreBBs.end()); - ExcludeBBs.insert(OutputBB); - std::vector<Instruction *> ExtractedFunctionInsts = - collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs); - std::vector<Instruction *> OverallFunctionInsts = - collectRelevantInstructions(*OG.OutlinedFunction, ExcludeBBs); - - assert(ExtractedFunctionInsts.size() == OverallFunctionInsts.size() && - "Number of relevant instructions not equal!"); - - unsigned NumInstructions = ExtractedFunctionInsts.size(); - for (unsigned Idx = 0; Idx < NumInstructions; Idx++) { - Value *V = ExtractedFunctionInsts[Idx]; - - if (OutputMappings.find(V) != OutputMappings.end()) - V = OutputMappings.find(V)->second; - Optional<unsigned> GVN = Region.Candidate->getGVN(V); - - // If we have found one of the stored values for output, replace the value - // with the corresponding one from the overall function. - if (GVN.hasValue() && ValuesToFind.erase(GVN.getValue())) { - V->replaceAllUsesWith(OverallFunctionInsts[Idx]); - if (ValuesToFind.size() == 0) - break; - } - - if (ValuesToFind.size() == 0) - break; - } - - assert(ValuesToFind.size() == 0 && "Not all store values were handled!"); - - // If the size of the block is 0, then there are no stores, and we do not - // need to save this block. - if (OutputBB->size() == 0) { - Region.OutputBlockNum = -1; - OutputBB->eraseFromParent(); +static void alignOutputBlockWithAggFunc( + OutlinableGroup &OG, OutlinableRegion &Region, + DenseMap<Value *, BasicBlock *> &OutputBBs, + DenseMap<Value *, BasicBlock *> &EndBBs, + const DenseMap<Value *, Value *> &OutputMappings, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) { + // If none of the output blocks have any instructions, this means that we do + // not have to determine if it matches any of the other output schemes, and we + // don't have to do anything else. + if (analyzeAndPruneOutputBlocks(OutputBBs, Region)) return; - } - // Determine is there is a duplicate block. + // Determine is there is a duplicate set of blocks. Optional<unsigned> MatchingBB = - findDuplicateOutputBlock(OutputBB, OutputStoreBBs); + findDuplicateOutputBlock(OutputBBs, OutputStoreBBs); - // If there is, we remove the new output block. If it does not, - // we add it to our list of output blocks. + // If there is, we remove the new output blocks. If it does not, + // we add it to our list of sets of output blocks. 
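
The OutputBlockNum recorded below is the index that later selects this region's stores in the switch built by createSwitchStatement. A condensed restatement of the two outcomes, using the surrounding function's variables (DuplicateIdx is a hypothetical stand-in for the result of findDuplicateOutputBlock, and the patch additionally wires every newly kept block to its end block with a branch):

    if (DuplicateIdx) {
      // Identical stores already exist: reuse that scheme, drop the new blocks.
      Region.OutputBlockNum = *DuplicateIdx;
      for (std::pair<Value *, BasicBlock *> &VToBB : OutputBBs)
        VToBB.second->eraseFromParent();
    } else {
      // New scheme: it gets the next index and its blocks are recorded.
      Region.OutputBlockNum = OutputStoreBBs.size();
      OutputStoreBBs.push_back(OutputBBs);
    }
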
if (MatchingBB.hasValue()) { LLVM_DEBUG(dbgs() << "Set output block for region in function" << Region.ExtractedFunction << " to " << MatchingBB.getValue()); Region.OutputBlockNum = MatchingBB.getValue(); - OutputBB->eraseFromParent(); + for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) + VtoBB.second->eraseFromParent(); return; } Region.OutputBlockNum = OutputStoreBBs.size(); - LLVM_DEBUG(dbgs() << "Create output block for region in" - << Region.ExtractedFunction << " to " - << *OutputBB); - OutputStoreBBs.push_back(OutputBB); - BranchInst::Create(EndBB, OutputBB); + Value *RetValueForBB; + BasicBlock *NewBB; + OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>()); + for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) { + RetValueForBB = VtoBB.first; + NewBB = VtoBB.second; + DenseMap<Value *, BasicBlock *>::iterator VBBIt = + EndBBs.find(RetValueForBB); + LLVM_DEBUG(dbgs() << "Create output block for region in" + << Region.ExtractedFunction << " to " + << *NewBB); + BranchInst::Create(VBBIt->second, NewBB); + OutputStoreBBs.back().insert(std::make_pair(RetValueForBB, NewBB)); + } +} + +/// Takes in a mapping, \p OldMap of ConstantValues to BasicBlocks, sorts keys, +/// before creating a basic block for each \p NewMap, and inserting into the new +/// block. Each BasicBlock is named with the scheme "<basename>_<key_idx>". +/// +/// \param OldMap [in] - The mapping to base the new mapping off of. +/// \param NewMap [out] - The output mapping using the keys of \p OldMap. +/// \param ParentFunc [in] - The function to put the new basic block in. +/// \param BaseName [in] - The start of the BasicBlock names to be appended to +/// by an index value. +static void createAndInsertBasicBlocks(DenseMap<Value *, BasicBlock *> &OldMap, + DenseMap<Value *, BasicBlock *> &NewMap, + Function *ParentFunc, Twine BaseName) { + unsigned Idx = 0; + std::vector<Value *> SortedKeys; + + getSortedConstantKeys(SortedKeys, OldMap); + + for (Value *RetVal : SortedKeys) { + BasicBlock *NewBB = BasicBlock::Create( + ParentFunc->getContext(), + Twine(BaseName) + Twine("_") + Twine(static_cast<unsigned>(Idx++)), + ParentFunc); + NewMap.insert(std::make_pair(RetVal, NewBB)); + } } /// Create the switch statement for outlined function to differentiate between @@ -1159,50 +1384,74 @@ alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region, /// matches the needed stores for the extracted section. /// \param [in] M - The module we are outlining from. /// \param [in] OG - The group of regions to be outlined. -/// \param [in] EndBB - The final block of the extracted function. +/// \param [in] EndBBs - The final blocks of the extracted function. /// \param [in,out] OutputStoreBBs - The existing output blocks. -void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB, - ArrayRef<BasicBlock *> OutputStoreBBs) { +void createSwitchStatement( + Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) { // We only need the switch statement if there is more than one store // combination. 
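
For intuition, the aggregate function this builds behaves like the following hand-written C++ analogue: every deduplicated region calls one shared body, and a trailing selector argument picks which set of output stores runs (all names and the arithmetic are purely illustrative):

    // Rough analogue of the generated aggregate function: the shared,
    // outlined computation runs once, and the switch on the final parameter
    // replays only the stores that the chosen output scheme needs.
    void outlined_aggregate(int A, int B, int *Out0, int *Out1, int Scheme) {
      int Sum = A + B;      // the outlined, shared computation
      switch (Scheme) {
      case 0:               // output scheme 0: one live output
        *Out0 = Sum;
        break;
      case 1:               // output scheme 1: two live outputs
        *Out0 = Sum;
        *Out1 = Sum;
        break;
      }
    }
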
if (OG.OutputGVNCombinations.size() > 1) { Function *AggFunc = OG.OutlinedFunction; - // Create a final block - BasicBlock *ReturnBlock = - BasicBlock::Create(M.getContext(), "final_block", AggFunc); - Instruction *Term = EndBB->getTerminator(); - Term->moveBefore(*ReturnBlock, ReturnBlock->end()); - // Put the switch statement in the old end basic block for the function with - // a fall through to the new return block - LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " - << OutputStoreBBs.size() << "\n"); - SwitchInst *SwitchI = - SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), - ReturnBlock, OutputStoreBBs.size(), EndBB); - - unsigned Idx = 0; - for (BasicBlock *BB : OutputStoreBBs) { - SwitchI->addCase(ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), - BB); - Term = BB->getTerminator(); - Term->setSuccessor(0, ReturnBlock); - Idx++; + // Create a final block for each different return block. + DenseMap<Value *, BasicBlock *> ReturnBBs; + createAndInsertBasicBlocks(OG.EndBBs, ReturnBBs, AggFunc, "final_block"); + + for (std::pair<Value *, BasicBlock *> &RetBlockPair : ReturnBBs) { + std::pair<Value *, BasicBlock *> &OutputBlock = + *OG.EndBBs.find(RetBlockPair.first); + BasicBlock *ReturnBlock = RetBlockPair.second; + BasicBlock *EndBB = OutputBlock.second; + Instruction *Term = EndBB->getTerminator(); + // Move the return value to the final block instead of the original exit + // stub. + Term->moveBefore(*ReturnBlock, ReturnBlock->end()); + // Put the switch statement in the old end basic block for the function + // with a fall through to the new return block. + LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " + << OutputStoreBBs.size() << "\n"); + SwitchInst *SwitchI = + SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), + ReturnBlock, OutputStoreBBs.size(), EndBB); + + unsigned Idx = 0; + for (DenseMap<Value *, BasicBlock *> &OutputStoreBB : OutputStoreBBs) { + DenseMap<Value *, BasicBlock *>::iterator OSBBIt = + OutputStoreBB.find(OutputBlock.first); + + if (OSBBIt == OutputStoreBB.end()) + continue; + + BasicBlock *BB = OSBBIt->second; + SwitchI->addCase( + ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), BB); + Term = BB->getTerminator(); + Term->setSuccessor(0, ReturnBlock); + Idx++; + } } return; } - // If there needs to be stores, move them from the output block to the end - // block to save on branching instructions. + // If there needs to be stores, move them from the output blocks to their + // corresponding ending block. 
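
When only one store scheme exists, routing through a separate output block buys nothing, so its contents are spliced straight into the ending block. A minimal sketch of that splice for one pair of blocks, in this file's context (moveBBContents is the helper already used here; the standalone wrapper is illustrative):

    // Fold a lone output block into its ending block: drop the branch that
    // linked them, splice the stores over, and keep the ending block's original
    // terminator as the last instruction.
    static void foldOutputIntoEnd(BasicBlock &OutputBB, BasicBlock &EndBB) {
      OutputBB.getTerminator()->eraseFromParent();  // branch to EndBB, now dead
      Instruction *EndTerm = EndBB.getTerminator();
      moveBBContents(OutputBB, EndBB);              // stores land after EndTerm
      EndTerm->moveBefore(EndBB, EndBB.end());      // restore EndTerm to the end
      OutputBB.eraseFromParent();
    }
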
if (OutputStoreBBs.size() == 1) { LLVM_DEBUG(dbgs() << "Move store instructions to the end block in " << *OG.OutlinedFunction << "\n"); - BasicBlock *OutputBlock = OutputStoreBBs[0]; - Instruction *Term = OutputBlock->getTerminator(); - Term->eraseFromParent(); - Term = EndBB->getTerminator(); - moveBBContents(*OutputBlock, *EndBB); - Term->moveBefore(*EndBB, EndBB->end()); - OutputBlock->eraseFromParent(); + DenseMap<Value *, BasicBlock *> OutputBlocks = OutputStoreBBs[0]; + for (std::pair<Value *, BasicBlock *> &VBPair : OutputBlocks) { + DenseMap<Value *, BasicBlock *>::iterator EndBBIt = + EndBBs.find(VBPair.first); + assert(EndBBIt != EndBBs.end() && "Could not find end block"); + BasicBlock *EndBB = EndBBIt->second; + BasicBlock *OutputBB = VBPair.second; + Instruction *Term = OutputBB->getTerminator(); + Term->eraseFromParent(); + Term = EndBB->getTerminator(); + moveBBContents(*OutputBB, *EndBB); + Term->moveBefore(*EndBB, EndBB->end()); + OutputBB->eraseFromParent(); + } } } @@ -1217,42 +1466,44 @@ void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB, /// set of stores needed for the different functions. /// \param [in,out] FuncsToRemove - Extracted functions to erase from module /// once outlining is complete. -static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup, - std::vector<BasicBlock *> &OutputStoreBBs, - std::vector<Function *> &FuncsToRemove) { +static void fillOverallFunction( + Module &M, OutlinableGroup &CurrentGroup, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs, + std::vector<Function *> &FuncsToRemove) { OutlinableRegion *CurrentOS = CurrentGroup.Regions[0]; // Move first extracted function's instructions into new function. LLVM_DEBUG(dbgs() << "Move instructions from " << *CurrentOS->ExtractedFunction << " to instruction " << *CurrentGroup.OutlinedFunction << "\n"); - - CurrentGroup.EndBB = moveFunctionData(*CurrentOS->ExtractedFunction, - *CurrentGroup.OutlinedFunction); + moveFunctionData(*CurrentOS->ExtractedFunction, + *CurrentGroup.OutlinedFunction, CurrentGroup.EndBBs); // Transfer the attributes from the function to the new function. - for (Attribute A : - CurrentOS->ExtractedFunction->getAttributes().getFnAttributes()) + for (Attribute A : CurrentOS->ExtractedFunction->getAttributes().getFnAttrs()) CurrentGroup.OutlinedFunction->addFnAttr(A); - // Create an output block for the first extracted function. - BasicBlock *NewBB = BasicBlock::Create( - M.getContext(), Twine("output_block_") + Twine(static_cast<unsigned>(0)), - CurrentGroup.OutlinedFunction); + // Create a new set of output blocks for the first extracted function. + DenseMap<Value *, BasicBlock *> NewBBs; + createAndInsertBasicBlocks(CurrentGroup.EndBBs, NewBBs, + CurrentGroup.OutlinedFunction, "output_block_0"); CurrentOS->OutputBlockNum = 0; - replaceArgumentUses(*CurrentOS, NewBB); + replaceArgumentUses(*CurrentOS, NewBBs, true); replaceConstants(*CurrentOS); - // If the new basic block has no new stores, we can erase it from the module. - // It it does, we create a branch instruction to the last basic block from the - // new one. - if (NewBB->size() == 0) { - CurrentOS->OutputBlockNum = -1; - NewBB->eraseFromParent(); - } else { - BranchInst::Create(CurrentGroup.EndBB, NewBB); - OutputStoreBBs.push_back(NewBB); + // We first identify if any output blocks are empty, if they are we remove + // them. We then create a branch instruction to the basic block to the return + // block for the function for each non empty output block. 
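
analyzeAndPruneOutputBlocks, defined earlier in this patch, both erases the empty blocks and trims their entries out of the map, so its return value doubles as "nothing left to store". A minimal usage sketch of that contract, in the surrounding function's context:

    // After the call, NewBBs holds only blocks that still contain stores. A
    // true return means every block was empty and erased, and the region was
    // switched to the no-output scheme (OutputBlockNum of -1).
    bool NothingToStore = analyzeAndPruneOutputBlocks(NewBBs, *CurrentOS);
    if (NothingToStore)
      assert(NewBBs.empty() && "pruning removed every output block");
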
+ if (!analyzeAndPruneOutputBlocks(NewBBs, *CurrentOS)) { + OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>()); + for (std::pair<Value *, BasicBlock *> &VToBB : NewBBs) { + DenseMap<Value *, BasicBlock *>::iterator VBBIt = + CurrentGroup.EndBBs.find(VToBB.first); + BasicBlock *EndBB = VBBIt->second; + BranchInst::Create(EndBB, VToBB.second); + OutputStoreBBs.back().insert(VToBB); + } } // Replace the call to the extracted function with the outlined function. @@ -1268,25 +1519,28 @@ void IROutliner::deduplicateExtractedSections( std::vector<Function *> &FuncsToRemove, unsigned &OutlinedFunctionNum) { createFunction(M, CurrentGroup, OutlinedFunctionNum); - std::vector<BasicBlock *> OutputStoreBBs; + std::vector<DenseMap<Value *, BasicBlock *>> OutputStoreBBs; OutlinableRegion *CurrentOS; fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove); + std::vector<Value *> SortedKeys; for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) { CurrentOS = CurrentGroup.Regions[Idx]; AttributeFuncs::mergeAttributesForOutlining(*CurrentGroup.OutlinedFunction, *CurrentOS->ExtractedFunction); - // Create a new BasicBlock to hold the needed store instructions. - BasicBlock *NewBB = BasicBlock::Create( - M.getContext(), "output_block_" + std::to_string(Idx), - CurrentGroup.OutlinedFunction); - replaceArgumentUses(*CurrentOS, NewBB); + // Create a set of BasicBlocks, one for each return block, to hold the + // needed store instructions. + DenseMap<Value *, BasicBlock *> NewBBs; + createAndInsertBasicBlocks( + CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, + "output_block_" + Twine(static_cast<unsigned>(Idx))); - alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBB, - CurrentGroup.EndBB, OutputMappings, + replaceArgumentUses(*CurrentOS, NewBBs); + alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, + CurrentGroup.EndBBs, OutputMappings, OutputStoreBBs); CurrentOS->Call = replaceCalledFunction(M, *CurrentOS); @@ -1294,11 +1548,78 @@ void IROutliner::deduplicateExtractedSections( } // Create a switch statement to handle the different output schemes. - createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBB, OutputStoreBBs); + createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBBs, OutputStoreBBs); OutlinedFunctionNum++; } +/// Checks that the next instruction in the InstructionDataList matches the +/// next instruction in the module. If they do not, there could be the +/// possibility that extra code has been inserted, and we must ignore it. +/// +/// \param ID - The IRInstructionData to check the next instruction of. +/// \returns true if the InstructionDataList and actual instruction match. +static bool nextIRInstructionDataMatchesNextInst(IRInstructionData &ID) { + // We check if there is a discrepancy between the InstructionDataList + // and the actual next instruction in the module. If there is, it means + // that an extra instruction was added, likely by the CodeExtractor. + + // Since we do not have any similarity data about this particular + // instruction, we cannot confidently outline it, and must discard this + // candidate. 
+ IRInstructionDataList::iterator NextIDIt = std::next(ID.getIterator()); + Instruction *NextIDLInst = NextIDIt->Inst; + Instruction *NextModuleInst = nullptr; + if (!ID.Inst->isTerminator()) + NextModuleInst = ID.Inst->getNextNonDebugInstruction(); + else if (NextIDLInst != nullptr) + NextModuleInst = + &*NextIDIt->Inst->getParent()->instructionsWithoutDebug().begin(); + + if (NextIDLInst && NextIDLInst != NextModuleInst) + return false; + + return true; +} + +bool IROutliner::isCompatibleWithAlreadyOutlinedCode( + const OutlinableRegion &Region) { + IRSimilarityCandidate *IRSC = Region.Candidate; + unsigned StartIdx = IRSC->getStartIdx(); + unsigned EndIdx = IRSC->getEndIdx(); + + // A check to make sure that we are not about to attempt to outline something + // that has already been outlined. + for (unsigned Idx = StartIdx; Idx <= EndIdx; Idx++) + if (Outlined.contains(Idx)) + return false; + + // We check if the recorded instruction matches the actual next instruction, + // if it does not, we fix it in the InstructionDataList. + if (!Region.Candidate->backInstruction()->isTerminator()) { + Instruction *NewEndInst = + Region.Candidate->backInstruction()->getNextNonDebugInstruction(); + assert(NewEndInst && "Next instruction is a nullptr?"); + if (Region.Candidate->end()->Inst != NewEndInst) { + IRInstructionDataList *IDL = Region.Candidate->front()->IDL; + IRInstructionData *NewEndIRID = new (InstDataAllocator.Allocate()) + IRInstructionData(*NewEndInst, + InstructionClassifier.visit(*NewEndInst), *IDL); + + // Insert the first IRInstructionData of the new region after the + // last IRInstructionData of the IRSimilarityCandidate. + IDL->insert(Region.Candidate->end(), *NewEndIRID); + } + } + + return none_of(*IRSC, [this](IRInstructionData &ID) { + if (!nextIRInstructionDataMatchesNextInst(ID)) + return true; + + return !this->InstructionClassifier.visit(ID.Inst); + }); +} + void IROutliner::pruneIncompatibleRegions( std::vector<IRSimilarityCandidate> &CandidateVec, OutlinableGroup &CurrentGroup) { @@ -1310,6 +1631,15 @@ void IROutliner::pruneIncompatibleRegions( return LHS.getStartIdx() < RHS.getStartIdx(); }); + IRSimilarityCandidate &FirstCandidate = CandidateVec[0]; + // Since outlining a call and a branch instruction will be the same as only + // outlinining a call instruction, we ignore it as a space saving. + if (FirstCandidate.getLength() == 2) { + if (isa<CallInst>(FirstCandidate.front()->Inst) && + isa<BranchInst>(FirstCandidate.back()->Inst)) + return; + } + unsigned CurrentEndIdx = 0; for (IRSimilarityCandidate &IRSC : CandidateVec) { PreviouslyOutlined = false; @@ -1325,9 +1655,13 @@ void IROutliner::pruneIncompatibleRegions( if (PreviouslyOutlined) continue; - // TODO: If in the future we can outline across BasicBlocks, we will need to - // check all BasicBlocks contained in the region. - if (IRSC.getStartBB()->hasAddressTaken()) + // Check over the instructions, and if the basic block has its address + // taken for use somewhere else, we do not outline that block. + bool BBHasAddressTaken = any_of(IRSC, [](IRInstructionData &ID){ + return ID.Inst->getParent()->hasAddressTaken(); + }); + + if (BBHasAddressTaken) continue; if (IRSC.front()->Inst->getFunction()->hasLinkOnceODRLinkage() && @@ -1340,16 +1674,9 @@ void IROutliner::pruneIncompatibleRegions( continue; bool BadInst = any_of(IRSC, [this](IRInstructionData &ID) { - // We check if there is a discrepancy between the InstructionDataList - // and the actual next instruction in the module. 
If there is, it means - // that an extra instruction was added, likely by the CodeExtractor. - - // Since we do not have any similarity data about this particular - // instruction, we cannot confidently outline it, and must discard this - // candidate. - if (std::next(ID.getIterator())->Inst != - ID.Inst->getNextNonDebugInstruction()) + if (!nextIRInstructionDataMatchesNextInst(ID)) return true; + return !this->InstructionClassifier.visit(ID.Inst); }); @@ -1416,10 +1743,33 @@ static InstructionCost findCostForOutputBlocks(Module &M, OutlinableGroup &CurrentGroup, TargetTransformInfo &TTI) { InstructionCost OutputCost = 0; + unsigned NumOutputBranches = 0; + + IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; + DenseSet<BasicBlock *> CandidateBlocks; + Candidate.getBasicBlocks(CandidateBlocks); + + // Count the number of different output branches that point to blocks outside + // of the region. + DenseSet<BasicBlock *> FoundBlocks; + for (IRInstructionData &ID : Candidate) { + if (!isa<BranchInst>(ID.Inst)) + continue; + + for (Value *V : ID.OperVals) { + BasicBlock *BB = static_cast<BasicBlock *>(V); + DenseSet<BasicBlock *>::iterator CBIt = CandidateBlocks.find(BB); + if (CBIt != CandidateBlocks.end() || FoundBlocks.contains(BB)) + continue; + FoundBlocks.insert(BB); + NumOutputBranches++; + } + } + + CurrentGroup.BranchesToOutside = NumOutputBranches; for (const ArrayRef<unsigned> &OutputUse : CurrentGroup.OutputGVNCombinations) { - IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; for (unsigned GVN : OutputUse) { Optional<Value *> OV = Candidate.fromGVN(GVN); assert(OV.hasValue() && "Could not find value for GVN?"); @@ -1434,14 +1784,14 @@ static InstructionCost findCostForOutputBlocks(Module &M, LLVM_DEBUG(dbgs() << "Adding: " << StoreCost << " instructions to cost for output of type " << *V->getType() << "\n"); - OutputCost += StoreCost; + OutputCost += StoreCost * NumOutputBranches; } InstructionCost BranchCost = TTI.getCFInstrCost(Instruction::Br, TargetTransformInfo::TCK_CodeSize); LLVM_DEBUG(dbgs() << "Adding " << BranchCost << " to the current cost for" << " a branch instruction\n"); - OutputCost += BranchCost; + OutputCost += BranchCost * NumOutputBranches; } // If there is more than one output scheme, we must have a comparison and @@ -1460,7 +1810,7 @@ static InstructionCost findCostForOutputBlocks(Module &M, LLVM_DEBUG(dbgs() << "Adding: " << TotalCost << " instructions for each switch case for each different" << " output path in a function\n"); - OutputCost += TotalCost; + OutputCost += TotalCost * NumOutputBranches; } return OutputCost; @@ -1548,13 +1898,12 @@ void IROutliner::updateOutputMapping(OutlinableRegion &Region, bool IROutliner::extractSection(OutlinableRegion &Region) { SetVector<Value *> ArgInputs, Outputs, SinkCands; - Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands); - assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!"); - assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!"); + BasicBlock *InitialStart = Region.StartBB; Function *OrigF = Region.StartBB->getParent(); CodeExtractorAnalysisCache CEAC(*OrigF); - Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC); + Region.ExtractedFunction = + Region.CE->extractCodeRegion(CEAC, ArgInputs, Outputs); // If the extraction was successful, find the BasicBlock, and reassign the // OutlinableRegion blocks @@ -1565,7 +1914,23 @@ bool IROutliner::extractSection(OutlinableRegion &Region) { return false; } - BasicBlock 
*RewrittenBB = Region.FollowBB->getSinglePredecessor(); + // Get the block containing the called branch, and reassign the blocks as + // necessary. If the original block still exists, it is because we ended on + // a branch instruction, and so we move the contents into the block before + // and assign the previous block correctly. + User *InstAsUser = Region.ExtractedFunction->user_back(); + BasicBlock *RewrittenBB = cast<Instruction>(InstAsUser)->getParent(); + Region.PrevBB = RewrittenBB->getSinglePredecessor(); + assert(Region.PrevBB && "PrevBB is nullptr?"); + if (Region.PrevBB == InitialStart) { + BasicBlock *NewPrev = InitialStart->getSinglePredecessor(); + Instruction *BI = NewPrev->getTerminator(); + BI->eraseFromParent(); + moveBBContents(*InitialStart, *NewPrev); + Region.PrevBB = NewPrev; + InitialStart->eraseFromParent(); + } + Region.StartBB = RewrittenBB; Region.EndBB = RewrittenBB; @@ -1608,6 +1973,7 @@ bool IROutliner::extractSection(OutlinableRegion &Region) { unsigned IROutliner::doOutline(Module &M) { // Find the possible similarity sections. + InstructionClassifier.EnableBranches = !DisableBranches; IRSimilarityIdentifier &Identifier = getIRSI(M); SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity(); @@ -1622,12 +1988,17 @@ unsigned IROutliner::doOutline(Module &M) { return LHS[0].getLength() * LHS.size() > RHS[0].getLength() * RHS.size(); }); + // Creating OutlinableGroups for each SimilarityCandidate to be used in + // each of the following for loops to avoid making an allocator. + std::vector<OutlinableGroup> PotentialGroups(SimilarityCandidates.size()); DenseSet<unsigned> NotSame; - std::vector<Function *> FuncsToRemove; + std::vector<OutlinableGroup *> NegativeCostGroups; + std::vector<OutlinableRegion *> OutlinedRegions; // Iterate over the possible sets of similarity. + unsigned PotentialGroupIdx = 0; for (SimilarityGroup &CandidateVec : SimilarityCandidates) { - OutlinableGroup CurrentGroup; + OutlinableGroup &CurrentGroup = PotentialGroups[PotentialGroupIdx++]; // Remove entries that were previously outlined pruneIncompatibleRegions(CandidateVec, CurrentGroup); @@ -1649,20 +2020,31 @@ unsigned IROutliner::doOutline(Module &M) { // Create a CodeExtractor for each outlinable region. Identify inputs and // outputs for each section using the code extractor and create the argument // types for the Aggregate Outlining Function. - std::vector<OutlinableRegion *> OutlinedRegions; + OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { // Break the outlinable region out of its parent BasicBlock into its own // BasicBlocks (see function implementation). OS->splitCandidate(); - std::vector<BasicBlock *> BE = {OS->StartBB}; + + // There's a chance that when the region is split, extra instructions are + // added to the region. This makes the region no longer viable + // to be split, so we ignore it for outlining. + if (!OS->CandidateSplit) + continue; + + SmallVector<BasicBlock *> BE; + DenseSet<BasicBlock *> BBSet; + OS->Candidate->getBasicBlocks(BBSet, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); findAddInputsOutputs(M, *OS, NotSame); if (!OS->IgnoreRegion) OutlinedRegions.push_back(OS); - else - OS->reattachCandidate(); + + // We recombine the blocks together now that we have gathered all the + // needed information. 
+ OS->reattachCandidate(); } CurrentGroup.Regions = std::move(OutlinedRegions); @@ -1675,12 +2057,11 @@ unsigned IROutliner::doOutline(Module &M) { if (CostModel) findCostBenefit(M, CurrentGroup); - // If we are adhering to the cost model, reattach all the candidates + // If we are adhering to the cost model, skip those groups where the cost + // outweighs the benefits. if (CurrentGroup.Cost >= CurrentGroup.Benefit && CostModel) { - for (OutlinableRegion *OS : CurrentGroup.Regions) - OS->reattachCandidate(); - OptimizationRemarkEmitter &ORE = getORE( - *CurrentGroup.Regions[0]->Candidate->getFunction()); + OptimizationRemarkEmitter &ORE = + getORE(*CurrentGroup.Regions[0]->Candidate->getFunction()); ORE.emit([&]() { IRSimilarityCandidate *C = CurrentGroup.Regions[0]->Candidate; OptimizationRemarkMissed R(DEBUG_TYPE, "WouldNotDecreaseSize", @@ -1704,12 +2085,70 @@ unsigned IROutliner::doOutline(Module &M) { continue; } + NegativeCostGroups.push_back(&CurrentGroup); + } + + ExtractorAllocator.DestroyAll(); + + if (NegativeCostGroups.size() > 1) + stable_sort(NegativeCostGroups, + [](const OutlinableGroup *LHS, const OutlinableGroup *RHS) { + return LHS->Benefit - LHS->Cost > RHS->Benefit - RHS->Cost; + }); + + std::vector<Function *> FuncsToRemove; + for (OutlinableGroup *CG : NegativeCostGroups) { + OutlinableGroup &CurrentGroup = *CG; + + OutlinedRegions.clear(); + for (OutlinableRegion *Region : CurrentGroup.Regions) { + // We check whether our region is compatible with what has already been + // outlined, and whether we need to ignore this item. + if (!isCompatibleWithAlreadyOutlinedCode(*Region)) + continue; + OutlinedRegions.push_back(Region); + } + + if (OutlinedRegions.size() < 2) + continue; + + // Reestimate the cost and benefit of the OutlinableGroup. Continue only if + // we are still outlining enough regions to make up for the added cost. + CurrentGroup.Regions = std::move(OutlinedRegions); + if (CostModel) { + CurrentGroup.Benefit = 0; + CurrentGroup.Cost = 0; + findCostBenefit(M, CurrentGroup); + if (CurrentGroup.Cost >= CurrentGroup.Benefit) + continue; + } + OutlinedRegions.clear(); + for (OutlinableRegion *Region : CurrentGroup.Regions) { + Region->splitCandidate(); + if (!Region->CandidateSplit) + continue; + OutlinedRegions.push_back(Region); + } + + CurrentGroup.Regions = std::move(OutlinedRegions); + if (CurrentGroup.Regions.size() < 2) { + for (OutlinableRegion *R : CurrentGroup.Regions) + R->reattachCandidate(); + continue; + } + LLVM_DEBUG(dbgs() << "Outlining regions with cost " << CurrentGroup.Cost << " and benefit " << CurrentGroup.Benefit << "\n"); // Create functions out of all the sections, and mark them as outlined. 
OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { + SmallVector<BasicBlock *> BE; + DenseSet<BasicBlock *> BBSet; + OS->Candidate->getBasicBlocks(BBSet, BE); + OS->CE = new (ExtractorAllocator.Allocate()) + CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, + false, "outlined"); bool FunctionOutlined = extractSection(*OS); if (FunctionOutlined) { unsigned StartIdx = OS->Candidate->getStartIdx(); @@ -1767,6 +2206,7 @@ bool IROutliner::run(Module &M) { } // Pass Manager Boilerplate +namespace { class IROutlinerLegacyPass : public ModulePass { public: static char ID; @@ -1782,6 +2222,7 @@ public: bool runOnModule(Module &M) override; }; +} // namespace bool IROutlinerLegacyPass::runOnModule(Module &M) { if (skipModule(M)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp index 59260af88832..992c2b292e1e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp @@ -31,9 +31,11 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InlineOrder.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/ReplayInlineAdvisor.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h" @@ -96,9 +98,53 @@ static cl::opt<std::string> CGSCCInlineReplayFile( "cgscc-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc( "Optimization remarks file containing inline remarks to be replayed " - "by inlining from cgscc inline remarks."), + "by cgscc inlining."), cl::Hidden); +static cl::opt<ReplayInlinerSettings::Scope> CGSCCInlineReplayScope( + "cgscc-inline-replay-scope", + cl::init(ReplayInlinerSettings::Scope::Function), + cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", + "Replay on functions that have remarks associated " + "with them (default)"), + clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", + "Replay on the entire module")), + cl::desc("Whether inline replay should be applied to the entire " + "Module or just the Functions (default) that are present as " + "callers in remarks during cgscc inlining."), + cl::Hidden); + +static cl::opt<ReplayInlinerSettings::Fallback> CGSCCInlineReplayFallback( + "cgscc-inline-replay-fallback", + cl::init(ReplayInlinerSettings::Fallback::Original), + cl::values( + clEnumValN( + ReplayInlinerSettings::Fallback::Original, "Original", + "All decisions not in replay send to original advisor (default)"), + clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, + "AlwaysInline", "All decisions not in replay are inlined"), + clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", + "All decisions not in replay are not inlined")), + cl::desc( + "How cgscc inline replay treats sites that don't come from the replay. 
" + "Original: defers to original advisor, AlwaysInline: inline all sites " + "not in replay, NeverInline: inline no sites not in replay"), + cl::Hidden); + +static cl::opt<CallSiteFormat::Format> CGSCCInlineReplayFormat( + "cgscc-inline-replay-format", + cl::init(CallSiteFormat::Format::LineColumnDiscriminator), + cl::values( + clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), + clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", + "<Line Number>:<Column Number>"), + clEnumValN(CallSiteFormat::Format::LineDiscriminator, + "LineDiscriminator", "<Line Number>.<Discriminator>"), + clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, + "LineColumnDiscriminator", + "<Line Number>:<Column Number>.<Discriminator> (default)")), + cl::desc("How cgscc inline replay file is formatted"), cl::Hidden); + static cl::opt<bool> InlineEnablePriorityOrder( "inline-enable-priority-order", cl::Hidden, cl::init(false), cl::desc("Enable the priority inline order for the inliner")); @@ -463,7 +509,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, } ++NumInlined; - emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC); + emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC); // If inlining this function gave us any new call sites, throw them // onto our worklist to process. They are useful inline candidates. @@ -661,9 +707,12 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM, std::make_unique<DefaultInlineAdvisor>(M, FAM, getInlineParams()); if (!CGSCCInlineReplayFile.empty()) - OwnedAdvisor = std::make_unique<ReplayInlineAdvisor>( + OwnedAdvisor = getReplayInlineAdvisor( M, FAM, M.getContext(), std::move(OwnedAdvisor), - CGSCCInlineReplayFile, + ReplayInlinerSettings{CGSCCInlineReplayFile, + CGSCCInlineReplayScope, + CGSCCInlineReplayFallback, + {CGSCCInlineReplayFormat}}, /*EmitRemarks=*/true); return *OwnedAdvisor; @@ -674,153 +723,6 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM, return *IAA->getAdvisor(); } -template <typename T> class InlineOrder { -public: - using reference = T &; - using const_reference = const T &; - - virtual ~InlineOrder() {} - - virtual size_t size() = 0; - - virtual void push(const T &Elt) = 0; - - virtual T pop() = 0; - - virtual const_reference front() = 0; - - virtual void erase_if(function_ref<bool(T)> Pred) = 0; - - bool empty() { return !size(); } -}; - -template <typename T, typename Container = SmallVector<T, 16>> -class DefaultInlineOrder : public InlineOrder<T> { - using reference = T &; - using const_reference = const T &; - -public: - size_t size() override { return Calls.size() - FirstIndex; } - - void push(const T &Elt) override { Calls.push_back(Elt); } - - T pop() override { - assert(size() > 0); - return Calls[FirstIndex++]; - } - - const_reference front() override { - assert(size() > 0); - return Calls[FirstIndex]; - } - - void erase_if(function_ref<bool(T)> Pred) override { - Calls.erase(std::remove_if(Calls.begin() + FirstIndex, Calls.end(), Pred), - Calls.end()); - } - -private: - Container Calls; - size_t FirstIndex = 0; -}; - -class Priority { -public: - Priority(int Size) : Size(Size) {} - - static bool isMoreDesirable(const Priority &S1, const Priority &S2) { - return S1.Size < S2.Size; - } - - static Priority evaluate(CallBase *CB) { - Function *Callee = CB->getCalledFunction(); - return Priority(Callee->getInstructionCount()); - } - - int Size; -}; - -template <typename PriorityT> -class PriorityInlineOrder : public 
InlineOrder<std::pair<CallBase *, int>> { - using T = std::pair<CallBase *, int>; - using HeapT = std::pair<CallBase *, PriorityT>; - using reference = T &; - using const_reference = const T &; - - static bool cmp(const HeapT &P1, const HeapT &P2) { - return PriorityT::isMoreDesirable(P2.second, P1.second); - } - - // A call site could become less desirable for inlining because of the size - // growth from prior inlining into the callee. This method is used to lazily - // update the desirability of a call site if it's decreasing. It is only - // called on pop() or front(), not every time the desirability changes. When - // the desirability of the front call site decreases, an updated one would be - // pushed right back into the heap. For simplicity, those cases where - // the desirability of a call site increases are ignored here. - void adjust() { - bool Changed = false; - do { - CallBase *CB = Heap.front().first; - const PriorityT PreviousGoodness = Heap.front().second; - const PriorityT CurrentGoodness = PriorityT::evaluate(CB); - Changed = PriorityT::isMoreDesirable(PreviousGoodness, CurrentGoodness); - if (Changed) { - std::pop_heap(Heap.begin(), Heap.end(), cmp); - Heap.pop_back(); - Heap.push_back({CB, CurrentGoodness}); - std::push_heap(Heap.begin(), Heap.end(), cmp); - } - } while (Changed); - } - -public: - size_t size() override { return Heap.size(); } - - void push(const T &Elt) override { - CallBase *CB = Elt.first; - const int InlineHistoryID = Elt.second; - const PriorityT Goodness = PriorityT::evaluate(CB); - - Heap.push_back({CB, Goodness}); - std::push_heap(Heap.begin(), Heap.end(), cmp); - InlineHistoryMap[CB] = InlineHistoryID; - } - - T pop() override { - assert(size() > 0); - adjust(); - - CallBase *CB = Heap.front().first; - T Result = std::make_pair(CB, InlineHistoryMap[CB]); - InlineHistoryMap.erase(CB); - std::pop_heap(Heap.begin(), Heap.end(), cmp); - Heap.pop_back(); - return Result; - } - - const_reference front() override { - assert(size() > 0); - adjust(); - - CallBase *CB = Heap.front().first; - return *InlineHistoryMap.find(CB); - } - - void erase_if(function_ref<bool(T)> Pred) override { - auto PredWrapper = [=](HeapT P) -> bool { - return Pred(std::make_pair(P.first, 0)); - }; - Heap.erase(std::remove_if(Heap.begin(), Heap.end(), PredWrapper), - Heap.end()); - std::make_heap(Heap.begin(), Heap.end(), cmp); - } - -private: - SmallVector<HeapT, 16> Heap; - DenseMap<CallBase *, int> InlineHistoryMap; -}; - PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR) { @@ -868,7 +770,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // incrementally maknig a single function grow in a super linear fashion. std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls; if (InlineEnablePriorityOrder) - Calls = std::make_unique<PriorityInlineOrder<Priority>>(); + Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>(); else Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>(); assert(Calls != nullptr && "Expected an initialized InlineOrder"); @@ -972,8 +874,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, continue; } - auto Advice = Advisor.getAdvice(*CB, OnlyMandatory); + std::unique_ptr<InlineAdvice> Advice = + Advisor.getAdvice(*CB, OnlyMandatory); + // Check whether we want to inline this callsite. 
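
The advice object returned here follows a small ask-act-record protocol that both this pass and the new module inliner below use. A condensed sketch of that lifecycle in the surrounding loop's context (the InlineFunctionInfo plumbing and remark emission are omitted):

    // One call site, one advice object: ask the advisor, act on the
    // recommendation, then record what actually happened so the advisor and
    // any replay tooling can learn the outcome.
    std::unique_ptr<InlineAdvice> Advice = Advisor.getAdvice(*CB, OnlyMandatory);
    if (!Advice || !Advice->isInliningRecommended()) {
      if (Advice)
        Advice->recordUnattemptedInlining();        // advised against, not tried
    } else {
      InlineResult Res = InlineFunction(*CB, IFI);
      if (!Res.isSuccess())
        Advice->recordUnsuccessfulInlining(Res);    // attempted, but it failed
      else if (CalleeWasDeleted)
        Advice->recordInliningWithCalleeDeleted();  // inlined and callee dropped
      else
        Advice->recordInlining();                   // inlined, callee kept
    }
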
+ if (!Advice) + continue; + if (!Advice->isInliningRecommended()) { Advice->recordUnattemptedInlining(); continue; @@ -1104,6 +1011,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, UR.InlinedInternalEdges.insert({&N, OldC}); } InlinedCallees.clear(); + + // Invalidate analyses for this function now so that we don't have to + // invalidate analyses for all functions in this SCC later. + FAM.invalidate(F, PreservedAnalyses::none()); } // Now that we've finished inlining all of the calls across this SCC, delete @@ -1147,10 +1058,12 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!Changed) return PreservedAnalyses::all(); + PreservedAnalyses PA; // Even if we change the IR, we update the core CGSCC data structures and so // can preserve the proxy to the function analysis manager. - PreservedAnalyses PA; PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); + // We have already invalidated all analyses on modified functions. + PA.preserveSet<AllAnalysesOn<Function>>(); return PA; } @@ -1173,7 +1086,11 @@ ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params, PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, ModuleAnalysisManager &MAM) { auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M); - if (!IAA.tryCreate(Params, Mode, CGSCCInlineReplayFile)) { + if (!IAA.tryCreate(Params, Mode, + {CGSCCInlineReplayFile, + CGSCCInlineReplayScope, + CGSCCInlineReplayFallback, + {CGSCCInlineReplayFormat}})) { M.getContext().emitError( "Could not setup Inlining Advisor for the requested " "mode and/or options"); @@ -1192,10 +1109,39 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, else MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations))); + + MPM.addPass(std::move(AfterCGMPM)); MPM.run(M, MAM); - IAA.clear(); + // Discard the InlineAdvisor, a subsequent inlining session should construct + // its own. + auto PA = PreservedAnalyses::all(); + PA.abandon<InlineAdvisorAnalysis>(); + return PA; +} - // The ModulePassManager has already taken care of invalidating analyses. - return PreservedAnalyses::all(); +void InlinerPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<InlinerPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + if (OnlyMandatory) + OS << "<only-mandatory>"; +} + +void ModuleInlinerWrapperPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + // Print some info about passes added to the wrapper. This is however + // incomplete as InlineAdvisorAnalysis part isn't included (which also depends + // on Params and Mode). + if (!MPM.isEmpty()) { + MPM.printPipeline(OS, MapClassName2PassName); + OS << ","; + } + OS << "cgscc("; + if (MaxDevirtIterations != 0) + OS << "devirt<" << MaxDevirtIterations << ">("; + PM.printPipeline(OS, MapClassName2PassName); + if (MaxDevirtIterations != 0) + OS << ")"; + OS << ")"; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp index db3b4384ce67..692e445cb7cb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp @@ -201,21 +201,6 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) { AlwaysPreserved.insert(V->getName()); } - // Mark all functions not in the api as internal. 
- IsWasm = Triple(M.getTargetTriple()).isOSBinFormatWasm(); - for (Function &I : M) { - if (!maybeInternalize(I, ComdatMap)) - continue; - Changed = true; - - if (ExternalNode) - // Remove a callgraph edge from the external node to this function. - ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); - - ++NumFunctions; - LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n"); - } - // Never internalize the llvm.used symbol. It is used to implement // attribute((used)). // FIXME: Shouldn't this just filter on llvm.metadata section?? @@ -237,6 +222,21 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) { else AlwaysPreserved.insert("__stack_chk_guard"); + // Mark all functions not in the api as internal. + IsWasm = Triple(M.getTargetTriple()).isOSBinFormatWasm(); + for (Function &I : M) { + if (!maybeInternalize(I, ComdatMap)) + continue; + Changed = true; + + if (ExternalNode) + // Remove a callgraph edge from the external node to this function. + ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); + + ++NumFunctions; + LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n"); + } + // Mark all global variables with initializers that are not in the api as // internal as well. for (auto &GV : M.globals()) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp index a497c0390bce..d9a59dd35fde 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -283,3 +283,13 @@ PreservedAnalyses LoopExtractorPass::run(Module &M, ModuleAnalysisManager &AM) { PA.preserve<LoopAnalysis>(); return PA; } + +void LoopExtractorPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<LoopExtractorPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << "<"; + if (NumLoops == 1) + OS << "single"; + OS << ">"; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index b492b200c6d5..f78971f0e586 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -342,7 +342,8 @@ private: struct ScopedSaveAliaseesAndUsed { Module &M; SmallVector<GlobalValue *, 4> Used, CompilerUsed; - std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases; + std::vector<std::pair<GlobalAlias *, Function *>> FunctionAliases; + std::vector<std::pair<GlobalIFunc *, Function *>> ResolverIFuncs; ScopedSaveAliaseesAndUsed(Module &M) : M(M) { // The users of this class want to replace all function references except @@ -362,13 +363,16 @@ struct ScopedSaveAliaseesAndUsed { if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true)) GV->eraseFromParent(); - for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) { + for (auto &GA : M.aliases()) { // FIXME: This should look past all aliases not just interposable ones, // see discussion on D65118. 
- if (auto *F = - dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts())) - FunctionAliases.push_back({&GIS, F}); + if (auto *F = dyn_cast<Function>(GA.getAliasee()->stripPointerCasts())) + FunctionAliases.push_back({&GA, F}); } + + for (auto &GI : M.ifuncs()) + if (auto *F = dyn_cast<Function>(GI.getResolver()->stripPointerCasts())) + ResolverIFuncs.push_back({&GI, F}); } ~ScopedSaveAliaseesAndUsed() { @@ -376,8 +380,15 @@ struct ScopedSaveAliaseesAndUsed { appendToCompilerUsed(M, CompilerUsed); for (auto P : FunctionAliases) - P.first->setIndirectSymbol( + P.first->setAliasee( ConstantExpr::getBitCast(P.second, P.first->getType())); + + for (auto P : ResolverIFuncs) { + // This does not preserve pointer casts that may have been stripped by the + // constructor, but the resolver's type is different from that of the + // ifunc anyway. + P.first->setResolver(P.second); + } } }; @@ -1550,17 +1561,28 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), ConstantInt::get(IntPtrTy, I)}), F->getType()); - if (Functions[I]->isExported()) { - if (IsJumpTableCanonical) { - ExportSummary->cfiFunctionDefs().insert(std::string(F->getName())); - } else { - GlobalAlias *JtAlias = GlobalAlias::create( - F->getValueType(), 0, GlobalValue::ExternalLinkage, - F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); + + const bool IsExported = Functions[I]->isExported(); + if (!IsJumpTableCanonical) { + GlobalValue::LinkageTypes LT = IsExported + ? GlobalValue::ExternalLinkage + : GlobalValue::InternalLinkage; + GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, + F->getName() + ".cfi_jt", + CombinedGlobalElemPtr, &M); + if (IsExported) JtAlias->setVisibility(GlobalValue::HiddenVisibility); + else + appendToUsed(M, {JtAlias}); + } + + if (IsExported) { + if (IsJumpTableCanonical) + ExportSummary->cfiFunctionDefs().insert(std::string(F->getName())); + else ExportSummary->cfiFunctionDecls().insert(std::string(F->getName())); - } } + if (!IsJumpTableCanonical) { if (F->hasExternalWeakLinkage()) replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, @@ -1751,11 +1773,7 @@ static bool isDirectCall(Use& U) { void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical) { SmallSetVector<Constant *, 4> Constants; - auto UI = Old->use_begin(), E = Old->use_end(); - for (; UI != E;) { - Use &U = *UI; - ++UI; - + for (Use &U : llvm::make_early_inc_range(Old->uses())) { // Skip block addresses if (isa<BlockAddress>(U.getUser())) continue; @@ -1792,12 +1810,11 @@ bool LowerTypeTestsModule::lower() { M.getFunction(Intrinsic::getName(Intrinsic::type_test)); if (DropTypeTests && TypeTestFunc) { - for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end(); - UI != UE;) { - auto *CI = cast<CallInst>((*UI++).getUser()); + for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) { + auto *CI = cast<CallInst>(U.getUser()); // Find and erase llvm.assume intrinsics for this llvm.type.test call. - for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;) - if (auto *Assume = dyn_cast<AssumeInst>((*CIU++).getUser())) + for (Use &CIU : llvm::make_early_inc_range(CI->uses())) + if (auto *Assume = dyn_cast<AssumeInst>(CIU.getUser())) Assume->eraseFromParent(); // If the assume was merged with another assume, we might have a use on a // phi (which will feed the assume). 
Simply replace the use on the phi @@ -1835,13 +1852,9 @@ bool LowerTypeTestsModule::lower() { return false; if (ImportSummary) { - if (TypeTestFunc) { - for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end(); - UI != UE;) { - auto *CI = cast<CallInst>((*UI++).getUser()); - importTypeTest(CI); - } - } + if (TypeTestFunc) + for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) + importTypeTest(cast<CallInst>(U.getUser())); if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty()) report_fatal_error( @@ -2100,11 +2113,11 @@ bool LowerTypeTestsModule::lower() { auto CI = cast<CallInst>(U.getUser()); std::vector<GlobalTypeMember *> Targets; - if (CI->getNumArgOperands() % 2 != 1) + if (CI->arg_size() % 2 != 1) report_fatal_error("number of arguments should be odd"); GlobalClassesTy::member_iterator CurSet; - for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) { + for (unsigned I = 1; I != CI->arg_size(); I += 2) { int64_t Offset; auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset( CI->getOperand(I), Offset, M.getDataLayout())); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 9e6dd879ac01..97ef872c5499 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -463,17 +463,15 @@ bool MergeFunctions::runOnModule(Module &M) { // Replace direct callers of Old with New. void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); - for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) { - Use *U = &*UI; - ++UI; - CallBase *CB = dyn_cast<CallBase>(U->getUser()); - if (CB && CB->isCallee(U)) { + for (Use &U : llvm::make_early_inc_range(Old->uses())) { + CallBase *CB = dyn_cast<CallBase>(U.getUser()); + if (CB && CB->isCallee(&U)) { // Do not copy attributes from the called function to the call-site. // Function comparison ensures that the attributes are the same up to // type congruences in byval(), in which case we need to keep the byval // type of the call-site, not the callee function. remove(CB->getFunction()); - U->set(BitcastNew); + U.set(BitcastNew); } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ModuleInliner.cpp new file mode 100644 index 000000000000..ebf080e87c3b --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ModuleInliner.cpp @@ -0,0 +1,354 @@ +//===- ModuleInliner.cpp - Code related to module inliner -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the mechanics required to implement inlining without +// missing any calls in the module level. It doesn't need any infromation about +// SCC or call graph, which is different from the SCC inliner. The decisions of +// which calls are profitable to inline are implemented elsewhere. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ModuleInliner.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InlineAdvisor.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InlineOrder.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/CallPromotionUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include <cassert> +#include <functional> + +using namespace llvm; + +#define DEBUG_TYPE "module-inline" + +STATISTIC(NumInlined, "Number of functions inlined"); +STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); + +static cl::opt<bool> InlineEnablePriorityOrder( + "module-inline-enable-priority-order", cl::Hidden, cl::init(true), + cl::desc("Enable the priority inline order for the module inliner")); + +/// Return true if the specified inline history ID +/// indicates an inline history that includes the specified function. +static bool inlineHistoryIncludes( + Function *F, int InlineHistoryID, + const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) { + while (InlineHistoryID != -1) { + assert(unsigned(InlineHistoryID) < InlineHistory.size() && + "Invalid inline history ID"); + if (InlineHistory[InlineHistoryID].first == F) + return true; + InlineHistoryID = InlineHistory[InlineHistoryID].second; + } + return false; +} + +InlineAdvisor &ModuleInlinerPass::getAdvisor(const ModuleAnalysisManager &MAM, + FunctionAnalysisManager &FAM, + Module &M) { + if (OwnedAdvisor) + return *OwnedAdvisor; + + auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M); + if (!IAA) { + // It should still be possible to run the inliner as a stand-alone module + // pass, for test scenarios. In that case, we default to the + // DefaultInlineAdvisor, which doesn't need to keep state between module + // pass runs. It also uses just the default InlineParams. In this case, we + // need to use the provided FAM, which is valid for the duration of the + // inliner pass, and thus the lifetime of the owned advisor. The one we + // would get from the MAM can be invalidated as a result of the inliner's + // activity. 
+ OwnedAdvisor = std::make_unique<DefaultInlineAdvisor>(M, FAM, Params); + + return *OwnedAdvisor; + } + assert(IAA->getAdvisor() && + "Expected a present InlineAdvisorAnalysis also have an " + "InlineAdvisor initialized"); + return *IAA->getAdvisor(); +} + +static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) { + LibFunc LF; + + // Either this is a normal library function or a "vectorizable" + // function. Not using the VFDatabase here because this query + // is related only to libraries handled via the TLI. + return TLI.getLibFunc(F, LF) || + TLI.isKnownVectorFunctionInLibrary(F.getName()); +} + +PreservedAnalyses ModuleInlinerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + LLVM_DEBUG(dbgs() << "---- Module Inliner is Running ---- \n"); + + auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M); + if (!IAA.tryCreate(Params, Mode, {})) { + M.getContext().emitError( + "Could not setup Inlining Advisor for the requested " + "mode and/or options"); + return PreservedAnalyses::all(); + } + + bool Changed = false; + + ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M); + + FunctionAnalysisManager &FAM = + MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; + + InlineAdvisor &Advisor = getAdvisor(MAM, FAM, M); + Advisor.onPassEntry(); + + auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); }); + + // In the module inliner, a priority-based worklist is used for calls across + // the entire Module. With this module inliner, the inline order is not + // limited to bottom-up order. More globally scope inline order is enabled. + // Also, the inline deferral logic become unnecessary in this module inliner. + // It is possible to use other priority heuristics, e.g. profile-based + // heuristic. + // + // TODO: Here is a huge amount duplicate code between the module inliner and + // the SCC inliner, which need some refactoring. + std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls; + if (InlineEnablePriorityOrder) + Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>(); + else + Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>(); + assert(Calls != nullptr && "Expected an initialized InlineOrder"); + + // Populate the initial list of calls in this module. + for (Function &F : M) { + auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + // We want to generally process call sites top-down in order for + // simplifications stemming from replacing the call with the returned value + // after inlining to be visible to subsequent inlining decisions. + // FIXME: Using instructions sequence is a really bad way to do this. + // Instead we should do an actual RPO walk of the function body. 
+ for (Instruction &I : instructions(F)) + if (auto *CB = dyn_cast<CallBase>(&I)) + if (Function *Callee = CB->getCalledFunction()) { + if (!Callee->isDeclaration()) + Calls->push({CB, -1}); + else if (!isa<IntrinsicInst>(I)) { + using namespace ore; + setInlineRemark(*CB, "unavailable definition"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I) + << NV("Callee", Callee) << " will not be inlined into " + << NV("Caller", CB->getCaller()) + << " because its definition is unavailable" + << setIsVerbose(); + }); + } + } + } + if (Calls->empty()) + return PreservedAnalyses::all(); + + // When inlining a callee produces new call sites, we want to keep track of + // the fact that they were inlined from the callee. This allows us to avoid + // infinite inlining in some obscure cases. To represent this, we use an + // index into the InlineHistory vector. + SmallVector<std::pair<Function *, int>, 16> InlineHistory; + + // Track a set vector of inlined callees so that we can augment the caller + // with all of their edges in the call graph before pruning out the ones that + // got simplified away. + SmallSetVector<Function *, 4> InlinedCallees; + + // Track the dead functions to delete once finished with inlining calls. We + // defer deleting these to make it easier to handle the call graph updates. + SmallVector<Function *, 4> DeadFunctions; + + // Loop forward over all of the calls. + while (!Calls->empty()) { + // We expect the calls to typically be batched with sequences of calls that + // have the same caller, so we first set up some shared infrastructure for + // this caller. We also do any pruning we can at this layer on the caller + // alone. + Function &F = *Calls->front().first->getCaller(); + + LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n" + << " Function size: " << F.getInstructionCount() + << "\n"); + + auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { + return FAM.getResult<AssumptionAnalysis>(F); + }; + + // Now process as many calls as we have within this caller in the sequence. + // We bail out as soon as the caller has to change so we can + // prepare the context of that new caller. + bool DidInline = false; + while (!Calls->empty() && Calls->front().first->getCaller() == &F) { + auto P = Calls->pop(); + CallBase *CB = P.first; + const int InlineHistoryID = P.second; + Function &Callee = *CB->getCalledFunction(); + + if (InlineHistoryID != -1 && + inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) { + setInlineRemark(*CB, "recursive"); + continue; + } + + auto Advice = Advisor.getAdvice(*CB, /*OnlyMandatory*/ false); + // Check whether we want to inline this callsite. + if (!Advice->isInliningRecommended()) { + Advice->recordUnattemptedInlining(); + continue; + } + + // Setup the data structure used to plumb customization into the + // `InlineFunction` routine. + InlineFunctionInfo IFI( + /*cg=*/nullptr, GetAssumptionCache, PSI, + &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())), + &FAM.getResult<BlockFrequencyAnalysis>(Callee)); + + InlineResult IR = + InlineFunction(*CB, IFI, &FAM.getResult<AAManager>(*CB->getCaller())); + if (!IR.isSuccess()) { + Advice->recordUnsuccessfulInlining(IR); + continue; + } + + DidInline = true; + InlinedCallees.insert(&Callee); + ++NumInlined; + + LLVM_DEBUG(dbgs() << " Size after inlining: " + << F.getInstructionCount() << "\n"); + + // Add any new callsites to defined functions to the worklist. 
+ if (!IFI.InlinedCallSites.empty()) { + int NewHistoryID = InlineHistory.size(); + InlineHistory.push_back({&Callee, InlineHistoryID}); + + for (CallBase *ICB : reverse(IFI.InlinedCallSites)) { + Function *NewCallee = ICB->getCalledFunction(); + if (!NewCallee) { + // Try to promote an indirect (virtual) call without waiting for + // the post-inline cleanup and the next DevirtSCCRepeatedPass + // iteration because the next iteration may not happen and we may + // miss inlining it. + if (tryPromoteCall(*ICB)) + NewCallee = ICB->getCalledFunction(); + } + if (NewCallee) + if (!NewCallee->isDeclaration()) + Calls->push({ICB, NewHistoryID}); + } + } + + // Merge the attributes based on the inlining. + AttributeFuncs::mergeAttributesForInlining(F, Callee); + + // For local functions, check whether this makes the callee trivially + // dead. In that case, we can drop the body of the function eagerly + // which may reduce the number of callers of other functions to one, + // changing inline cost thresholds. + bool CalleeWasDeleted = false; + if (Callee.hasLocalLinkage()) { + // To check this we also need to nuke any dead constant uses (perhaps + // made dead by this operation on other functions). + Callee.removeDeadConstantUsers(); + // if (Callee.use_empty() && !CG.isLibFunction(Callee)) { + if (Callee.use_empty() && !isKnownLibFunction(Callee, GetTLI(Callee))) { + Calls->erase_if([&](const std::pair<CallBase *, int> &Call) { + return Call.first->getCaller() == &Callee; + }); + // Clear the body and queue the function itself for deletion when we + // finish inlining. + // Note that after this point, it is an error to do anything other + // than use the callee's address or delete it. + Callee.dropAllReferences(); + assert(!is_contained(DeadFunctions, &Callee) && + "Cannot put cause a function to become dead twice!"); + DeadFunctions.push_back(&Callee); + CalleeWasDeleted = true; + } + } + if (CalleeWasDeleted) + Advice->recordInliningWithCalleeDeleted(); + else + Advice->recordInlining(); + } + + if (!DidInline) + continue; + Changed = true; + + InlinedCallees.clear(); + } + + // Now that we've finished inlining all of the calls across this module, + // delete all of the trivially dead functions. + // + // Note that this walks a pointer set which has non-deterministic order but + // that is OK as all we do is delete things and add pointers to unordered + // sets. + for (Function *DeadF : DeadFunctions) { + // Clear out any cached analyses. + FAM.clear(*DeadF, DeadF->getName()); + + // And delete the actual function from the module. + // The Advisor may use Function pointers to efficiently index various + // internal maps, e.g. for memoization. Function cleanup passes like + // argument promotion create new functions. It is possible for a new + // function to be allocated at the address of a deleted function. We could + // index using names, but that's inefficient. Alternatively, we let the + // Advisor free the functions when it sees fit. 
+ DeadF->getBasicBlockList().clear(); + M.getFunctionList().remove(DeadF); + + ++NumDeleted; + } + + if (!Changed) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 75eec25f5807..f342c35fa283 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -33,6 +34,8 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -41,6 +44,8 @@ #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/CodeExtractor.h" +#include <algorithm> + using namespace llvm; using namespace omp; @@ -72,6 +77,46 @@ static cl::opt<bool> HideMemoryTransferLatency( " transfers"), cl::Hidden, cl::init(false)); +static cl::opt<bool> DisableOpenMPOptDeglobalization( + "openmp-opt-disable-deglobalization", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations involving deglobalization."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> DisableOpenMPOptSPMDization( + "openmp-opt-disable-spmdization", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations involving SPMD-ization."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> DisableOpenMPOptFolding( + "openmp-opt-disable-folding", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> DisableOpenMPOptStateMachineRewrite( + "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations that replace the state machine."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> PrintModuleAfterOptimizations( + "openmp-opt-print-module", cl::ZeroOrMore, + cl::desc("Print the current module after OpenMP optimizations."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> AlwaysInlineDeviceFunctions( + "openmp-opt-inline-device", cl::ZeroOrMore, + cl::desc("Inline all applicible functions on the device."), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> + EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore, + cl::desc("Enables more verbose remarks."), cl::Hidden, + cl::init(false)); + +static cl::opt<unsigned> + SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, + cl::desc("Maximal number of attributor iterations."), + cl::init(256)); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -328,7 +373,7 @@ struct OMPInformationCache : public InformationCache { if (F->arg_size() != RTFArgTypes.size()) return false; - auto RTFTyIt = RTFArgTypes.begin(); + auto *RTFTyIt = RTFArgTypes.begin(); for (Argument &Arg : F->args()) { if (Arg.getType() != *RTFTyIt) return false; @@ -503,7 +548,7 @@ struct KernelInfoState : AbstractState { /// State to track if we are in SPMD-mode, assumed or know, and why we decided /// we cannot be. 
If it is assumed, then RequiresFullRuntime should also be /// false. - BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker; + BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker; /// The __kmpc_target_init call in this kernel, if any. If we find more than /// one we abort as the kernel is malformed. @@ -542,7 +587,9 @@ struct KernelInfoState : AbstractState { /// See AbstractState::indicatePessimisticFixpoint(...) ChangeStatus indicatePessimisticFixpoint() override { IsAtFixpoint = true; + ReachingKernelEntries.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + ReachedKnownParallelRegions.indicatePessimisticFixpoint(); ReachedUnknownParallelRegions.indicatePessimisticFixpoint(); return ChangeStatus::CHANGED; } @@ -550,6 +597,10 @@ struct KernelInfoState : AbstractState { /// See AbstractState::indicateOptimisticFixpoint(...) ChangeStatus indicateOptimisticFixpoint() override { IsAtFixpoint = true; + ReachingKernelEntries.indicateOptimisticFixpoint(); + SPMDCompatibilityTracker.indicateOptimisticFixpoint(); + ReachedKnownParallelRegions.indicateOptimisticFixpoint(); + ReachedUnknownParallelRegions.indicateOptimisticFixpoint(); return ChangeStatus::UNCHANGED; } @@ -569,6 +620,12 @@ struct KernelInfoState : AbstractState { return true; } + /// Returns true if this kernel contains any OpenMP parallel regions. + bool mayContainParallelRegion() { + return !ReachedKnownParallelRegions.empty() || + !ReachedUnknownParallelRegions.empty(); + } + /// Return empty set as the best state of potential values. static KernelInfoState getBestState() { return KernelInfoState(true); } @@ -584,12 +641,14 @@ struct KernelInfoState : AbstractState { // Do not merge two different _init and _deinit call sites. if (KIS.KernelInitCB) { if (KernelInitCB && KernelInitCB != KIS.KernelInitCB) - indicatePessimisticFixpoint(); + llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt " + "assumptions."); KernelInitCB = KIS.KernelInitCB; } if (KIS.KernelDeinitCB) { if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB) - indicatePessimisticFixpoint(); + llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt " + "assumptions."); KernelDeinitCB = KIS.KernelDeinitCB; } SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker; @@ -1032,8 +1091,8 @@ private: Args.clear(); Args.push_back(OutlinedFn->getArg(0)); Args.push_back(OutlinedFn->getArg(1)); - for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); - U < E; ++U) + for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E; + ++U) Args.push_back(CI->getArgOperand(U)); CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); @@ -1041,9 +1100,9 @@ private: NewCI->setDebugLoc(CI->getDebugLoc()); // Forward parameter attributes from the callback to the callee. - for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); - U < E; ++U) - for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) + for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E; + ++U) + for (const Attribute &A : CI->getAttributes().getParamAttrs(U)) NewCI->addParamAttr( U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); @@ -1563,13 +1622,13 @@ private: // TODO: Use dominance to find a good position instead. 
auto CanBeMoved = [this](CallBase &CB) { - unsigned NumArgs = CB.getNumArgOperands(); + unsigned NumArgs = CB.arg_size(); if (NumArgs == 0) return true; if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr) return false; - for (unsigned u = 1; u < NumArgs; ++u) - if (isa<Instruction>(CB.getArgOperand(u))) + for (unsigned U = 1; U < NumArgs; ++U) + if (isa<Instruction>(CB.getArgOperand(U))) return false; return true; }; @@ -1612,7 +1671,7 @@ private: // valid at the new location. For now we just pick a global one, either // existing and used by one of the calls, or created from scratch. if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) { - if (CI->getNumArgOperands() > 0 && + if (!CI->arg_empty() && CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) { Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F, /* GlobalOnly */ true); @@ -1695,8 +1754,8 @@ private: // Transitively search for more arguments by looking at the users of the // ones we know already. During the search the GTIdArgs vector is extended // so we cannot cache the size nor can we use a range based for. - for (unsigned u = 0; u < GTIdArgs.size(); ++u) - AddUserArgs(*GTIdArgs[u]); + for (unsigned U = 0; U < GTIdArgs.size(); ++U) + AddUserArgs(*GTIdArgs[U]); } /// Kernel (=GPU) optimizations and utility functions @@ -1822,6 +1881,10 @@ private: OMPRTL___kmpc_kernel_end_parallel); ExternalizationRAII BarrierSPMD(OMPInfoCache, OMPRTL___kmpc_barrier_simple_spmd); + ExternalizationRAII BarrierGeneric(OMPInfoCache, + OMPRTL___kmpc_barrier_simple_generic); + ExternalizationRAII ThreadId(OMPInfoCache, + OMPRTL___kmpc_get_hardware_thread_id_in_block); registerAAs(IsModulePass); @@ -1918,6 +1981,10 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { if (!KernelParallelRFI) return Changed; + // If we have disabled state machine changes, exit + if (DisableOpenMPOptStateMachineRewrite) + return Changed; + for (Function *F : SCC) { // Check if the function is a use in a __kmpc_parallel_51 call at @@ -2509,9 +2576,8 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; - // Check if the edge into the successor block compares the __kmpc_target_init - // result with -1. If we are in non-SPMD-mode that signals only the main - // thread will execute the edge. + // Check if the edge into the successor block contains a condition that only + // lets the main thread execute it. auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { if (!Edge || !Edge->isConditional()) return false; @@ -2526,16 +2592,27 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { if (!C) return false; - // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) + // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) if (C->isAllOnesValue()) { auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0)); CB = CB ? 
OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr; if (!CB) return false; - const int InitIsSPMDArgNo = 1; - auto *IsSPMDModeCI = - dyn_cast<ConstantInt>(CB->getOperand(InitIsSPMDArgNo)); - return IsSPMDModeCI && IsSPMDModeCI->isZero(); + const int InitModeArgNo = 1; + auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo)); + return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC); + } + + if (C->isZero()) { + // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x() + if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0))) + if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x) + return true; + + // Match: 0 == llvm.amdgcn.workitem.id.x() + if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0))) + if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x) + return true; } return false; @@ -2544,15 +2621,14 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // Merge all the predecessor states into the current basic block. A basic // block is executed by a single thread if all of its predecessors are. auto MergePredecessorStates = [&](BasicBlock *BB) { - if (pred_begin(BB) == pred_end(BB)) + if (pred_empty(BB)) return SingleThreadedBBs.contains(BB); bool IsInitialThread = true; - for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB); - PredBB != PredEndBB; ++PredBB) { - if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()), + for (BasicBlock *PredBB : predecessors(BB)) { + if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()), BB)) - IsInitialThread &= SingleThreadedBBs.contains(*PredBB); + IsInitialThread &= SingleThreadedBBs.contains(PredBB); } return IsInitialThread; @@ -2684,9 +2760,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared { ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0)); - LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in " - << CB->getCaller()->getName() << " with " - << AllocSize->getZExtValue() + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB + << " with " << AllocSize->getZExtValue() << " bytes of shared memory\n"); // Create a new shared memory buffer of the same size as the allocation @@ -2735,7 +2810,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { const auto &ED = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*F), DepClassTy::REQUIRED); if (CallBase *CB = dyn_cast<CallBase>(U)) - if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) || + if (!isa<ConstantInt>(CB->getArgOperand(0)) || !ED.isExecutedByInitialThreadOnly(*CB)) MallocCalls.erase(CB); } @@ -2770,9 +2845,17 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]" : "") + std::string(" #PRs: ") + - std::to_string(ReachedKnownParallelRegions.size()) + + (ReachedKnownParallelRegions.isValidState() + ? std::to_string(ReachedKnownParallelRegions.size()) + : "<invalid>") + ", #Unknown PRs: " + - std::to_string(ReachedUnknownParallelRegions.size()); + (ReachedUnknownParallelRegions.isValidState() + ? std::to_string(ReachedUnknownParallelRegions.size()) + : "<invalid>") + + ", #Reaching Kernels: " + + (ReachingKernelEntries.isValidState() + ? std::to_string(ReachingKernelEntries.size()) + : "<invalid>"); } /// Create an abstract attribute biew for the position \p IRP. 
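For reference, the matching above relies on the second __kmpc_target_init argument now being an 8-bit execution-mode bitmask rather than a boolean IsSPMD flag. A minimal standalone sketch of that scheme follows; the flag values are assumed to mirror the OMP_TGT_EXEC_MODE_* constants and are not quoted from OMPConstants.h.

  #include <cassert>
  #include <cstdint>

  // Assumed layout of the execution-mode flags carried by the kernel
  // init/deinit calls and the <kernel>_exec_mode global.
  enum ExecModeFlags : int8_t {
    EXEC_MODE_GENERIC      = 1 << 0,
    EXEC_MODE_SPMD         = 1 << 1,
    EXEC_MODE_GENERIC_SPMD = EXEC_MODE_GENERIC | EXEC_MODE_SPMD,
  };

  int main() {
    int8_t Mode = EXEC_MODE_GENERIC;     // kernel starts in generic mode
    assert(Mode & EXEC_MODE_GENERIC);    // the IsInitialThreadOnly-style test
    // SPMDization keeps the generic bit in the global flag (GENERIC_SPMD)
    // while passing plain SPMD to __kmpc_target_init/_deinit.
    int8_t GlobalFlag = int8_t(Mode | EXEC_MODE_GENERIC_SPMD);
    int8_t InitArg = EXEC_MODE_SPMD;
    assert((GlobalFlag & EXEC_MODE_SPMD) && (InitArg & EXEC_MODE_SPMD));
    return 0;
  }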
@@ -2798,6 +2881,12 @@ struct AAKernelInfoFunction : AAKernelInfo { AAKernelInfoFunction(const IRPosition &IRP, Attributor &A) : AAKernelInfo(IRP, A) {} + SmallPtrSet<Instruction *, 4> GuardedInstructions; + + SmallPtrSetImpl<Instruction *> &getGuardedInstructions() { + return GuardedInstructions; + } + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { // This is a high-level transform that might change the constant arguments @@ -2844,8 +2933,11 @@ struct AAKernelInfoFunction : AAKernelInfo { }, Fn); - assert((KernelInitCB && KernelDeinitCB) && - "Kernel without __kmpc_target_init or __kmpc_target_deinit!"); + // Ignore kernels without initializers such as global constructors. + if (!KernelInitCB || !KernelDeinitCB) { + indicateOptimisticFixpoint(); + return; + } // For kernels we might need to initialize/finalize the IsSPMD state and // we need to register a simplification callback so that the Attributor @@ -2860,7 +2952,10 @@ struct AAKernelInfoFunction : AAKernelInfo { // state. As long as we are not in an invalid state, we will create a // custom state machine so the value should be a `i1 false`. If we are // in an invalid state, we won't change the value that is in the IR. - if (!isValidState()) + if (!ReachedKnownParallelRegions.isValidState()) + return nullptr; + // If we have disabled state machine rewrites, don't make a custom one. + if (DisableOpenMPOptStateMachineRewrite) return nullptr; if (AA) A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); @@ -2870,7 +2965,7 @@ struct AAKernelInfoFunction : AAKernelInfo { return FalseVal; }; - Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB = + Attributor::SimplifictionCallbackTy ModeSimplifyCB = [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> Optional<Value *> { // IRP represents the "SPMDCompatibilityTracker" argument of an @@ -2886,8 +2981,10 @@ struct AAKernelInfoFunction : AAKernelInfo { } else { UsedAssumedInformation = false; } - auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(), - SPMDCompatibilityTracker.isAssumed()); + auto *Val = ConstantInt::getSigned( + IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()), + SPMDCompatibilityTracker.isAssumed() ? 
OMP_TGT_EXEC_MODE_SPMD + : OMP_TGT_EXEC_MODE_GENERIC); return Val; }; @@ -2912,8 +3009,8 @@ struct AAKernelInfoFunction : AAKernelInfo { return Val; }; - constexpr const int InitIsSPMDArgNo = 1; - constexpr const int DeinitIsSPMDArgNo = 1; + constexpr const int InitModeArgNo = 1; + constexpr const int DeinitModeArgNo = 1; constexpr const int InitUseStateMachineArgNo = 2; constexpr const int InitRequiresFullRuntimeArgNo = 3; constexpr const int DeinitRequiresFullRuntimeArgNo = 2; @@ -2921,11 +3018,11 @@ struct AAKernelInfoFunction : AAKernelInfo { IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), StateMachineSimplifyCB); A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo), - IsSPMDModeSimplifyCB); + IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo), + ModeSimplifyCB); A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo), - IsSPMDModeSimplifyCB); + IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo), + ModeSimplifyCB); A.registerSimplificationCallback( IRPosition::callsite_argument(*KernelInitCB, InitRequiresFullRuntimeArgNo), @@ -2936,10 +3033,25 @@ struct AAKernelInfoFunction : AAKernelInfo { IsGenericModeSimplifyCB); // Check if we know we are in SPMD-mode already. - ConstantInt *IsSPMDArg = - dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); - if (IsSPMDArg && !IsSPMDArg->isZero()) + ConstantInt *ModeArg = + dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo)); + if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) SPMDCompatibilityTracker.indicateOptimisticFixpoint(); + // This is a generic region but SPMDization is disabled so stop tracking. + else if (DisableOpenMPOptSPMDization) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + } + + /// Sanitize the string \p S such that it is a suitable global symbol name. + static std::string sanitizeForGlobalName(std::string S) { + std::replace_if( + S.begin(), S.end(), + [](const char C) { + return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') || + (C >= '0' && C <= '9') || C == '_'); + }, + '.'); + return S; } /// Modify the IR based on the KernelInfoState as the fixpoint iteration is @@ -2950,19 +3062,16 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!KernelInitCB || !KernelDeinitCB) return ChangeStatus::UNCHANGED; - // Known SPMD-mode kernels need no manifest changes. - if (SPMDCompatibilityTracker.isKnown()) - return ChangeStatus::UNCHANGED; - // If we can we change the execution mode to SPMD-mode otherwise we build a // custom state machine. - if (!changeToSPMDMode(A)) - buildCustomStateMachine(A); + ChangeStatus Changed = ChangeStatus::UNCHANGED; + if (!changeToSPMDMode(A, Changed)) + return buildCustomStateMachine(A); - return ChangeStatus::CHANGED; + return Changed; } - bool changeToSPMDMode(Attributor &A) { + bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); if (!SPMDCompatibilityTracker.isAssumed()) { @@ -2994,38 +3103,259 @@ struct AAKernelInfoFunction : AAKernelInfo { return false; } - // Adjust the global exec mode flag that tells the runtime what mode this - // kernel is executed in. + // Check if the kernel is already in SPMD mode, if so, return success. 
Function *Kernel = getAnchorScope(); GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( (Kernel->getName() + "_exec_mode").str()); assert(ExecMode && "Kernel without exec mode?"); - assert(ExecMode->getInitializer() && - ExecMode->getInitializer()->isOneValue() && - "Initially non-SPMD kernel has SPMD exec mode!"); + assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!"); // Set the global exec mode flag to indicate SPMD-Generic mode. - constexpr int SPMDGeneric = 2; - if (!ExecMode->getInitializer()->isZeroValue()) - ExecMode->setInitializer( - ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric)); + assert(isa<ConstantInt>(ExecMode->getInitializer()) && + "ExecMode is not an integer!"); + const int8_t ExecModeVal = + cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue(); + if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC) + return true; + + // We will now unconditionally modify the IR, indicate a change. + Changed = ChangeStatus::CHANGED; + + auto CreateGuardedRegion = [&](Instruction *RegionStartI, + Instruction *RegionEndI) { + LoopInfo *LI = nullptr; + DominatorTree *DT = nullptr; + MemorySSAUpdater *MSU = nullptr; + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + + BasicBlock *ParentBB = RegionStartI->getParent(); + Function *Fn = ParentBB->getParent(); + Module &M = *Fn->getParent(); + + // Create all the blocks and logic. + // ParentBB: + // goto RegionCheckTidBB + // RegionCheckTidBB: + // Tid = __kmpc_hardware_thread_id() + // if (Tid != 0) + // goto RegionBarrierBB + // RegionStartBB: + // <execute instructions guarded> + // goto RegionEndBB + // RegionEndBB: + // <store escaping values to shared mem> + // goto RegionBarrierBB + // RegionBarrierBB: + // __kmpc_simple_barrier_spmd() + // // second barrier is omitted if lacking escaping values. + // <load escaping values from shared mem> + // __kmpc_simple_barrier_spmd() + // goto RegionExitBB + // RegionExitBB: + // <execute rest of instructions> + + BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(), + DT, LI, MSU, "region.guarded.end"); + BasicBlock *RegionBarrierBB = + SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI, + MSU, "region.barrier"); + BasicBlock *RegionExitBB = + SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(), + DT, LI, MSU, "region.exit"); + BasicBlock *RegionStartBB = + SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded"); + + assert(ParentBB->getUniqueSuccessor() == RegionStartBB && + "Expected a different CFG"); + + BasicBlock *RegionCheckTidBB = SplitBlock( + ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid"); + + // Register basic blocks with the Attributor. + A.registerManifestAddedBasicBlock(*RegionEndBB); + A.registerManifestAddedBasicBlock(*RegionBarrierBB); + A.registerManifestAddedBasicBlock(*RegionExitBB); + A.registerManifestAddedBasicBlock(*RegionStartBB); + A.registerManifestAddedBasicBlock(*RegionCheckTidBB); + + bool HasBroadcastValues = false; + // Find escaping outputs from the guarded region to outside users and + // broadcast their values to them. 
+ for (Instruction &I : *RegionStartBB) { + SmallPtrSet<Instruction *, 4> OutsideUsers; + for (User *Usr : I.users()) { + Instruction &UsrI = *cast<Instruction>(Usr); + if (UsrI.getParent() != RegionStartBB) + OutsideUsers.insert(&UsrI); + } + + if (OutsideUsers.empty()) + continue; + + HasBroadcastValues = true; + + // Emit a global variable in shared memory to store the broadcasted + // value. + auto *SharedMem = new GlobalVariable( + M, I.getType(), /* IsConstant */ false, + GlobalValue::InternalLinkage, UndefValue::get(I.getType()), + sanitizeForGlobalName( + (I.getName() + ".guarded.output.alloc").str()), + nullptr, GlobalValue::NotThreadLocal, + static_cast<unsigned>(AddressSpace::Shared)); + + // Emit a store instruction to update the value. + new StoreInst(&I, SharedMem, RegionEndBB->getTerminator()); + + LoadInst *LoadI = new LoadInst(I.getType(), SharedMem, + I.getName() + ".guarded.output.load", + RegionBarrierBB->getTerminator()); + + // Emit a load instruction and replace uses of the output value. + for (Instruction *UsrI : OutsideUsers) + UsrI->replaceUsesOfWith(&I, LoadI); + } + + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + + // Go to tid check BB in ParentBB. + const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); + ParentBB->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(ParentBB, ParentBB->end()), DL); + OMPInfoCache.OMPBuilder.updateToLocation(Loc); + auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc); + Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr); + BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL); + + // Add check for Tid in RegionCheckTidBB + RegionCheckTidBB->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription LocRegionCheckTid( + InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL); + OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid); + FunctionCallee HardwareTidFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_hardware_thread_id_in_block); + Value *Tid = + OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {}); + Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid); + OMPInfoCache.OMPBuilder.Builder + .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB) + ->setDebugLoc(DL); + + // First barrier for synchronization, ensures main thread has updated + // values. + FunctionCallee BarrierFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_barrier_simple_spmd); + OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy( + RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt())); + OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}) + ->setDebugLoc(DL); + + // Second barrier ensures workers have read broadcast values. 
+ if (HasBroadcastValues) + CallInst::Create(BarrierFn, {Ident, Tid}, "", + RegionBarrierBB->getTerminator()) + ->setDebugLoc(DL); + }; + + auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + SmallPtrSet<BasicBlock *, 8> Visited; + for (Instruction *GuardedI : SPMDCompatibilityTracker) { + BasicBlock *BB = GuardedI->getParent(); + if (!Visited.insert(BB).second) + continue; + + SmallVector<std::pair<Instruction *, Instruction *>> Reorders; + Instruction *LastEffect = nullptr; + BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend(); + while (++IP != IPEnd) { + if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory()) + continue; + Instruction *I = &*IP; + if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI)) + continue; + if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) { + LastEffect = nullptr; + continue; + } + if (LastEffect) + Reorders.push_back({I, LastEffect}); + LastEffect = &*IP; + } + for (auto &Reorder : Reorders) + Reorder.first->moveBefore(Reorder.second); + } + + SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions; + + for (Instruction *GuardedI : SPMDCompatibilityTracker) { + BasicBlock *BB = GuardedI->getParent(); + auto *CalleeAA = A.lookupAAFor<AAKernelInfo>( + IRPosition::function(*GuardedI->getFunction()), nullptr, + DepClassTy::NONE); + assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo"); + auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA); + // Continue if instruction is already guarded. + if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI)) + continue; + + Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr; + for (Instruction &I : *BB) { + // If instruction I needs to be guarded update the guarded region + // bounds. + if (SPMDCompatibilityTracker.contains(&I)) { + CalleeAAFunction.getGuardedInstructions().insert(&I); + if (GuardedRegionStart) + GuardedRegionEnd = &I; + else + GuardedRegionStart = GuardedRegionEnd = &I; + + continue; + } + + // Instruction I does not need guarding, store + // any region found and reset bounds. + if (GuardedRegionStart) { + GuardedRegions.push_back( + std::make_pair(GuardedRegionStart, GuardedRegionEnd)); + GuardedRegionStart = nullptr; + GuardedRegionEnd = nullptr; + } + } + } + + for (auto &GR : GuardedRegions) + CreateGuardedRegion(GR.first, GR.second); + + // Adjust the global exec mode flag that tells the runtime what mode this + // kernel is executed in. + assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && + "Initially non-SPMD kernel has SPMD exec mode!"); + ExecMode->setInitializer( + ConstantInt::get(ExecMode->getInitializer()->getType(), + ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD)); // Next rewrite the init and deinit calls to indicate we use SPMD-mode now. 
- const int InitIsSPMDArgNo = 1; - const int DeinitIsSPMDArgNo = 1; + const int InitModeArgNo = 1; + const int DeinitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; const int InitRequiresFullRuntimeArgNo = 3; const int DeinitRequiresFullRuntimeArgNo = 2; auto &Ctx = getAnchorValue().getContext(); - A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo), - *ConstantInt::getBool(Ctx, 1)); + A.changeUseAfterManifest( + KernelInitCB->getArgOperandUse(InitModeArgNo), + *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), + OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *ConstantInt::getBool(Ctx, 0)); A.changeUseAfterManifest( - KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), - *ConstantInt::getBool(Ctx, 1)); + KernelDeinitCB->getArgOperandUse(DeinitModeArgNo), + *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), + OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo), *ConstantInt::getBool(Ctx, 0)); @@ -3043,10 +3373,15 @@ struct AAKernelInfoFunction : AAKernelInfo { }; ChangeStatus buildCustomStateMachine(Attributor &A) { - assert(ReachedKnownParallelRegions.isValidState() && - "Custom state machine with invalid parallel region states?"); + // If we have disabled state machine rewrites, don't make a custom one + if (DisableOpenMPOptStateMachineRewrite) + return ChangeStatus::UNCHANGED; + + // Don't rewrite the state machine if we are not in a valid state. + if (!ReachedKnownParallelRegions.isValidState()) + return ChangeStatus::UNCHANGED; - const int InitIsSPMDArgNo = 1; + const int InitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; // Check if the current configuration is non-SPMD and generic state machine. @@ -3055,14 +3390,14 @@ struct AAKernelInfoFunction : AAKernelInfo { // we give up. ConstantInt *UseStateMachine = dyn_cast<ConstantInt>( KernelInitCB->getArgOperand(InitUseStateMachineArgNo)); - ConstantInt *IsSPMD = - dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); + ConstantInt *Mode = + dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo)); // If we are stuck with generic mode, try to create a custom device (=GPU) // state machine which is specialized for the parallel regions that are // reachable by the kernel. - if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD || - !IsSPMD->isZero()) + if (!UseStateMachine || UseStateMachine->isZero() || !Mode || + (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) return ChangeStatus::UNCHANGED; // If not SPMD mode, indicate we use a custom state machine now. @@ -3075,8 +3410,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // happen if there simply are no parallel regions. In the resulting kernel // all worker threads will simply exit right away, leaving the main thread // to do the work alone. - if (ReachedKnownParallelRegions.empty() && - ReachedUnknownParallelRegions.empty()) { + if (!mayContainParallelRegion()) { ++NumOpenMPTargetRegionKernelsWithoutStateMachine; auto Remark = [&](OptimizationRemark OR) { @@ -3122,9 +3456,14 @@ struct AAKernelInfoFunction : AAKernelInfo { // Create all the blocks: // // InitCB = __kmpc_target_init(...) 
- // bool IsWorker = InitCB >= 0; + // BlockHwSize = + // __kmpc_get_hardware_num_threads_in_block(); + // WarpSize = __kmpc_get_warp_size(); + // BlockSize = BlockHwSize - WarpSize; + // if (InitCB >= BlockSize) return; + // IsWorkerCheckBB: bool IsWorker = InitCB >= 0; // if (IsWorker) { - // SMBeginBB: __kmpc_barrier_simple_spmd(...); + // SMBeginBB: __kmpc_barrier_simple_generic(...); // void *WorkFn; // bool Active = __kmpc_kernel_parallel(&WorkFn); // if (!WorkFn) return; @@ -3138,7 +3477,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // ((WorkFnTy*)WorkFn)(...); // SMEndParallelBB: __kmpc_kernel_end_parallel(...); // } - // SMDoneBB: __kmpc_barrier_simple_spmd(...); + // SMDoneBB: __kmpc_barrier_simple_generic(...); // goto SMBeginBB; // } // UserCodeEntryBB: // user code @@ -3150,6 +3489,8 @@ struct AAKernelInfoFunction : AAKernelInfo { BasicBlock *InitBB = KernelInitCB->getParent(); BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock( KernelInitCB->getNextNode(), "thread.user_code.check"); + BasicBlock *IsWorkerCheckBB = + BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB); BasicBlock *StateMachineBeginBB = BasicBlock::Create( Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB); BasicBlock *StateMachineFinishedBB = BasicBlock::Create( @@ -3166,6 +3507,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB); A.registerManifestAddedBasicBlock(*InitBB); A.registerManifestAddedBasicBlock(*UserCodeEntryBB); + A.registerManifestAddedBasicBlock(*IsWorkerCheckBB); A.registerManifestAddedBasicBlock(*StateMachineBeginBB); A.registerManifestAddedBasicBlock(*StateMachineFinishedBB); A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB); @@ -3175,16 +3517,38 @@ struct AAKernelInfoFunction : AAKernelInfo { const DebugLoc &DLoc = KernelInitCB->getDebugLoc(); ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); - InitBB->getTerminator()->eraseFromParent(); + + Module &M = *Kernel->getParent(); + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + FunctionCallee BlockHwSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_hardware_num_threads_in_block); + FunctionCallee WarpSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_warp_size); + Instruction *BlockHwSize = + CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + BlockHwSize->setDebugLoc(DLoc); + Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + WarpSize->setDebugLoc(DLoc); + Instruction *BlockSize = + BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); + BlockSize->setDebugLoc(DLoc); + Instruction *IsMainOrWorker = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, + BlockSize, "thread.is_main_or_worker", InitBB); + IsMainOrWorker->setDebugLoc(DLoc); + BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker, + InitBB); + Instruction *IsWorker = ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, ConstantInt::get(KernelInitCB->getType(), -1), - "thread.is_worker", InitBB); + "thread.is_worker", IsWorkerCheckBB); IsWorker->setDebugLoc(DLoc); - BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); - - Module &M = *Kernel->getParent(); + BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, + IsWorkerCheckBB); // Create local storage for the work function pointer. 
const DataLayout &DL = M.getDataLayout(); @@ -3194,7 +3558,6 @@ struct AAKernelInfoFunction : AAKernelInfo { "worker.work_fn.addr", &Kernel->getEntryBlock().front()); WorkFnAI->setDebugLoc(DLoc); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); OMPInfoCache.OMPBuilder.updateToLocation( OpenMPIRBuilder::LocationDescription( IRBuilder<>::InsertPoint(StateMachineBeginBB, @@ -3206,7 +3569,7 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee BarrierFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___kmpc_barrier_simple_spmd); + M, OMPRTL___kmpc_barrier_simple_generic); CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB) ->setDebugLoc(DLoc); @@ -3258,8 +3621,8 @@ struct AAKernelInfoFunction : AAKernelInfo { // Now that we have most of the CFG skeleton it is time for the if-cascade // that checks the function pointer we got from the runtime against the // parallel regions we expect, if there are any. - for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) { - auto *ParallelRegion = ReachedKnownParallelRegions[i]; + for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) { + auto *ParallelRegion = ReachedKnownParallelRegions[I]; BasicBlock *PRExecuteBB = BasicBlock::Create( Ctx, "worker_state_machine.parallel_region.execute", Kernel, StateMachineEndParallelBB); @@ -3275,7 +3638,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // Check if we need to compare the pointer at all or if we can just // call the parallel region function. Value *IsPR; - if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) { + if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) { Instruction *CmpI = ICmpInst::Create( ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion, "worker.check_parallel_region", StateMachineIfCascadeCurrentBB); @@ -3339,8 +3702,21 @@ struct AAKernelInfoFunction : AAKernelInfo { if (llvm::all_of(Objects, [](const Value *Obj) { return isa<AllocaInst>(Obj); })) return true; + // Check for AAHeapToStack moved objects which must not be guarded. + auto &HS = A.getAAFor<AAHeapToStack>( + *this, IRPosition::function(*I.getFunction()), + DepClassTy::OPTIONAL); + if (llvm::all_of(Objects, [&HS](const Value *Obj) { + auto *CB = dyn_cast<CallBase>(Obj); + if (!CB) + return false; + return HS.isAssumedHeapToStack(*CB); + })) { + return true; + } } - // For now we give up on everything but stores. + + // Insert instruction that needs guarding. SPMDCompatibilityTracker.insert(&I); return true; }; @@ -3354,9 +3730,13 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!IsKernelEntry) { updateReachingKernelEntries(A); updateParallelLevels(A); + + if (!ParallelLevels.isValidState()) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); } // Callback to check a call instruction. 
+ bool AllParallelRegionStatesWereFixed = true; bool AllSPMDStatesWereFixed = true; auto CheckCallInst = [&](Instruction &I) { auto &CB = cast<CallBase>(I); @@ -3364,13 +3744,37 @@ struct AAKernelInfoFunction : AAKernelInfo { *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); getState() ^= CBAA.getState(); AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint(); + AllParallelRegionStatesWereFixed &= + CBAA.ReachedKnownParallelRegions.isAtFixpoint(); + AllParallelRegionStatesWereFixed &= + CBAA.ReachedUnknownParallelRegions.isAtFixpoint(); return true; }; bool UsedAssumedInformationInCheckCallInst = false; if (!A.checkForAllCallLikeInstructions( - CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) + CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) { + LLVM_DEBUG(dbgs() << TAG + << "Failed to visit all call-like instructions!\n";); return indicatePessimisticFixpoint(); + } + + // If we haven't used any assumed information for the reached parallel + // region states we can fix it. + if (!UsedAssumedInformationInCheckCallInst && + AllParallelRegionStatesWereFixed) { + ReachedKnownParallelRegions.indicateOptimisticFixpoint(); + ReachedUnknownParallelRegions.indicateOptimisticFixpoint(); + } + + // If we are sure there are no parallel regions in the kernel we do not + // want SPMD mode. + if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() && + ReachedKnownParallelRegions.isAtFixpoint() && + ReachedUnknownParallelRegions.isValidState() && + ReachedKnownParallelRegions.isValidState() && + !mayContainParallelRegion()) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); // If we haven't used any assumed information for the SPMD state we can fix // it. @@ -3469,14 +3873,14 @@ struct AAKernelInfoCallSite : AAKernelInfo { CallBase &CB = cast<CallBase>(getAssociatedValue()); Function *Callee = getAssociatedFunction(); - // Helper to lookup an assumption string. - auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) { - return Fn && hasAssumption(*Fn, AssumptionStr); - }; + auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>( + *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); // Check for SPMD-mode assumptions. - if (HasAssumption(Callee, "ompx_spmd_amenable")) + if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) { SPMDCompatibilityTracker.indicateOptimisticFixpoint(); + indicateOptimisticFixpoint(); + } // First weed out calls we do not care about, that is readonly/readnone // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a @@ -3498,14 +3902,16 @@ struct AAKernelInfoCallSite : AAKernelInfo { // Unknown callees might contain parallel regions, except if they have // an appropriate assumption attached. - if (!(HasAssumption(Callee, "omp_no_openmp") || - HasAssumption(Callee, "omp_no_parallelism"))) + if (!(AssumptionAA.hasAssumption("omp_no_openmp") || + AssumptionAA.hasAssumption("omp_no_parallelism"))) ReachedUnknownParallelRegions.insert(&CB); // If SPMDCompatibilityTracker is not fixed, we need to give up on the // idea we can run something unknown in SPMD-mode. - if (!SPMDCompatibilityTracker.isAtFixpoint()) + if (!SPMDCompatibilityTracker.isAtFixpoint()) { + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); + } // We have updated the state for this unknown call properly, there won't // be any change so we indicate a fixpoint. 
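The update logic above only freezes the parallel-region and SPMD states once every visited call site is itself at a fixpoint and no assumed information was consumed. A small standalone sketch of that propagation pattern, in plain C++ rather than the Attributor API:

  #include <vector>

  struct SimpleState {
    bool Valid = true;       // false == pessimistic fixpoint
    bool AtFixpoint = false; // true == state can no longer change
  };

  // Merge callee states into a caller state; only indicate an optimistic
  // fixpoint when all inputs were fixed and nothing was merely assumed.
  SimpleState mergeCallSites(const std::vector<SimpleState> &Callees,
                             bool UsedAssumedInformation) {
    SimpleState S;
    bool AllFixed = true;
    for (const SimpleState &C : Callees) {
      S.Valid &= C.Valid;       // conjunction, loosely like getState() ^= ...
      AllFixed &= C.AtFixpoint;
    }
    if (AllFixed && !UsedAssumedInformation)
      S.AtFixpoint = true;
    return S;
  }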
@@ -3521,6 +3927,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { switch (RF) { // All the functions we know are compatible with SPMD mode. case OMPRTL___kmpc_is_spmd_exec_mode: + case OMPRTL___kmpc_distribute_static_fini: case OMPRTL___kmpc_for_static_fini: case OMPRTL___kmpc_global_thread_num: case OMPRTL___kmpc_get_hardware_num_threads_in_block: @@ -3531,6 +3938,10 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_end_master: case OMPRTL___kmpc_barrier: break; + case OMPRTL___kmpc_distribute_static_init_4: + case OMPRTL___kmpc_distribute_static_init_4u: + case OMPRTL___kmpc_distribute_static_init_8: + case OMPRTL___kmpc_distribute_static_init_8u: case OMPRTL___kmpc_for_static_init_4: case OMPRTL___kmpc_for_static_init_4u: case OMPRTL___kmpc_for_static_init_8: @@ -3548,6 +3959,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPScheduleType::DistributeChunked: break; default: + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); break; }; @@ -3580,7 +3992,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { return; default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, - // generally. + // generally. However, they do not hide parallel regions. SPMDCompatibilityTracker.insert(&CB); break; } @@ -3700,6 +4112,9 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { } void initialize(Attributor &A) override { + if (DisableOpenMPOptFolding) + indicatePessimisticFixpoint(); + Function *Callee = getAssociatedFunction(); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); @@ -3756,11 +4171,24 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { ChangeStatus Changed = ChangeStatus::UNCHANGED; if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) { - Instruction &CB = *getCtxI(); - A.changeValueAfterManifest(CB, **SimplifiedValue); - A.deleteAfterManifest(CB); + Instruction &I = *getCtxI(); + A.changeValueAfterManifest(I, **SimplifiedValue); + A.deleteAfterManifest(I); - LLVM_DEBUG(dbgs() << TAG << "Folding runtime call: " << CB << " with " + CallBase *CB = dyn_cast<CallBase>(&I); + auto Remark = [&](OptimizationRemark OR) { + if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue)) + return OR << "Replacing OpenMP runtime call " + << CB->getCalledFunction()->getName() << " with " + << ore::NV("FoldedValue", C->getZExtValue()) << "."; + return OR << "Replacing OpenMP runtime call " + << CB->getCalledFunction()->getName() << "."; + }; + + if (CB && EnableVerboseRemarks) + A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark); + + LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with " << **SimplifiedValue << "\n"); Changed = ChangeStatus::CHANGED; @@ -3994,7 +4422,6 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { DepClassTy::NONE, /* ForceUpdate */ false, /* UpdateAfterInit */ false); - registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id); registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level); @@ -4027,7 +4454,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F)); return false; }; - GlobalizationRFI.foreachUse(SCC, CreateAA); + if (!DisableOpenMPOptDeglobalization) + GlobalizationRFI.foreachUse(SCC, CreateAA); // Create an ExecutionDomain AA for every function and a HeapToStack AA for // every function if there is a device kernel. 
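The Disable* options introduced by this patch all follow the same opt-out pattern: a hidden boolean cl::opt checked before the corresponding abstract attributes are created or a rewrite is attempted. A minimal sketch of that pattern; the flag name below is hypothetical, the real flags are e.g. -openmp-opt-disable-deglobalization.

  #include "llvm/Support/CommandLine.h"
  using namespace llvm;

  // Hypothetical flag; cl::Hidden keeps it out of plain -help output.
  static cl::opt<bool> DisableMyRewrite(
      "mypass-disable-rewrite", cl::Hidden, cl::init(false),
      cl::desc("Disable the rewrite for debugging or triage."));

  static bool runRewrite() {
    if (DisableMyRewrite)
      return false; // opt-out, like the deglobalization/SPMDization gates above
    // ... perform the rewrite ...
    return true;
  }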
@@ -4039,7 +4467,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { continue; A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F)); - A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F)); + if (!DisableOpenMPOptDeglobalization) + A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F)); for (auto &I : instructions(*F)) { if (auto *LI = dyn_cast<LoadInst>(&I)) { @@ -4234,12 +4663,24 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { SetVector<Function *> Functions(SCC.begin(), SCC.end()); OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels); - unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; + unsigned MaxFixpointIterations = + (isOpenMPDevice(M)) ? SetFixpointIterations : 32; Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false, MaxFixpointIterations, OREGetter, DEBUG_TYPE); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(true); + + // Optionally inline device functions for potentially better performance. + if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M)) + for (Function &F : M) + if (!F.isDeclaration() && !Kernels.contains(&F) && + !F.hasFnAttribute(Attribute::NoInline)) + F.addFnAttr(Attribute::AlwaysInline); + + if (PrintModuleAfterOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M); + if (Changed) return PreservedAnalyses::none(); @@ -4286,12 +4727,17 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, /*CGSCC*/ Functions, Kernels); - unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; + unsigned MaxFixpointIterations = + (isOpenMPDevice(M)) ? SetFixpointIterations : 32; Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, MaxFixpointIterations, OREGetter, DEBUG_TYPE); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(false); + + if (PrintModuleAfterOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M); + if (Changed) return PreservedAnalyses::none(); @@ -4352,12 +4798,18 @@ struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass { Allocator, /*CGSCC*/ Functions, Kernels); - unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; + unsigned MaxFixpointIterations = + (isOpenMPDevice(M)) ? SetFixpointIterations : 32; Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, MaxFixpointIterations, OREGetter, DEBUG_TYPE); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); - return OMPOpt.run(false); + bool Result = OMPOpt.run(false); + + if (PrintModuleAfterOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M); + + return Result; } bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index d517de38ace3..7402e399a88a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -441,9 +441,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo( }; auto BBProfileCount = [BFI](BasicBlock *BB) { - return BFI->getBlockProfileCount(BB) - ? 
BFI->getBlockProfileCount(BB).getValue() - : 0; + return BFI->getBlockProfileCount(BB).getValueOr(0); }; // Use the same computeBBInlineCost function to compute the cost savings of @@ -1413,7 +1411,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); uint64_t CalleeEntryCountV = - (CalleeEntryCount ? CalleeEntryCount.getCount() : 0); + (CalleeEntryCount ? CalleeEntryCount->getCount() : 0); bool AnyInline = false; for (User *User : Users) { @@ -1461,8 +1459,8 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { if (AnyInline) { Cloner.IsFunctionInlined = true; if (CalleeEntryCount) - Cloner.OrigFunc->setEntryCount( - CalleeEntryCount.setCount(CalleeEntryCountV)); + Cloner.OrigFunc->setEntryCount(Function::ProfileCount( + CalleeEntryCountV, CalleeEntryCount->getType())); OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); OrigFuncORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index aa916345954d..74f68531b89a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -437,6 +437,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions + // The matrix extension can introduce large vector operations early, which can + // benefit from running vector-combine early on. + if (EnableMatrix) + MPM.add(createVectorCombinePass()); + // Begin the loop pass pipeline. if (EnableSimpleLoopUnswitch) { // The simple loop unswitch pass relies on separate cleanup passes. Schedule @@ -1012,7 +1017,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty())); // Propage constant function arguments by specializing the functions. - if (EnableFunctionSpecialization) + if (EnableFunctionSpecialization && OptLevel > 2) PM.add(createFunctionSpecializationPass()); // Propagate constants at call sites into the functions they call. 
This diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp index 081398a390fa..5779553ee732 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp @@ -135,6 +135,7 @@ PreservedAnalyses FunctionSpecializationPass::run(Module &M, return PA; } +namespace { struct FunctionSpecializationLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid FunctionSpecializationLegacyPass() : ModulePass(ID) {} @@ -175,6 +176,7 @@ struct FunctionSpecializationLegacyPass : public ModulePass { return runFunctionSpecialization(M, DL, GetTLI, GetTTI, GetAC, GetAnalysis); } }; +} // namespace char FunctionSpecializationLegacyPass::ID = 0; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index 55b88ac14da5..bae9a1e27e75 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -32,7 +32,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, if (CalleeName.empty()) return getHottestChildContext(CallSite); - uint32_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = nodeHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) return &It->second; @@ -64,8 +64,8 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { ContextTrieNode &ContextTrieNode::moveToChildContext( const LineLocation &CallSite, ContextTrieNode &&NodeToMove, - StringRef ContextStrToRemove, bool DeleteNode) { - uint32_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite); + uint32_t ContextFramesToRemove, bool DeleteNode) { + uint64_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite); assert(!AllChildContext.count(Hash) && "Node to remove must exist"); LineLocation OldCallSite = NodeToMove.CallSiteLoc; ContextTrieNode &OldParentContext = *NodeToMove.getParentContext(); @@ -86,10 +86,10 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( FunctionSamples *FSamples = Node->getFunctionSamples(); if (FSamples) { - FSamples->getContext().promoteOnPath(ContextStrToRemove); + FSamples->getContext().promoteOnPath(ContextFramesToRemove); FSamples->getContext().setState(SyntheticContext); - LLVM_DEBUG(dbgs() << " Context promoted to: " << FSamples->getContext() - << "\n"); + LLVM_DEBUG(dbgs() << " Context promoted to: " + << FSamples->getContext().toString() << "\n"); } for (auto &It : Node->getAllChildContext()) { @@ -108,12 +108,12 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( void ContextTrieNode::removeChildContext(const LineLocation &CallSite, StringRef CalleeName) { - uint32_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = nodeHash(CalleeName, CallSite); // Note this essentially calls dtor and destroys that child context AllChildContext.erase(Hash); } -std::map<uint32_t, ContextTrieNode> &ContextTrieNode::getAllChildContext() { +std::map<uint64_t, ContextTrieNode> &ContextTrieNode::getAllChildContext() { return AllChildContext; } @@ -127,6 +127,15 @@ void ContextTrieNode::setFunctionSamples(FunctionSamples *FSamples) { FuncSamples = FSamples; } +Optional<uint32_t> ContextTrieNode::getFunctionSize() const { return FuncSize; } + +void ContextTrieNode::addFunctionSize(uint32_t FSize) { + if (!FuncSize.hasValue()) + FuncSize = 0; + + FuncSize = 
FuncSize.getValue() + FSize; +} + LineLocation ContextTrieNode::getCallSiteLoc() const { return CallSiteLoc; } ContextTrieNode *ContextTrieNode::getParentContext() const { @@ -137,9 +146,10 @@ void ContextTrieNode::setParentContext(ContextTrieNode *Parent) { ParentContext = Parent; } -void ContextTrieNode::dump() { +void ContextTrieNode::dumpNode() { dbgs() << "Node: " << FuncName << "\n" << " Callsite: " << CallSiteLoc << "\n" + << " Size: " << FuncSize << "\n" << " Children:\n"; for (auto &It : AllChildContext) { @@ -147,20 +157,38 @@ void ContextTrieNode::dump() { } } -uint32_t ContextTrieNode::nodeHash(StringRef ChildName, +void ContextTrieNode::dumpTree() { + dbgs() << "Context Profile Tree:\n"; + std::queue<ContextTrieNode *> NodeQueue; + NodeQueue.push(this); + + while (!NodeQueue.empty()) { + ContextTrieNode *Node = NodeQueue.front(); + NodeQueue.pop(); + Node->dumpNode(); + + for (auto &It : Node->getAllChildContext()) { + ContextTrieNode *ChildNode = &It.second; + NodeQueue.push(ChildNode); + } + } +} + +uint64_t ContextTrieNode::nodeHash(StringRef ChildName, const LineLocation &Callsite) { // We still use child's name for child hash, this is // because for children of root node, we don't have // different line/discriminator, and we'll rely on name // to differentiate children. - uint32_t NameHash = std::hash<std::string>{}(ChildName.str()); - uint32_t LocId = (Callsite.LineOffset << 16) | Callsite.Discriminator; + uint64_t NameHash = std::hash<std::string>{}(ChildName.str()); + uint64_t LocId = + (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator; return NameHash + (LocId << 5) + LocId; } ContextTrieNode *ContextTrieNode::getOrCreateChildContext( const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) { - uint32_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = nodeHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) { assert(It->second.getFuncName() == CalleeName && @@ -177,13 +205,16 @@ ContextTrieNode *ContextTrieNode::getOrCreateChildContext( // Profiler tracker than manages profiles and its associated context SampleContextTracker::SampleContextTracker( - StringMap<FunctionSamples> &Profiles) { + SampleProfileMap &Profiles, + const DenseMap<uint64_t, StringRef> *GUIDToFuncNameMap) + : GUIDToFuncNameMap(GUIDToFuncNameMap) { for (auto &FuncSample : Profiles) { FunctionSamples *FSamples = &FuncSample.second; - SampleContext Context(FuncSample.first(), RawContext); - LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context << "\n"); + SampleContext Context = FuncSample.first; + LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context.toString() + << "\n"); if (!Context.isBaseContext()) - FuncToCtxtProfiles[Context.getNameWithoutContext()].push_back(FSamples); + FuncToCtxtProfiles[Context.getName()].insert(FSamples); ContextTrieNode *NewNode = getOrCreateContextPath(Context, true); assert(!NewNode->getFunctionSamples() && "New node can't have sample profile"); @@ -200,6 +231,10 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, return nullptr; CalleeName = FunctionSamples::getCanonicalFnName(CalleeName); + // Convert real function names to MD5 names, if the input profile is + // MD5-based. + std::string FGUID; + CalleeName = getRepInFormat(CalleeName, FunctionSamples::UseMD5, FGUID); // For indirect call, CalleeName will be empty, in which case the context // profile for callee with largest total samples will be returned. 
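Note on the hunk above: ContextTrieNode::nodeHash is widened from 32 to 64 bits so the callsite's line offset and discriminator occupy disjoint 32-bit halves instead of sharing one 32-bit word with a 16-bit shift, which removes a class of hash collisions between distinct callsites. A standalone sketch of that combining scheme follows; the struct and function below are simplified stand-ins for illustration, not the exact LLVM declarations.

// Standalone sketch of the widened hash combining shown in the hunk above.
#include <cstdint>
#include <functional>
#include <string>

struct LineLocation {
  uint32_t LineOffset;
  uint32_t Discriminator;
};

uint64_t nodeHash(const std::string &ChildName, const LineLocation &Callsite) {
  // The child name still participates: children of the root node all sit at
  // line 0 / discriminator 0, so only the name can tell them apart.
  uint64_t NameHash = std::hash<std::string>{}(ChildName);
  // Line offset and discriminator now occupy disjoint 32-bit halves, so two
  // different callsites can no longer collide inside LocId.
  uint64_t LocId = (static_cast<uint64_t>(Callsite.LineOffset) << 32) |
                   Callsite.Discriminator;
  return NameHash + (LocId << 5) + LocId;
}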
@@ -207,7 +242,8 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, if (CalleeContext) { FunctionSamples *FSamples = CalleeContext->getFunctionSamples(); LLVM_DEBUG(if (FSamples) { - dbgs() << " Callee context found: " << FSamples->getContext() << "\n"; + dbgs() << " Callee context found: " << FSamples->getContext().toString() + << "\n"; }); return FSamples; } @@ -285,6 +321,11 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func, FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name, bool MergeContext) { LLVM_DEBUG(dbgs() << "Getting base profile for function: " << Name << "\n"); + // Convert real function names to MD5 names, if the input profile is + // MD5-based. + std::string FGUID; + Name = getRepInFormat(Name, FunctionSamples::UseMD5, FGUID); + // Base profile is top-level node (child of root node), so try to retrieve // existing top-level node for given function first. If it exists, it could be // that we've merged base profile before, or there's actually context-less @@ -299,14 +340,14 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name, // into base profile. for (auto *CSamples : FuncToCtxtProfiles[Name]) { SampleContext &Context = CSamples->getContext(); - ContextTrieNode *FromNode = getContextFor(Context); - if (FromNode == Node) - continue; - // Skip inlined context profile and also don't re-merge any context if (Context.hasState(InlinedContext) || Context.hasState(MergedContext)) continue; + ContextTrieNode *FromNode = getContextFor(Context); + if (FromNode == Node) + continue; + ContextTrieNode &ToNode = promoteMergeContextSamplesTree(*FromNode); assert((!Node || Node == &ToNode) && "Expect only one base profile"); Node = &ToNode; @@ -324,7 +365,7 @@ void SampleContextTracker::markContextSamplesInlined( const FunctionSamples *InlinedSamples) { assert(InlinedSamples && "Expect non-null inlined samples"); LLVM_DEBUG(dbgs() << "Marking context profile as inlined: " - << InlinedSamples->getContext() << "\n"); + << InlinedSamples->getContext().toString() << "\n"); InlinedSamples->getContext().setState(InlinedContext); } @@ -376,30 +417,23 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( FunctionSamples *FromSamples = NodeToPromo.getFunctionSamples(); assert(FromSamples && "Shouldn't promote a context without profile"); LLVM_DEBUG(dbgs() << " Found context tree root to promote: " - << FromSamples->getContext() << "\n"); + << FromSamples->getContext().toString() << "\n"); assert(!FromSamples->getContext().hasState(InlinedContext) && "Shouldn't promote inlined context profile"); - StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext(); + uint32_t ContextFramesToRemove = + FromSamples->getContext().getContextFrames().size() - 1; return promoteMergeContextSamplesTree(NodeToPromo, RootContext, - ContextStrToRemove); + ContextFramesToRemove); } -void SampleContextTracker::dump() { - dbgs() << "Context Profile Tree:\n"; - std::queue<ContextTrieNode *> NodeQueue; - NodeQueue.push(&RootContext); - - while (!NodeQueue.empty()) { - ContextTrieNode *Node = NodeQueue.front(); - NodeQueue.pop(); - Node->dump(); +void SampleContextTracker::dump() { RootContext.dumpTree(); } - for (auto &It : Node->getAllChildContext()) { - ContextTrieNode *ChildNode = &It.second; - NodeQueue.push(ChildNode); - } - } +StringRef SampleContextTracker::getFuncNameFor(ContextTrieNode *Node) const { + if (!FunctionSamples::UseMD5) + return Node->getFuncName(); + 
assert(GUIDToFuncNameMap && "GUIDToFuncNameMap needs to be populated first"); + return GUIDToFuncNameMap->lookup(std::stoull(Node->getFuncName().data())); } ContextTrieNode * @@ -444,11 +478,22 @@ ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) { RootName = PrevDIL->getScope()->getSubprogram()->getName(); S.push_back(std::make_pair(LineLocation(0, 0), RootName)); + // Convert real function names to MD5 names, if the input profile is + // MD5-based. + std::list<std::string> MD5Names; + if (FunctionSamples::UseMD5) { + for (auto &Location : S) { + MD5Names.emplace_back(); + getRepInFormat(Location.second, FunctionSamples::UseMD5, MD5Names.back()); + Location.second = MD5Names.back(); + } + } + ContextTrieNode *ContextNode = &RootContext; int I = S.size(); while (--I >= 0 && ContextNode) { LineLocation &CallSite = S[I].first; - StringRef &CalleeName = S[I].second; + StringRef CalleeName = S[I].second; ContextNode = ContextNode->getChildContext(CallSite, CalleeName); } @@ -462,27 +507,18 @@ ContextTrieNode * SampleContextTracker::getOrCreateContextPath(const SampleContext &Context, bool AllowCreate) { ContextTrieNode *ContextNode = &RootContext; - StringRef ContextRemain = Context; - StringRef ChildContext; - StringRef CalleeName; LineLocation CallSiteLoc(0, 0); - while (ContextNode && !ContextRemain.empty()) { - auto ContextSplit = SampleContext::splitContextString(ContextRemain); - ChildContext = ContextSplit.first; - ContextRemain = ContextSplit.second; - LineLocation NextCallSiteLoc(0, 0); - SampleContext::decodeContextString(ChildContext, CalleeName, - NextCallSiteLoc); - + for (auto &Callsite : Context.getContextFrames()) { // Create child node at parent line/disc location if (AllowCreate) { ContextNode = - ContextNode->getOrCreateChildContext(CallSiteLoc, CalleeName); + ContextNode->getOrCreateChildContext(CallSiteLoc, Callsite.FuncName); } else { - ContextNode = ContextNode->getChildContext(CallSiteLoc, CalleeName); + ContextNode = + ContextNode->getChildContext(CallSiteLoc, Callsite.FuncName); } - CallSiteLoc = NextCallSiteLoc; + CallSiteLoc = Callsite.Location; } assert((!AllowCreate || ContextNode) && @@ -502,7 +538,7 @@ ContextTrieNode &SampleContextTracker::addTopLevelContextNode(StringRef FName) { void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode, - StringRef ContextStrToRemove) { + uint32_t ContextFramesToRemove) { FunctionSamples *FromSamples = FromNode.getFunctionSamples(); FunctionSamples *ToSamples = ToNode.getFunctionSamples(); if (FromSamples && ToSamples) { @@ -510,19 +546,21 @@ void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, ToSamples->merge(*FromSamples); ToSamples->getContext().setState(SyntheticContext); FromSamples->getContext().setState(MergedContext); + if (FromSamples->getContext().hasAttribute(ContextShouldBeInlined)) + ToSamples->getContext().setAttribute(ContextShouldBeInlined); } else if (FromSamples) { // Transfer FromSamples from FromNode to ToNode ToNode.setFunctionSamples(FromSamples); FromSamples->getContext().setState(SyntheticContext); - FromSamples->getContext().promoteOnPath(ContextStrToRemove); + FromSamples->getContext().promoteOnPath(ContextFramesToRemove); FromNode.setFunctionSamples(nullptr); } } ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent, - StringRef ContextStrToRemove) { - assert(!ContextStrToRemove.empty() && "Context to remove can't be empty"); + uint32_t 
ContextFramesToRemove) { + assert(ContextFramesToRemove && "Context to remove can't be empty"); // Ignore call site location if destination is top level under root LineLocation NewCallSiteLoc = LineLocation(0, 0); @@ -540,21 +578,21 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( // Do not delete node to move from its parent here because // caller is iterating over children of that parent node. ToNode = &ToNodeParent.moveToChildContext( - NewCallSiteLoc, std::move(FromNode), ContextStrToRemove, false); + NewCallSiteLoc, std::move(FromNode), ContextFramesToRemove, false); } else { // Destination node exists, merge samples for the context tree - mergeContextNode(FromNode, *ToNode, ContextStrToRemove); + mergeContextNode(FromNode, *ToNode, ContextFramesToRemove); LLVM_DEBUG({ if (ToNode->getFunctionSamples()) dbgs() << " Context promoted and merged to: " - << ToNode->getFunctionSamples()->getContext() << "\n"; + << ToNode->getFunctionSamples()->getContext().toString() << "\n"; }); // Recursively promote and merge children for (auto &It : FromNode.getAllChildContext()) { ContextTrieNode &FromChildNode = It.second; promoteMergeContextSamplesTree(FromChildNode, *ToNode, - ContextStrToRemove); + ContextFramesToRemove); } // Remove children once they're all merged diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index 8e9c79fc7bbb..a961c47a7501 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -143,6 +143,12 @@ static cl::opt<bool> ProfileSampleAccurate( "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. ")); +static cl::opt<bool> ProfileSampleBlockAccurate( + "profile-sample-block-accurate", cl::Hidden, cl::init(false), + cl::desc("If the sample profile is accurate, we will mark all un-sampled " + "branches and calls as having 0 samples. Otherwise, treat " + "them conservatively as unknown. ")); + static cl::opt<bool> ProfileAccurateForSymsInList( "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore, cl::init(true), @@ -214,6 +220,16 @@ static cl::opt<bool> CallsitePrioritizedInline( cl::desc("Use call site prioritized inlining for sample profile loader." 
"Currently only CSSPGO is supported.")); +static cl::opt<bool> UsePreInlinerDecision( + "sample-profile-use-preinliner", cl::Hidden, cl::ZeroOrMore, + cl::init(false), + cl::desc("Use the preinliner decisions stored in profile context.")); + +static cl::opt<bool> AllowRecursiveInline( + "sample-profile-recursive-inline", cl::Hidden, cl::ZeroOrMore, + cl::init(false), + cl::desc("Allow sample loader inliner to inline recursive calls.")); + static cl::opt<std::string> ProfileInlineReplayFile( "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc( @@ -221,6 +237,50 @@ static cl::opt<std::string> ProfileInlineReplayFile( "by inlining from sample profile loader."), cl::Hidden); +static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope( + "sample-profile-inline-replay-scope", + cl::init(ReplayInlinerSettings::Scope::Function), + cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", + "Replay on functions that have remarks associated " + "with them (default)"), + clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", + "Replay on the entire module")), + cl::desc("Whether inline replay should be applied to the entire " + "Module or just the Functions (default) that are present as " + "callers in remarks during sample profile inlining."), + cl::Hidden); + +static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback( + "sample-profile-inline-replay-fallback", + cl::init(ReplayInlinerSettings::Fallback::Original), + cl::values( + clEnumValN( + ReplayInlinerSettings::Fallback::Original, "Original", + "All decisions not in replay send to original advisor (default)"), + clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, + "AlwaysInline", "All decisions not in replay are inlined"), + clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", + "All decisions not in replay are not inlined")), + cl::desc("How sample profile inline replay treats sites that don't come " + "from the replay. 
Original: defers to original advisor, " + "AlwaysInline: inline all sites not in replay, NeverInline: " + "inline no sites not in replay"), + cl::Hidden); + +static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat( + "sample-profile-inline-replay-format", + cl::init(CallSiteFormat::Format::LineColumnDiscriminator), + cl::values( + clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), + clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", + "<Line Number>:<Column Number>"), + clEnumValN(CallSiteFormat::Format::LineDiscriminator, + "LineDiscriminator", "<Line Number>.<Discriminator>"), + clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, + "LineColumnDiscriminator", + "<Line Number>:<Column Number>.<Discriminator> (default)")), + cl::desc("How sample profile inline replay file is formatted"), cl::Hidden); + static cl::opt<unsigned> MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::ZeroOrMore, @@ -358,10 +418,10 @@ public: std::function<AssumptionCache &(Function &)> GetAssumptionCache, std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo, std::function<const TargetLibraryInfo &(Function &)> GetTLI) - : SampleProfileLoaderBaseImpl(std::string(Name)), + : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)), GetAC(std::move(GetAssumptionCache)), GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), - RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {} + LTOPhase(LTOPhase) {} bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); bool runOnModule(Module &M, ModuleAnalysisManager *AM, @@ -377,7 +437,7 @@ protected: findFunctionSamples(const Instruction &I) const override; std::vector<const FunctionSamples *> findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; - void findExternalInlineCandidate(const FunctionSamples *Samples, + void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples, DenseSet<GlobalValue::GUID> &InlinedGUIDs, const StringMap<Function *> &SymbolMap, uint64_t Threshold); @@ -385,8 +445,11 @@ protected: bool tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); + bool inlineHotFunctions(Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs); + Optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB); + bool getExternalInlineAdvisorShouldInline(CallBase &CB); InlineCost shouldInlineCandidate(InlineCandidate &Candidate); bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); bool @@ -417,9 +480,6 @@ protected: /// Profile tracker for different context. std::unique_ptr<SampleContextTracker> ContextTracker; - /// Name of the profile remapping file to load. - std::string RemappingFilename; - /// Flag indicating whether input profile is context-sensitive bool ProfileIsCS = false; @@ -464,7 +524,7 @@ protected: bool ProfAccForSymsInList; // External inline advisor used to replay inline decision from remarks. - std::unique_ptr<ReplayInlineAdvisor> ExternalInlineAdvisor; + std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor; // A pseudo probe helper to correlate the imported sample counts. 
std::unique_ptr<PseudoProbeManager> ProbeManager; @@ -953,8 +1013,24 @@ void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( } void SampleProfileLoader::findExternalInlineCandidate( - const FunctionSamples *Samples, DenseSet<GlobalValue::GUID> &InlinedGUIDs, + CallBase *CB, const FunctionSamples *Samples, + DenseSet<GlobalValue::GUID> &InlinedGUIDs, const StringMap<Function *> &SymbolMap, uint64_t Threshold) { + + // If ExternalInlineAdvisor wants to inline an external function + // make sure it's imported + if (CB && getExternalInlineAdvisorShouldInline(*CB)) { + // Samples may not exist for replayed function, if so + // just add the direct GUID and move on + if (!Samples) { + InlinedGUIDs.insert( + FunctionSamples::getGUID(CB->getCalledFunction()->getName())); + return; + } + // Otherwise, drop the threshold to import everything that we can + Threshold = 0; + } + assert(Samples && "expect non-null caller profile"); // For AutoFDO profile, retrieve candidate profiles by walking over @@ -975,14 +1051,21 @@ void SampleProfileLoader::findExternalInlineCandidate( // For CSSPGO profile, retrieve candidate profile by walking over the // trie built for context profile. Note that also take call targets // even if callee doesn't have a corresponding context profile. - if (!CalleeSample || CalleeSample->getEntrySamples() < Threshold) + if (!CalleeSample) + continue; + + // If pre-inliner decision is used, honor that for importing as well. + bool PreInline = + UsePreInlinerDecision && + CalleeSample->getContext().hasAttribute(ContextShouldBeInlined); + if (!PreInline && CalleeSample->getEntrySamples() < Threshold) continue; StringRef Name = CalleeSample->getFuncName(); Function *Func = SymbolMap.lookup(Name); // Add to the import list only when it's defined out of module. if (!Func || Func->isDeclaration()) - InlinedGUIDs.insert(FunctionSamples::getGUID(Name)); + InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeSample->getName())); // Import hot CallTargets, which may not be available in IR because full // profile annotation cannot be done until backend compilation in ThinLTO. @@ -992,7 +1075,7 @@ void SampleProfileLoader::findExternalInlineCandidate( StringRef CalleeName = CalleeSample->getFuncName(TS.getKey()); const Function *Callee = SymbolMap.lookup(CalleeName); if (!Callee || Callee->isDeclaration()) - InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeName)); + InlinedGUIDs.insert(FunctionSamples::getGUID(TS.getKey())); } // Import hot child context profile associted with callees. 
Note that this @@ -1042,16 +1125,20 @@ bool SampleProfileLoader::inlineHotFunctions( for (auto &I : BB.getInstList()) { const FunctionSamples *FS = nullptr; if (auto *CB = dyn_cast<CallBase>(&I)) { - if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) { - assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && - "GUIDToFuncNameMap has to be populated"); - AllCandidates.push_back(CB); - if (FS->getEntrySamples() > 0 || ProfileIsCS) - LocalNotInlinedCallSites.try_emplace(CB, FS); - if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) - Hot = true; - else if (shouldInlineColdCallee(*CB)) - ColdCandidates.push_back(CB); + if (!isa<IntrinsicInst>(I)) { + if ((FS = findCalleeFunctionSamples(*CB))) { + assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && + "GUIDToFuncNameMap has to be populated"); + AllCandidates.push_back(CB); + if (FS->getEntrySamples() > 0 || ProfileIsCS) + LocalNotInlinedCallSites.try_emplace(CB, FS); + if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) + Hot = true; + else if (shouldInlineColdCallee(*CB)) + ColdCandidates.push_back(CB); + } else if (getExternalInlineAdvisorShouldInline(*CB)) { + AllCandidates.push_back(CB); + } } } } @@ -1078,7 +1165,7 @@ bool SampleProfileLoader::inlineHotFunctions( for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { uint64_t SumOrigin = Sum; if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap, + findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap, PSI->getOrCompHotCountThreshold()); continue; } @@ -1098,8 +1185,8 @@ bool SampleProfileLoader::inlineHotFunctions( LocalChanged = true; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(findCalleeFunctionSamples(*I), InlinedGUIDs, - SymbolMap, + findExternalInlineCandidate(I, findCalleeFunctionSamples(*I), + InlinedGUIDs, SymbolMap, PSI->getOrCompHotCountThreshold()); } } @@ -1184,8 +1271,8 @@ bool SampleProfileLoader::tryInlineCandidate( *CalledFunction); // The call to InlineFunction erases I, so we can't pass it here. - emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, - true, CSINLINE_DEBUG); + emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, + *BB->getParent(), Cost, true, CSINLINE_DEBUG); // Now populate the list of newly exposed call sites. if (InlinedCallSites) { @@ -1228,7 +1315,9 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, // Find the callee's profile. For indirect call, find hottest target profile. const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB); - if (!CalleeSamples) + // If ExternalInlineAdvisor wants to inline this site, do so even + // if Samples are not present. 
+ if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB)) return false; float Factor = 1.0; @@ -1247,19 +1336,34 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, return true; } -InlineCost -SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { +Optional<InlineCost> +SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { std::unique_ptr<InlineAdvice> Advice = nullptr; if (ExternalInlineAdvisor) { - Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr); - if (!Advice->isInliningRecommended()) { - Advice->recordUnattemptedInlining(); - return InlineCost::getNever("not previously inlined"); + Advice = ExternalInlineAdvisor->getAdvice(CB); + if (Advice) { + if (!Advice->isInliningRecommended()) { + Advice->recordUnattemptedInlining(); + return InlineCost::getNever("not previously inlined"); + } + Advice->recordInlining(); + return InlineCost::getAlways("previously inlined"); } - Advice->recordInlining(); - return InlineCost::getAlways("previously inlined"); } + return {}; +} + +bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { + Optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); + return Cost ? !!Cost.getValue() : false; +} + +InlineCost +SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { + if (Optional<InlineCost> ReplayCost = + getExternalInlineAdvisorCost(*Candidate.CallInstr)) + return ReplayCost.getValue(); // Adjust threshold based on call site hotness, only do this for callsite // prioritized inliner because otherwise cost-benefit check is done earlier. int SampleThreshold = SampleColdCallSiteThreshold; @@ -1274,7 +1378,9 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { assert(Callee && "Expect a definition for inline candidate of direct call"); InlineParams Params = getInlineParams(); + // We will ignore the threshold from inline cost, so always get full cost. Params.ComputeFullInlineCost = true; + Params.AllowRecursiveCall = AllowRecursiveInline; // Checks if there is anything in the reachable portion of the callee at // this callsite that makes this inlining potentially illegal. Need to // set ComputeFullInlineCost, otherwise getInlineCost may return early @@ -1288,6 +1394,25 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { if (Cost.isNever() || Cost.isAlways()) return Cost; + // With CSSPGO, the preinliner in llvm-profgen can estimate global inline + // decisions based on hotness as well as accurate function byte sizes for + // given context using function/inlinee sizes from previous build. It + // stores the decision in profile, and also adjust/merge context profile + // aiming at better context-sensitive post-inline profile quality, assuming + // all inline decision estimates are going to be honored by compiler. Here + // we replay that inline decision under `sample-profile-use-preinliner`. + // Note that we don't need to handle negative decision from preinliner as + // context profile for not inlined calls are merged by preinliner already. + if (UsePreInlinerDecision && Candidate.CalleeSamples) { + // Once two node are merged due to promotion, we're losing some context + // so the original context-sensitive preinliner decision should be ignored + // for SyntheticContext. 
+ SampleContext &Context = Candidate.CalleeSamples->getContext(); + if (!Context.hasState(SyntheticContext) && + Context.hasAttribute(ContextShouldBeInlined)) + return InlineCost::getAlways("preinliner"); + } + // For old FDO inliner, we inline the call site as long as cost is not // "Never". The cost-benefit check is done earlier. if (!CallsitePrioritizedInline) { @@ -1357,7 +1482,7 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( for (const auto *FS : CalleeSamples) { // TODO: Consider disable pre-lTO ICP for MonoLTO as well if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap, + findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap, PSI->getOrCompHotCountThreshold()); continue; } @@ -1405,8 +1530,9 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( Changed = true; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(Candidate.CalleeSamples, InlinedGUIDs, - SymbolMap, PSI->getOrCompHotCountThreshold()); + findExternalInlineCandidate(I, findCalleeFunctionSamples(*I), + InlinedGUIDs, SymbolMap, + PSI->getOrCompHotCountThreshold()); } } @@ -1494,7 +1620,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { {static_cast<uint32_t>(BlockWeights[BB])})); } } - } else if (OverwriteExistingWeights) { + } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { // Set profile metadata (possibly annotated by LTO prelink) to zero or // clear it for cold code. for (auto &I : BB->getInstList()) { @@ -1792,11 +1918,13 @@ bool SampleProfileLoader::doInitialization(Module &M, } if (FAM && !ProfileInlineReplayFile.empty()) { - ExternalInlineAdvisor = std::make_unique<ReplayInlineAdvisor>( - M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, ProfileInlineReplayFile, + ExternalInlineAdvisor = getReplayInlineAdvisor( + M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, + ReplayInlinerSettings{ProfileInlineReplayFile, + ProfileInlineReplayScope, + ProfileInlineReplayFallback, + {ProfileInlineReplayFormat}}, /*EmitRemarks=*/false); - if (!ExternalInlineAdvisor->areReplayRemarksLoaded()) - ExternalInlineAdvisor.reset(); } // Apply tweaks if context-sensitive profile is available. @@ -1810,13 +1938,21 @@ bool SampleProfileLoader::doInitialization(Module &M, if (!CallsitePrioritizedInline.getNumOccurrences()) CallsitePrioritizedInline = true; + // For CSSPGO, use preinliner decision by default when available. + if (!UsePreInlinerDecision.getNumOccurrences()) + UsePreInlinerDecision = true; + + // For CSSPGO, we also allow recursive inline to best use context profile. + if (!AllowRecursiveInline.getNumOccurrences()) + AllowRecursiveInline = true; + // Enable iterative-BFI by default for CSSPGO. if (!UseIterativeBFIInference.getNumOccurrences()) UseIterativeBFIInference = true; // Tracker for profiles under different context - ContextTracker = - std::make_unique<SampleContextTracker>(Reader->getProfiles()); + ContextTracker = std::make_unique<SampleContextTracker>( + Reader->getProfiles(), &GUIDToFuncNameMap); } // Load pseudo probe descriptors for probe-based function samples. 
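Note on the doInitialization hunk above: for CSSPGO input the pass flips several option defaults (UsePreInlinerDecision, AllowRecursiveInline, UseIterativeBFIInference) only when cl::opt::getNumOccurrences() shows the user never passed the flag, so explicit command-line values still win. A minimal sketch of that pattern follows, assuming LLVM headers are available; the flag name and helper function are hypothetical, used only to illustrate the idiom.

// Minimal sketch of "flip a default only when the user did not set the flag".
// ExampleFlag and applyContextSensitiveDefaults are illustrative names.
#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> ExampleFlag(
    "example-flag", llvm::cl::init(false),
    llvm::cl::desc("Hypothetical flag illustrating the default-override pattern."));

void applyContextSensitiveDefaults(bool ProfileIsCS) {
  if (!ProfileIsCS)
    return;
  // getNumOccurrences() is zero only when the flag was never spelled on the
  // command line, so an explicit -example-flag=0 still takes precedence.
  if (!ExampleFlag.getNumOccurrences())
    ExampleFlag = true;
}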
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index 08d316337ef5..21395460bccb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -415,9 +415,7 @@ void PseudoProbeUpdatePass::runOnFunction(Function &F, FunctionAnalysisManager &FAM) { BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); auto BBProfileCount = [&BFI](BasicBlock *BB) { - return BFI.getBlockProfileCount(BB) - ? BFI.getBlockProfileCount(BB).getValue() - : 0; + return BFI.getBlockProfileCount(BB).getValueOr(0); }; // Collect the sum of execution weight for each probe. diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 655a7a404951..0f2412dce1c9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -30,23 +30,20 @@ static bool stripDeadPrototypes(Module &M) { bool MadeChange = false; // Erase dead function prototypes. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - Function *F = &*I++; + for (Function &F : llvm::make_early_inc_range(M)) { // Function must be a prototype and unused. - if (F->isDeclaration() && F->use_empty()) { - F->eraseFromParent(); + if (F.isDeclaration() && F.use_empty()) { + F.eraseFromParent(); ++NumDeadPrototypes; MadeChange = true; } } // Erase dead global var prototypes. - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ) { - GlobalVariable *GV = &*I++; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { // Global must be a prototype and unused. - if (GV->isDeclaration() && GV->use_empty()) - GV->eraseFromParent(); + if (GV.isDeclaration() && GV.use_empty()) + GV.eraseFromParent(); } // Return an indication of whether we changed anything or not. 
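Note on the StripDeadPrototypes.cpp change above: the manual iterator-bumping loops are replaced with llvm::make_early_inc_range, which advances the iterator before the loop body runs so the current element can be erased without invalidating the traversal. A small self-contained sketch of the same idiom follows, assuming LLVM headers are available; the helper name is illustrative, not an LLVM API.

// Self-contained sketch of erase-while-iterating with make_early_inc_range.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Module.h"

static bool eraseUnusedDeclarations(llvm::Module &M) {
  bool Changed = false;
  // make_early_inc_range advances past F before the body runs, so erasing F
  // here does not invalidate the loop.
  for (llvm::Function &F : llvm::make_early_inc_range(M)) {
    if (F.isDeclaration() && F.use_empty()) {
      F.eraseFromParent();
      Changed = true;
    }
  }
  return Changed;
}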
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp index 168740a1158e..9d4e9464f361 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -214,13 +214,13 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) { findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues); for (GlobalVariable &GV : M.globals()) { - if (GV.hasLocalLinkage() && llvmUsedValues.count(&GV) == 0) + if (GV.hasLocalLinkage() && !llvmUsedValues.contains(&GV)) if (!PreserveDbgInfo || !GV.getName().startswith("llvm.dbg")) GV.setName(""); // Internal symbols can't participate in linkage } for (Function &I : M) { - if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0) + if (I.hasLocalLinkage() && !llvmUsedValues.contains(&I)) if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg")) I.setName(""); // Internal symbols can't participate in linkage if (auto *Symtab = I.getValueSymbolTable()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index eea848d3eb2f..0cc1b37844f6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -164,8 +164,7 @@ void simplifyExternals(Module &M) { FunctionType *EmptyFT = FunctionType::get(Type::getVoidTy(M.getContext()), false); - for (auto I = M.begin(), E = M.end(); I != E;) { - Function &F = *I++; + for (Function &F : llvm::make_early_inc_range(M)) { if (F.isDeclaration() && F.use_empty()) { F.eraseFromParent(); continue; @@ -181,16 +180,15 @@ void simplifyExternals(Module &M) { F.getAddressSpace(), "", &M); NewF->copyAttributesFrom(&F); // Only copy function attribtues. - NewF->setAttributes( - AttributeList::get(M.getContext(), AttributeList::FunctionIndex, - F.getAttributes().getFnAttributes())); + NewF->setAttributes(AttributeList::get(M.getContext(), + AttributeList::FunctionIndex, + F.getAttributes().getFnAttrs())); NewF->takeName(&F); F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType())); F.eraseFromParent(); } - for (auto I = M.global_begin(), E = M.global_end(); I != E;) { - GlobalVariable &GV = *I++; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { if (GV.isDeclaration() && GV.use_empty()) { GV.eraseFromParent(); continue; @@ -325,7 +323,8 @@ void splitAndWriteThinLTOBitcode( return true; if (auto *F = dyn_cast<Function>(GV)) return EligibleVirtualFns.count(F); - if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + if (auto *GVar = + dyn_cast_or_null<GlobalVariable>(GV->getAliaseeObject())) return HasTypeMetadata(GVar); return false; })); @@ -354,7 +353,7 @@ void splitAndWriteThinLTOBitcode( // Remove all globals with type metadata, globals with comdats that live in // MergedM, and aliases pointing to such globals from the thin LTO module. 
filterModule(&M, [&](const GlobalValue *GV) { - if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getAliaseeObject())) if (HasTypeMetadata(GVar)) return false; if (const auto *C = GV->getComdat()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 7a8946110785..61054e7ae46f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1288,7 +1288,7 @@ void DevirtModule::tryICallBranchFunnel( M.getDataLayout().getProgramAddressSpace(), "branch_funnel", &M); } - JT->addAttribute(1, Attribute::Nest); + JT->addParamAttr(0, Attribute::Nest); std::vector<Value *> JTArgs; JTArgs.push_back(JT->arg_begin()); @@ -1361,10 +1361,10 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, M.getContext(), ArrayRef<Attribute>{Attribute::get( M.getContext(), Attribute::Nest)})); for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I) - NewArgAttrs.push_back(Attrs.getParamAttributes(I)); + NewArgAttrs.push_back(Attrs.getParamAttrs(I)); NewCS->setAttributes( - AttributeList::get(M.getContext(), Attrs.getFnAttributes(), - Attrs.getRetAttributes(), NewArgAttrs)); + AttributeList::get(M.getContext(), Attrs.getFnAttrs(), + Attrs.getRetAttrs(), NewArgAttrs)); CB.replaceAllUsesWith(NewCS); CB.eraseFromParent(); @@ -1786,10 +1786,8 @@ void DevirtModule::scanTypeTestUsers( // points to a member of the type identifier %md. Group calls by (type ID, // offset) pair (effectively the identity of the virtual function) and store // to CallSlots. - for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end(); - I != E;) { - auto CI = dyn_cast<CallInst>(I->getUser()); - ++I; + for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) { + auto *CI = dyn_cast<CallInst>(U.getUser()); if (!CI) continue; @@ -1858,11 +1856,8 @@ void DevirtModule::scanTypeTestUsers( void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test); - for (auto I = TypeCheckedLoadFunc->use_begin(), - E = TypeCheckedLoadFunc->use_end(); - I != E;) { - auto CI = dyn_cast<CallInst>(I->getUser()); - ++I; + for (Use &U : llvm::make_early_inc_range(TypeCheckedLoadFunc->uses())) { + auto *CI = dyn_cast<CallInst>(U.getUser()); if (!CI) continue; |