Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/IPO'): 32 files changed, 3598 insertions, 1470 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index 532599b42e0d..01e724e22dcf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -73,8 +73,8 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, }, ORE); assert(OIC); - emitInlinedInto(ORE, CB->getDebugLoc(), CB->getParent(), F, *Caller, - *OIC, false, DEBUG_TYPE); + emitInlinedIntoBasedOnCost(ORE, CB->getDebugLoc(), CB->getParent(), F, + *Caller, *OIC, false, DEBUG_TYPE); InlineFunctionInfo IFI( /*cg=*/nullptr, GetAssumptionCache, &PSI, @@ -108,8 +108,10 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, // Delete the non-comdat ones from the module and also from our vector. auto NonComdatBegin = partition( InlinedFunctions, [&](Function *F) { return F->hasComdat(); }); - for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end())) + for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end())) { M.getFunctionList().erase(F); + Changed = true; + } InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end()); if (!InlinedFunctions.empty()) { @@ -117,8 +119,10 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, // are not actually dead. filterDeadComdatFunctions(M, InlinedFunctions); // The remaining functions are actually dead. - for (Function *F : InlinedFunctions) + for (Function *F : InlinedFunctions) { M.getFunctionList().erase(F); + Changed = true; + } } return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index f670a101767e..93bb11433775 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -148,7 +148,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, } else if (!ArgsToPromote.count(&*I)) { // Unchanged argument Params.push_back(I->getType()); - ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(PAL.getParamAttrs(ArgNo)); } else if (I->use_empty()) { // Dead argument (which are always marked as promotable) ++NumArgumentsDead; @@ -177,9 +177,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Since loads will only have a single operand, and GEPs only a single // non-index operand, this will record direct loads without any indices, // and gep+loads with the GEP indices. - for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end(); - II != IE; ++II) - Indices.push_back(cast<ConstantInt>(*II)->getSExtValue()); + for (const Use &I : llvm::drop_begin(UI->operands())) + Indices.push_back(cast<ConstantInt>(I)->getSExtValue()); // GEPs with a single 0 index can be merged with direct loads if (Indices.size() == 1 && Indices.front() == 0) Indices.clear(); @@ -231,8 +230,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Recompute the parameter attributes list based on the new arguments for // the function. 
- NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(), - PAL.getRetAttributes(), ArgAttrVec)); + NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(), + PAL.getRetAttrs(), ArgAttrVec)); ArgAttrVec.clear(); F->getParent()->getFunctionList().insert(F->getIterator(), NF); @@ -257,7 +256,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, ++I, ++AI, ++ArgNo) if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { Args.push_back(*AI); // Unmodified argument - ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo)); } else if (ByValArgsToTransform.count(&*I)) { // Emit a GEP and load for each element of the struct. Type *AgTy = I->getParamByValType(); @@ -313,9 +312,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val"); newLoad->setAlignment(OrigLoad->getAlign()); // Transfer the AA info too. - AAMDNodes AAInfo; - OrigLoad->getAAMetadata(AAInfo); - newLoad->setAAMetadata(AAInfo); + newLoad->setAAMetadata(OrigLoad->getAAMetadata()); Args.push_back(newLoad); ArgAttrVec.push_back(AttributeSet()); @@ -325,7 +322,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Push any varargs arguments on the list. for (; AI != CB.arg_end(); ++AI, ++ArgNo) { Args.push_back(*AI); - ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo)); } SmallVector<OperandBundleDef, 1> OpBundles; @@ -341,9 +338,9 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, NewCS = NewCall; } NewCS->setCallingConv(CB.getCallingConv()); - NewCS->setAttributes( - AttributeList::get(F->getContext(), CallPAL.getFnAttributes(), - CallPAL.getRetAttributes(), ArgAttrVec)); + NewCS->setAttributes(AttributeList::get(F->getContext(), + CallPAL.getFnAttrs(), + CallPAL.getRetAttrs(), ArgAttrVec)); NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg}); Args.clear(); ArgAttrVec.clear(); @@ -1018,11 +1015,12 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, do { LocalChange = false; + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); + for (LazyCallGraph::Node &N : C) { Function &OldF = N.getFunction(); - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); // FIXME: This lambda must only be used with this function. We should // skip the lambda and just get the AA results directly. auto AARGetter = [&](Function &F) -> AAResults & { @@ -1045,6 +1043,13 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, C.getOuterRefSCC().replaceNodeFunction(N, *NewF); FAM.clear(OldF, OldF.getName()); OldF.eraseFromParent(); + + PreservedAnalyses FuncPA; + FuncPA.preserveSet<CFGAnalyses>(); + for (auto *U : NewF->users()) { + auto *UserF = cast<CallBase>(U)->getFunction(); + FAM.invalidate(*UserF, FuncPA); + } } Changed |= LocalChange; @@ -1053,7 +1058,12 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, if (!Changed) return PreservedAnalyses::all(); - return PreservedAnalyses::none(); + PreservedAnalyses PA; + // We've cleared out analyses for deleted functions. + PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); + // We've manually invalidated analyses for functions we've modified. 
+ PA.preserveSet<AllAnalysesOn<Function>>(); + return PA; } namespace { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp index 91b16ec66ee3..edadc79e3a9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp @@ -382,30 +382,30 @@ static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, if (Attr.isEnumAttribute()) { Attribute::AttrKind Kind = Attr.getKindAsEnum(); - if (Attrs.hasAttribute(AttrIdx, Kind)) + if (Attrs.hasAttributeAtIndex(AttrIdx, Kind)) if (!ForceReplace && - isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind))) return false; - Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr); return true; } if (Attr.isStringAttribute()) { StringRef Kind = Attr.getKindAsString(); - if (Attrs.hasAttribute(AttrIdx, Kind)) + if (Attrs.hasAttributeAtIndex(AttrIdx, Kind)) if (!ForceReplace && - isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind))) return false; - Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr); return true; } if (Attr.isIntAttribute()) { Attribute::AttrKind Kind = Attr.getKindAsEnum(); - if (Attrs.hasAttribute(AttrIdx, Kind)) + if (Attrs.hasAttributeAtIndex(AttrIdx, Kind)) if (!ForceReplace && - isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind))) return false; - Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind); - Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + Attrs = Attrs.removeAttributeAtIndex(Ctx, AttrIdx, Kind); + Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr); return true; } @@ -658,9 +658,9 @@ bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK, else AttrList = getAssociatedFunction()->getAttributes(); - bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK); + bool HasAttr = AttrList.hasAttributeAtIndex(getAttrIdx(), AK); if (HasAttr) - Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK)); + Attrs.push_back(AttrList.getAttributeAtIndex(getAttrIdx(), AK)); return HasAttr; } @@ -1043,6 +1043,8 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred, if (auto *SI = dyn_cast<StoreInst>(U->getUser())) { if (&SI->getOperandUse(0) == U) { + if (!Visited.insert(U).second) + continue; SmallSetVector<Value *, 4> PotentialCopies; if (AA::getPotentialCopiesOfStoredValue(*this, *SI, PotentialCopies, QueryingAA, @@ -1121,6 +1123,10 @@ bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred, if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) { if (CE->isCast() && CE->getType()->isPointerTy() && CE->getType()->getPointerElementType()->isFunctionTy()) { + LLVM_DEBUG( + dbgs() << "[Attributor] Use, is constant cast expression, add " + << CE->getNumUses() + << " uses of that expression instead!\n"); for (const Use &CEU : CE->uses()) Uses.push_back(&CEU); continue; @@ -1141,9 +1147,13 @@ bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred, const Use *EffectiveUse = ACS.isCallbackCall() ? 
&ACS.getCalleeUseForCallback() : &U; if (!ACS.isCallee(EffectiveUse)) { - if (!RequireAllCallSites) + if (!RequireAllCallSites) { + LLVM_DEBUG(dbgs() << "[Attributor] User " << *EffectiveUse->getUser() + << " is not a call of " << Fn.getName() + << ", skip use\n"); continue; - LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser() + } + LLVM_DEBUG(dbgs() << "[Attributor] User " << *EffectiveUse->getUser() << " is an invalid use of " << Fn.getName() << "\n"); return false; } @@ -1413,6 +1423,16 @@ void Attributor::runTillFixpoint() { } while (!Worklist.empty() && (IterationCounter++ < MaxFixedPointIterations || VerifyMaxFixpointIterations)); + if (IterationCounter > MaxFixedPointIterations && !Worklist.empty()) { + auto Remark = [&](OptimizationRemarkMissed ORM) { + return ORM << "Attributor did not reach a fixpoint after " + << ore::NV("Iterations", MaxFixedPointIterations) + << " iterations."; + }; + Function *F = Worklist.front()->getIRPosition().getAssociatedFunction(); + emitRemark<OptimizationRemarkMissed>(F, "FixedPoint", Remark); + } + LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: " << IterationCounter << "/" << MaxFixpointIterations << " iterations\n"); @@ -1922,7 +1942,7 @@ void Attributor::createShallowWrapper(Function &F) { CallInst *CI = CallInst::Create(&F, Args, "", EntryBB); CI->setTailCall(true); - CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline); + CI->addFnAttr(Attribute::NoInline); ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB); NumFnShallowWrappersCreated++; @@ -2015,7 +2035,8 @@ bool Attributor::isValidFunctionSignatureRewrite( if (!RewriteSignatures) return false; - auto CallSiteCanBeChanged = [](AbstractCallSite ACS) { + Function *Fn = Arg.getParent(); + auto CallSiteCanBeChanged = [Fn](AbstractCallSite ACS) { // Forbid the call site to cast the function return type. If we need to // rewrite these functions we need to re-create a cast for the new call site // (if the old had uses). @@ -2023,11 +2044,12 @@ bool Attributor::isValidFunctionSignatureRewrite( ACS.getInstruction()->getType() != ACS.getCalledFunction()->getReturnType()) return false; + if (ACS.getCalledOperand()->getType() != Fn->getType()) + return false; // Forbid must-tail calls for now. return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall(); }; - Function *Fn = Arg.getParent(); // Avoid var-arg functions for now. if (Fn->isVarArg()) { LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n"); @@ -2157,7 +2179,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } else { NewArgumentTypes.push_back(Arg.getType()); NewArgumentAttributes.push_back( - OldFnAttributeList.getParamAttributes(Arg.getArgNo())); + OldFnAttributeList.getParamAttrs(Arg.getArgNo())); } } @@ -2188,8 +2210,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( // the function. 
LLVMContext &Ctx = OldFn->getContext(); NewFn->setAttributes(AttributeList::get( - Ctx, OldFnAttributeList.getFnAttributes(), - OldFnAttributeList.getRetAttributes(), NewArgumentAttributes)); + Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(), + NewArgumentAttributes)); // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the @@ -2234,7 +2256,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } else { NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum)); NewArgOperandAttributes.push_back( - OldCallAttributeList.getParamAttributes(OldArgNum)); + OldCallAttributeList.getParamAttrs(OldArgNum)); } } @@ -2264,8 +2286,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( NewCB->setCallingConv(OldCB->getCallingConv()); NewCB->takeName(OldCB); NewCB->setAttributes(AttributeList::get( - Ctx, OldCallAttributeList.getFnAttributes(), - OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes)); + Ctx, OldCallAttributeList.getFnAttrs(), + OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes)); CallSitePairs.push_back({OldCB, NewCB}); return true; @@ -2480,6 +2502,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Every function can be "readnone/argmemonly/inaccessiblememonly/...". getOrCreateAAFor<AAMemoryLocation>(FPos); + // Every function can track active assumptions. + getOrCreateAAFor<AAAssumptionInfo>(FPos); + // Every function might be applicable for Heap-To-Stack conversion. if (EnableHeapToStack) getOrCreateAAFor<AAHeapToStack>(FPos); @@ -2565,6 +2590,7 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { auto CallSitePred = [&](Instruction &I) -> bool { auto &CB = cast<CallBase>(I); IRPosition CBRetPos = IRPosition::callsite_returned(CB); + IRPosition CBFnPos = IRPosition::callsite_function(CB); // Call sites might be dead if they do not have side effects and no live // users. The return value might be dead if there are no live users. @@ -2576,6 +2602,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { if (!Callee) return true; + // Every call site can track active assumptions. + getOrCreateAAFor<AAAssumptionInfo>(CBFnPos); + // Skip declarations except if annotations on their call sites were // explicitly requested. 
if (!AnnotateDeclarationCallSites && Callee->isDeclaration() && @@ -2588,7 +2617,7 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { getOrCreateAAFor<AAValueSimplify>(CBRetPos); } - for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) { + for (int I = 0, E = CB.arg_size(); I < E; ++I) { IRPosition CBArgPos = IRPosition::callsite_argument(CB, I); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 3529923a9082..ec08287393de 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -28,6 +29,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Assumptions.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" @@ -146,6 +148,7 @@ PIPE_OPERATOR(AANoUndef) PIPE_OPERATOR(AACallEdges) PIPE_OPERATOR(AAFunctionReachability) PIPE_OPERATOR(AAPointerInfo) +PIPE_OPERATOR(AAAssumptionInfo) #undef PIPE_OPERATOR @@ -203,46 +206,25 @@ static Value *constructPointer(Type *ResTy, Type *PtrElemTy, Value *Ptr, << "-bytes as " << *ResTy << "\n"); if (Offset) { - SmallVector<Value *, 4> Indices; - std::string GEPName = Ptr->getName().str() + ".0"; - - // Add 0 index to look through the pointer. - assert((uint64_t)Offset < DL.getTypeAllocSize(PtrElemTy) && - "Offset out of bounds"); - Indices.push_back(Constant::getNullValue(IRB.getInt32Ty())); - Type *Ty = PtrElemTy; - do { - auto *STy = dyn_cast<StructType>(Ty); - if (!STy) - // Non-aggregate type, we cast and make byte-wise progress now. - break; - - const StructLayout *SL = DL.getStructLayout(STy); - if (int64_t(SL->getSizeInBytes()) < Offset) - break; - - uint64_t Idx = SL->getElementContainingOffset(Offset); - assert(Idx < STy->getNumElements() && "Offset calculation error!"); - uint64_t Rem = Offset - SL->getElementOffset(Idx); - Ty = STy->getElementType(Idx); - - LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset - << " Idx: " << Idx << " Rem: " << Rem << "\n"); + APInt IntOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset); + SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(Ty, IntOffset); - GEPName += "." + std::to_string(Idx); - Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx)); - Offset = Rem; - } while (Offset); + SmallVector<Value *, 4> ValIndices; + std::string GEPName = Ptr->getName().str(); + for (const APInt &Index : IntIndices) { + ValIndices.push_back(IRB.getInt(Index)); + GEPName += "." + std::to_string(Index.getZExtValue()); + } // Create a GEP for the indices collected above. - Ptr = IRB.CreateGEP(PtrElemTy, Ptr, Indices, GEPName); + Ptr = IRB.CreateGEP(PtrElemTy, Ptr, ValIndices, GEPName); // If an offset is left we use byte-wise adjustment. 
- if (Offset) { + if (IntOffset != 0) { Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy()); - Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt32(Offset), - GEPName + ".b" + Twine(Offset)); + Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(IntOffset), + GEPName + ".b" + Twine(IntOffset.getZExtValue())); } } @@ -431,6 +413,7 @@ const Value *stripAndAccumulateMinimalOffsets( }; return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds, + /* AllowInvariant */ false, AttributorAnalysis); } @@ -503,6 +486,7 @@ static void clampReturnedValueStates( S ^= *T; } +namespace { /// Helper class for generic deduction: return value -> returned position. template <typename AAType, typename BaseType, typename StateType = typename BaseType::StateType, @@ -661,6 +645,7 @@ struct AACallSiteReturnedFromReturned : public BaseType { return clampStateAndIndicateChange(S, AA.getState()); } }; +} // namespace /// Helper function to accumulate uses. template <class AAType, typename StateType = typename AAType::StateType> @@ -1051,6 +1036,7 @@ private: BooleanState BS; }; +namespace { struct AAPointerInfoImpl : public StateWrapper<AA::PointerInfo::State, AAPointerInfo> { using BaseTy = StateWrapper<AA::PointerInfo::State, AAPointerInfo>; @@ -1207,7 +1193,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { } SmallVector<Value *, 8> Indices; - for (Use &Idx : llvm::make_range(GEP->idx_begin(), GEP->idx_end())) { + for (Use &Idx : GEP->indices()) { if (auto *CIdx = dyn_cast<ConstantInt>(Idx)) { Indices.push_back(CIdx); continue; @@ -1244,7 +1230,11 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { } // Check if the PHI operand is not dependent on the PHI itself. - APInt Offset(DL.getIndexTypeSizeInBits(AssociatedValue.getType()), 0); + // TODO: This is not great as we look at the pointer type. However, it + // is unclear where the Offset size comes from with typeless pointers. + APInt Offset( + DL.getIndexSizeInBits(CurPtr->getType()->getPointerAddressSpace()), + 0); if (&AssociatedValue == CurPtr->stripAndAccumulateConstantOffsets( DL, Offset, /* AllowNonInbounds */ true)) { if (Offset != PtrOI.Offset) { @@ -2432,6 +2422,10 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { const size_t NoUBPrevSize = AssumedNoUBInsts.size(); auto InspectMemAccessInstForUB = [&](Instruction &I) { + // Lang ref now states volatile store is not UB, let's skip them. + if (I.isVolatile() && I.mayWriteToMemory()) + return true; + // Skip instructions that are already saved. if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) return true; @@ -2511,7 +2505,7 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { Function *Callee = CB.getCalledFunction(); if (!Callee) return true; - for (unsigned idx = 0; idx < CB.getNumArgOperands(); idx++) { + for (unsigned idx = 0; idx < CB.arg_size(); idx++) { // If current argument is known to be simplified to null pointer and the // corresponding argument position is known to have nonnull attribute, // the argument is poison. Furthermore, if the argument is poison and @@ -3179,8 +3173,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // value passed at this call site. 
// TODO: AbstractCallSite const auto &CB = cast<CallBase>(getAnchorValue()); - for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands(); - OtherArgNo++) + for (unsigned OtherArgNo = 0; OtherArgNo < CB.arg_size(); OtherArgNo++) if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo)) return false; @@ -3398,6 +3391,10 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl { } bool isDeadStore(Attributor &A, StoreInst &SI) { + // Lang ref now states volatile store is not UB/dead, let's skip them. + if (SI.isVolatile()) + return false; + bool UsedAssumedInformation = false; SmallSetVector<Value *, 4> PotentialCopies; if (!AA::getPotentialCopiesOfStoredValue(A, SI, PotentialCopies, *this, @@ -5083,6 +5080,7 @@ struct AANoCaptureCallSiteReturned final : AANoCaptureImpl { STATS_DECLTRACK_CSRET_ATTR(nocapture) } }; +} // namespace /// ------------------ Value Simplify Attribute ---------------------------- @@ -5103,6 +5101,7 @@ bool ValueSimplifyStateType::unionAssumed(Optional<Value *> Other) { return true; } +namespace { struct AAValueSimplifyImpl : AAValueSimplify { AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A) : AAValueSimplify(IRP, A) {} @@ -6508,7 +6507,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) { CallBase *DC = cast<CallBase>(ACS.getInstruction()); int DCArgNo = ACS.getCallArgOperandNo(ArgNo); - assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() && + assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->arg_size() && "Expected a direct call operand for callback call operand"); LLVM_DEBUG({ @@ -7331,10 +7330,12 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use &U, case Instruction::Store: // Stores cause the NO_WRITES property to disappear if the use is the - // pointer operand. Note that we do assume that capturing was taken care of - // somewhere else. + // pointer operand. Note that while capturing was taken care of somewhere + // else we need to deal with stores of the value that is not looked through. if (cast<StoreInst>(UserI)->getPointerOperand() == U.get()) removeAssumedBits(NO_WRITES); + else + indicatePessimisticFixpoint(); return; case Instruction::Call: @@ -7380,6 +7381,7 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use &U, if (UserI->mayWriteToMemory()) removeAssumedBits(NO_WRITES); } +} // namespace /// -------------------- Memory Locations Attributes --------------------------- /// Includes read-none, argmemonly, inaccessiblememonly, @@ -7672,11 +7674,14 @@ void AAMemoryLocationImpl::categorizePtrValue( assert(!isa<GEPOperator>(Obj) && "GEPs should have been stripped."); if (isa<UndefValue>(Obj)) continue; - if (auto *Arg = dyn_cast<Argument>(Obj)) { - if (Arg->hasByValAttr()) - MLK = NO_LOCAL_MEM; - else - MLK = NO_ARGUMENT_MEM; + if (isa<Argument>(Obj)) { + // TODO: For now we do not treat byval arguments as local copies performed + // on the call edge, though, we should. To make that happen we need to + // teach various passes, e.g., DSE, about the copy effect of a byval. That + // would also allow us to mark functions only accessing byval arguments as + // readnone again, atguably their acceses have no effect outside of the + // function, like accesses to allocas. + MLK = NO_ARGUMENT_MEM; } else if (auto *GV = dyn_cast<GlobalValue>(Obj)) { // Reading constant memory is not treated as a read "effect" by the // function attr pass so we won't neither. 
Constants defined by TBAA are @@ -7722,7 +7727,7 @@ void AAMemoryLocationImpl::categorizePtrValue( void AAMemoryLocationImpl::categorizeArgumentPointerLocations( Attributor &A, CallBase &CB, AAMemoryLocation::StateType &AccessedLocs, bool &Changed) { - for (unsigned ArgNo = 0, E = CB.getNumArgOperands(); ArgNo < E; ++ArgNo) { + for (unsigned ArgNo = 0, E = CB.arg_size(); ArgNo < E; ++ArgNo) { // Skip non-pointer arguments. const Value *ArgOp = CB.getArgOperand(ArgNo); @@ -8655,31 +8660,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { static bool calculateICmpInst(const ICmpInst *ICI, const APInt &LHS, const APInt &RHS) { - ICmpInst::Predicate Pred = ICI->getPredicate(); - switch (Pred) { - case ICmpInst::ICMP_UGT: - return LHS.ugt(RHS); - case ICmpInst::ICMP_SGT: - return LHS.sgt(RHS); - case ICmpInst::ICMP_EQ: - return LHS.eq(RHS); - case ICmpInst::ICMP_UGE: - return LHS.uge(RHS); - case ICmpInst::ICMP_SGE: - return LHS.sge(RHS); - case ICmpInst::ICMP_ULT: - return LHS.ult(RHS); - case ICmpInst::ICMP_SLT: - return LHS.slt(RHS); - case ICmpInst::ICMP_NE: - return LHS.ne(RHS); - case ICmpInst::ICMP_ULE: - return LHS.ule(RHS); - case ICmpInst::ICMP_SLE: - return LHS.sle(RHS); - default: - llvm_unreachable("Invalid ICmp predicate!"); - } + return ICmpInst::compare(LHS, RHS, ICI->getPredicate()); } static APInt calculateCastInst(const CastInst *CI, const APInt &Src, @@ -8719,25 +8700,25 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { case Instruction::Mul: return LHS * RHS; case Instruction::UDiv: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } return LHS.udiv(RHS); case Instruction::SDiv: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } return LHS.sdiv(RHS); case Instruction::URem: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } return LHS.urem(RHS); case Instruction::SRem: - if (RHS.isNullValue()) { + if (RHS.isZero()) { SkipOperation = true; return LHS; } @@ -9336,32 +9317,69 @@ struct AANoUndefCallSiteReturned final void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noundef) } }; -struct AACallEdgesFunction : public AACallEdges { - AACallEdgesFunction(const IRPosition &IRP, Attributor &A) - : AACallEdges(IRP, A) {} +struct AACallEdgesImpl : public AACallEdges { + AACallEdgesImpl(const IRPosition &IRP, Attributor &A) : AACallEdges(IRP, A) {} + virtual const SetVector<Function *> &getOptimisticEdges() const override { + return CalledFunctions; + } + + virtual bool hasUnknownCallee() const override { return HasUnknownCallee; } + + virtual bool hasNonAsmUnknownCallee() const override { + return HasUnknownCalleeNonAsm; + } + + const std::string getAsStr() const override { + return "CallEdges[" + std::to_string(HasUnknownCallee) + "," + + std::to_string(CalledFunctions.size()) + "]"; + } + + void trackStatistics() const override {} + +protected: + void addCalledFunction(Function *Fn, ChangeStatus &Change) { + if (CalledFunctions.insert(Fn)) { + Change = ChangeStatus::CHANGED; + LLVM_DEBUG(dbgs() << "[AACallEdges] New call edge: " << Fn->getName() + << "\n"); + } + } + + void setHasUnknownCallee(bool NonAsm, ChangeStatus &Change) { + if (!HasUnknownCallee) + Change = ChangeStatus::CHANGED; + if (NonAsm && !HasUnknownCalleeNonAsm) + Change = ChangeStatus::CHANGED; + HasUnknownCalleeNonAsm |= NonAsm; + HasUnknownCallee = true; + } + +private: + /// Optimistic set of functions that might be called by this position. 
+ SetVector<Function *> CalledFunctions; + + /// Is there any call with a unknown callee. + bool HasUnknownCallee = false; + + /// Is there any call with a unknown callee, excluding any inline asm. + bool HasUnknownCalleeNonAsm = false; +}; + +struct AACallEdgesCallSite : public AACallEdgesImpl { + AACallEdgesCallSite(const IRPosition &IRP, Attributor &A) + : AACallEdgesImpl(IRP, A) {} /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Change = ChangeStatus::UNCHANGED; - bool OldHasUnknownCallee = HasUnknownCallee; - bool OldHasUnknownCalleeNonAsm = HasUnknownCalleeNonAsm; - - auto AddCalledFunction = [&](Function *Fn) { - if (CalledFunctions.insert(Fn)) { - Change = ChangeStatus::CHANGED; - LLVM_DEBUG(dbgs() << "[AACallEdges] New call edge: " << Fn->getName() - << "\n"); - } - }; auto VisitValue = [&](Value &V, const Instruction *CtxI, bool &HasUnknown, bool Stripped) -> bool { if (Function *Fn = dyn_cast<Function>(&V)) { - AddCalledFunction(Fn); + addCalledFunction(Fn, Change); } else { LLVM_DEBUG(dbgs() << "[AACallEdges] Unrecognized value: " << V << "\n"); - HasUnknown = true; - HasUnknownCalleeNonAsm = true; + setHasUnknownCallee(true, Change); } // Explore all values. @@ -9369,44 +9387,67 @@ struct AACallEdgesFunction : public AACallEdges { }; // Process any value that we might call. - auto ProcessCalledOperand = [&](Value *V, Instruction *Ctx) { + auto ProcessCalledOperand = [&](Value *V) { + bool DummyValue = false; if (!genericValueTraversal<bool>(A, IRPosition::value(*V), *this, - HasUnknownCallee, VisitValue, nullptr, + DummyValue, VisitValue, nullptr, false)) { // If we haven't gone through all values, assume that there are unknown // callees. - HasUnknownCallee = true; - HasUnknownCalleeNonAsm = true; + setHasUnknownCallee(true, Change); } }; - auto ProcessCallInst = [&](Instruction &Inst) { - CallBase &CB = static_cast<CallBase &>(Inst); - if (CB.isInlineAsm()) { - HasUnknownCallee = true; - return true; - } + CallBase *CB = static_cast<CallBase *>(getCtxI()); - // Process callee metadata if available. - if (auto *MD = Inst.getMetadata(LLVMContext::MD_callees)) { - for (auto &Op : MD->operands()) { - Function *Callee = mdconst::extract_or_null<Function>(Op); - if (Callee) - AddCalledFunction(Callee); - } - // Callees metadata grantees that the called function is one of its - // operands, So we are done. - return true; + if (CB->isInlineAsm()) { + setHasUnknownCallee(false, Change); + return Change; + } + + // Process callee metadata if available. + if (auto *MD = getCtxI()->getMetadata(LLVMContext::MD_callees)) { + for (auto &Op : MD->operands()) { + Function *Callee = mdconst::dyn_extract_or_null<Function>(Op); + if (Callee) + addCalledFunction(Callee, Change); } + return Change; + } - // The most simple case. - ProcessCalledOperand(CB.getCalledOperand(), &Inst); + // The most simple case. + ProcessCalledOperand(CB->getCalledOperand()); - // Process callback functions. - SmallVector<const Use *, 4u> CallbackUses; - AbstractCallSite::getCallbackUses(CB, CallbackUses); - for (const Use *U : CallbackUses) - ProcessCalledOperand(U->get(), &Inst); + // Process callback functions. 
+ SmallVector<const Use *, 4u> CallbackUses; + AbstractCallSite::getCallbackUses(*CB, CallbackUses); + for (const Use *U : CallbackUses) + ProcessCalledOperand(U->get()); + + return Change; + } +}; + +struct AACallEdgesFunction : public AACallEdgesImpl { + AACallEdgesFunction(const IRPosition &IRP, Attributor &A) + : AACallEdgesImpl(IRP, A) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + auto ProcessCallInst = [&](Instruction &Inst) { + CallBase &CB = static_cast<CallBase &>(Inst); + + auto &CBEdges = A.getAAFor<AACallEdges>( + *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED); + if (CBEdges.hasNonAsmUnknownCallee()) + setHasUnknownCallee(true, Change); + if (CBEdges.hasUnknownCallee()) + setHasUnknownCallee(false, Change); + + for (Function *F : CBEdges.getOptimisticEdges()) + addCalledFunction(F, Change); return true; }; @@ -9417,155 +9458,323 @@ struct AACallEdgesFunction : public AACallEdges { UsedAssumedInformation)) { // If we haven't looked at all call like instructions, assume that there // are unknown callees. - HasUnknownCallee = true; - HasUnknownCalleeNonAsm = true; + setHasUnknownCallee(true, Change); } - // Track changes. - if (OldHasUnknownCallee != HasUnknownCallee || - OldHasUnknownCalleeNonAsm != HasUnknownCalleeNonAsm) - Change = ChangeStatus::CHANGED; - return Change; } +}; - virtual const SetVector<Function *> &getOptimisticEdges() const override { - return CalledFunctions; - }; +struct AAFunctionReachabilityFunction : public AAFunctionReachability { +private: + struct QuerySet { + void markReachable(Function *Fn) { + Reachable.insert(Fn); + Unreachable.erase(Fn); + } + + ChangeStatus update(Attributor &A, const AAFunctionReachability &AA, + ArrayRef<const AACallEdges *> AAEdgesList) { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + for (auto *AAEdges : AAEdgesList) { + if (AAEdges->hasUnknownCallee()) { + if (!CanReachUnknownCallee) + Change = ChangeStatus::CHANGED; + CanReachUnknownCallee = true; + return Change; + } + } - virtual bool hasUnknownCallee() const override { return HasUnknownCallee; } + for (Function *Fn : make_early_inc_range(Unreachable)) { + if (checkIfReachable(A, AA, AAEdgesList, Fn)) { + Change = ChangeStatus::CHANGED; + markReachable(Fn); + } + } + return Change; + } - virtual bool hasNonAsmUnknownCallee() const override { - return HasUnknownCalleeNonAsm; - } + bool isReachable(Attributor &A, const AAFunctionReachability &AA, + ArrayRef<const AACallEdges *> AAEdgesList, Function *Fn) { + // Assume that we can reach the function. + // TODO: Be more specific with the unknown callee. + if (CanReachUnknownCallee) + return true; - const std::string getAsStr() const override { - return "CallEdges[" + std::to_string(HasUnknownCallee) + "," + - std::to_string(CalledFunctions.size()) + "]"; - } + if (Reachable.count(Fn)) + return true; - void trackStatistics() const override {} + if (Unreachable.count(Fn)) + return false; - /// Optimistic set of functions that might be called by this function. - SetVector<Function *> CalledFunctions; + // We need to assume that this function can't reach Fn to prevent + // an infinite loop if this function is recursive. + Unreachable.insert(Fn); - /// Is there any call with a unknown callee. 
- bool HasUnknownCallee = false; + bool Result = checkIfReachable(A, AA, AAEdgesList, Fn); + if (Result) + markReachable(Fn); + return Result; + } - /// Is there any call with a unknown callee, excluding any inline asm. - bool HasUnknownCalleeNonAsm = false; -}; + bool checkIfReachable(Attributor &A, const AAFunctionReachability &AA, + ArrayRef<const AACallEdges *> AAEdgesList, + Function *Fn) const { -struct AAFunctionReachabilityFunction : public AAFunctionReachability { - AAFunctionReachabilityFunction(const IRPosition &IRP, Attributor &A) - : AAFunctionReachability(IRP, A) {} + // Handle the most trivial case first. + for (auto *AAEdges : AAEdgesList) { + const SetVector<Function *> &Edges = AAEdges->getOptimisticEdges(); - bool canReach(Attributor &A, Function *Fn) const override { - // Assume that we can reach any function if we can reach a call with - // unknown callee. - if (CanReachUnknownCallee) - return true; + if (Edges.count(Fn)) + return true; + } - if (ReachableQueries.count(Fn)) - return true; + SmallVector<const AAFunctionReachability *, 8> Deps; + for (auto &AAEdges : AAEdgesList) { + const SetVector<Function *> &Edges = AAEdges->getOptimisticEdges(); + + for (Function *Edge : Edges) { + // We don't need a dependency if the result is reachable. + const AAFunctionReachability &EdgeReachability = + A.getAAFor<AAFunctionReachability>( + AA, IRPosition::function(*Edge), DepClassTy::NONE); + Deps.push_back(&EdgeReachability); + + if (EdgeReachability.canReach(A, Fn)) + return true; + } + } + + // The result is false for now, set dependencies and leave. + for (auto Dep : Deps) + A.recordDependence(AA, *Dep, DepClassTy::REQUIRED); - if (UnreachableQueries.count(Fn)) return false; + } + + /// Set of functions that we know for sure is reachable. + DenseSet<Function *> Reachable; + + /// Set of functions that are unreachable, but might become reachable. + DenseSet<Function *> Unreachable; + + /// If we can reach a function with a call to a unknown function we assume + /// that we can reach any function. + bool CanReachUnknownCallee = false; + }; +public: + AAFunctionReachabilityFunction(const IRPosition &IRP, Attributor &A) + : AAFunctionReachability(IRP, A) {} + + bool canReach(Attributor &A, Function *Fn) const override { const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::REQUIRED); - const SetVector<Function *> &Edges = AAEdges.getOptimisticEdges(); - bool Result = checkIfReachable(A, Edges, Fn); + // Attributor returns attributes as const, so this function has to be + // const for users of this attribute to use it without having to do + // a const_cast. + // This is a hack for us to be able to cache queries. + auto *NonConstThis = const_cast<AAFunctionReachabilityFunction *>(this); + bool Result = + NonConstThis->WholeFunction.isReachable(A, *this, {&AAEdges}, Fn); + + return Result; + } + + /// Can \p CB reach \p Fn + bool canReach(Attributor &A, CallBase &CB, Function *Fn) const override { + const AACallEdges &AAEdges = A.getAAFor<AACallEdges>( + *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED); // Attributor returns attributes as const, so this function has to be // const for users of this attribute to use it without having to do // a const_cast. // This is a hack for us to be able to cache queries. 
auto *NonConstThis = const_cast<AAFunctionReachabilityFunction *>(this); + QuerySet &CBQuery = NonConstThis->CBQueries[&CB]; - if (Result) - NonConstThis->ReachableQueries.insert(Fn); - else - NonConstThis->UnreachableQueries.insert(Fn); + bool Result = CBQuery.isReachable(A, *this, {&AAEdges}, Fn); return Result; } /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { - if (CanReachUnknownCallee) - return ChangeStatus::UNCHANGED; - const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::REQUIRED); - const SetVector<Function *> &Edges = AAEdges.getOptimisticEdges(); ChangeStatus Change = ChangeStatus::UNCHANGED; - if (AAEdges.hasUnknownCallee()) { - bool OldCanReachUnknown = CanReachUnknownCallee; - CanReachUnknownCallee = true; - return OldCanReachUnknown ? ChangeStatus::UNCHANGED - : ChangeStatus::CHANGED; - } + Change |= WholeFunction.update(A, *this, {&AAEdges}); - // Check if any of the unreachable functions become reachable. - for (auto Current = UnreachableQueries.begin(); - Current != UnreachableQueries.end();) { - if (!checkIfReachable(A, Edges, *Current)) { - Current++; - continue; - } - ReachableQueries.insert(*Current); - UnreachableQueries.erase(*Current++); - Change = ChangeStatus::CHANGED; + for (auto CBPair : CBQueries) { + const AACallEdges &AAEdges = A.getAAFor<AACallEdges>( + *this, IRPosition::callsite_function(*CBPair.first), + DepClassTy::REQUIRED); + + Change |= CBPair.second.update(A, *this, {&AAEdges}); } return Change; } const std::string getAsStr() const override { - size_t QueryCount = ReachableQueries.size() + UnreachableQueries.size(); + size_t QueryCount = + WholeFunction.Reachable.size() + WholeFunction.Unreachable.size(); - return "FunctionReachability [" + std::to_string(ReachableQueries.size()) + - "," + std::to_string(QueryCount) + "]"; + return "FunctionReachability [" + + std::to_string(WholeFunction.Reachable.size()) + "," + + std::to_string(QueryCount) + "]"; } void trackStatistics() const override {} private: - bool canReachUnknownCallee() const override { return CanReachUnknownCallee; } + bool canReachUnknownCallee() const override { + return WholeFunction.CanReachUnknownCallee; + } - bool checkIfReachable(Attributor &A, const SetVector<Function *> &Edges, - Function *Fn) const { - if (Edges.count(Fn)) - return true; + /// Used to answer if a the whole function can reacha a specific function. + QuerySet WholeFunction; - for (Function *Edge : Edges) { - // We don't need a dependency if the result is reachable. - const AAFunctionReachability &EdgeReachability = - A.getAAFor<AAFunctionReachability>(*this, IRPosition::function(*Edge), - DepClassTy::NONE); + /// Used to answer if a call base inside this function can reach a specific + /// function. + DenseMap<CallBase *, QuerySet> CBQueries; +}; - if (EdgeReachability.canReach(A, Fn)) - return true; - } - for (Function *Fn : Edges) - A.getAAFor<AAFunctionReachability>(*this, IRPosition::function(*Fn), - DepClassTy::REQUIRED); +/// ---------------------- Assumption Propagation ------------------------------ +struct AAAssumptionInfoImpl : public AAAssumptionInfo { + AAAssumptionInfoImpl(const IRPosition &IRP, Attributor &A, + const DenseSet<StringRef> &Known) + : AAAssumptionInfo(IRP, A, Known) {} - return false; + bool hasAssumption(const StringRef Assumption) const override { + return isValidState() && setContains(Assumption); } - /// Set of functions that we know for sure is reachable. 
- SmallPtrSet<Function *, 8> ReachableQueries; + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + const SetContents &Known = getKnown(); + const SetContents &Assumed = getAssumed(); + + const std::string KnownStr = + llvm::join(Known.getSet().begin(), Known.getSet().end(), ","); + const std::string AssumedStr = + (Assumed.isUniversal()) + ? "Universal" + : llvm::join(Assumed.getSet().begin(), Assumed.getSet().end(), ","); + + return "Known [" + KnownStr + "]," + " Assumed [" + AssumedStr + "]"; + } +}; + +/// Propagates assumption information from parent functions to all of their +/// successors. An assumption can be propagated if the containing function +/// dominates the called function. +/// +/// We start with a "known" set of assumptions already valid for the associated +/// function and an "assumed" set that initially contains all possible +/// assumptions. The assumed set is inter-procedurally updated by narrowing its +/// contents as concrete values are known. The concrete values are seeded by the +/// first nodes that are either entries into the call graph, or contains no +/// assumptions. Each node is updated as the intersection of the assumed state +/// with all of its predecessors. +struct AAAssumptionInfoFunction final : AAAssumptionInfoImpl { + AAAssumptionInfoFunction(const IRPosition &IRP, Attributor &A) + : AAAssumptionInfoImpl(IRP, A, + getAssumptions(*IRP.getAssociatedFunction())) {} + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + const auto &Assumptions = getKnown(); + + // Don't manifest a universal set if it somehow made it here. + if (Assumptions.isUniversal()) + return ChangeStatus::UNCHANGED; + + Function *AssociatedFunction = getAssociatedFunction(); + + bool Changed = addAssumptions(*AssociatedFunction, Assumptions.getSet()); + + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool Changed = false; + + auto CallSitePred = [&](AbstractCallSite ACS) { + const auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>( + *this, IRPosition::callsite_function(*ACS.getInstruction()), + DepClassTy::REQUIRED); + // Get the set of assumptions shared by all of this function's callers. + Changed |= getIntersection(AssumptionAA.getAssumed()); + return !getAssumed().empty() || !getKnown().empty(); + }; + + bool AllCallSitesKnown; + // Get the intersection of all assumptions held by this node's predecessors. + // If we don't know all the call sites then this is either an entry into the + // call graph or an empty node. This node is known to only contain its own + // assumptions and can be propagated to its successors. + if (!A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) + return indicatePessimisticFixpoint(); - /// Set of functions that are unreachable, but might become reachable. - SmallPtrSet<Function *, 8> UnreachableQueries; + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + void trackStatistics() const override {} +}; + +/// Assumption Info defined for call sites. +struct AAAssumptionInfoCallSite final : AAAssumptionInfoImpl { + + AAAssumptionInfoCallSite(const IRPosition &IRP, Attributor &A) + : AAAssumptionInfoImpl(IRP, A, getInitialAssumptions(IRP)) {} + + /// See AbstractAttribute::initialize(...). 
+ void initialize(Attributor &A) override { + const IRPosition &FnPos = IRPosition::function(*getAnchorScope()); + A.getAAFor<AAAssumptionInfo>(*this, FnPos, DepClassTy::REQUIRED); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // Don't manifest a universal set if it somehow made it here. + if (getKnown().isUniversal()) + return ChangeStatus::UNCHANGED; - /// If we can reach a function with a call to a unknown function we assume - /// that we can reach any function. - bool CanReachUnknownCallee = false; + CallBase &AssociatedCall = cast<CallBase>(getAssociatedValue()); + bool Changed = addAssumptions(AssociatedCall, getAssumed().getSet()); + + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + const IRPosition &FnPos = IRPosition::function(*getAnchorScope()); + auto &AssumptionAA = + A.getAAFor<AAAssumptionInfo>(*this, FnPos, DepClassTy::REQUIRED); + bool Changed = getIntersection(AssumptionAA.getAssumed()); + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + +private: + /// Helper to initialized the known set as all the assumptions this call and + /// the callee contain. + DenseSet<StringRef> getInitialAssumptions(const IRPosition &IRP) { + const CallBase &CB = cast<CallBase>(IRP.getAssociatedValue()); + auto Assumptions = getAssumptions(CB); + if (Function *F = IRP.getAssociatedFunction()) + set_union(Assumptions, getAssumptions(*F)); + if (Function *F = IRP.getAssociatedFunction()) + set_union(Assumptions, getAssumptions(*F)); + return Assumptions; + } }; } // namespace @@ -9603,6 +9812,7 @@ const char AANoUndef::ID = 0; const char AACallEdges::ID = 0; const char AAFunctionReachability::ID = 0; const char AAPointerInfo::ID = 0; +const char AAAssumptionInfo::ID = 0; // Macro magic to create the static generator function for attributes that // follow the naming scheme. @@ -9704,6 +9914,8 @@ CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AACallEdges) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAssumptionInfo) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias) @@ -9723,7 +9935,6 @@ CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior) -CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AACallEdges) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAFunctionReachability) CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 8e81f4bad4af..178d3f41963e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -153,33 +153,30 @@ static bool mergeConstants(Module &M) { // were just merged. 
while (true) { // Find the canonical constants others will be merged with. - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ) { - GlobalVariable *GV = &*GVI++; - + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { // If this GV is dead, remove it. - GV->removeDeadConstantUsers(); - if (GV->use_empty() && GV->hasLocalLinkage()) { - GV->eraseFromParent(); + GV.removeDeadConstantUsers(); + if (GV.use_empty() && GV.hasLocalLinkage()) { + GV.eraseFromParent(); ++ChangesMade; continue; } - if (isUnmergeableGlobal(GV, UsedGlobals)) + if (isUnmergeableGlobal(&GV, UsedGlobals)) continue; // This transformation is legal for weak ODR globals in the sense it // doesn't change semantics, but we really don't want to perform it // anyway; it's likely to pessimize code generation, and some tools // (like the Darwin linker in cases involving CFString) don't expect it. - if (GV->isWeakForLinker()) + if (GV.isWeakForLinker()) continue; // Don't touch globals with metadata other then !dbg. - if (hasMetadataOtherThanDebugLoc(GV)) + if (hasMetadataOtherThanDebugLoc(&GV)) continue; - Constant *Init = GV->getInitializer(); + Constant *Init = GV.getInitializer(); // Check to see if the initializer is already known. GlobalVariable *&Slot = CMap[Init]; @@ -188,9 +185,9 @@ static bool mergeConstants(Module &M) { // replace with the current one. If the current is externally visible // it cannot be replace, but can be the canonical constant we merge with. bool FirstConstantFound = !Slot; - if (FirstConstantFound || IsBetterCanonical(*GV, *Slot)) { - Slot = GV; - LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV->getName() + if (FirstConstantFound || IsBetterCanonical(GV, *Slot)) { + Slot = &GV; + LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV.getName() << (FirstConstantFound ? "\n" : " (updated)\n")); } } @@ -199,18 +196,15 @@ static bool mergeConstants(Module &M) { // SameContentReplacements vector. We cannot do the replacement in this pass // because doing so may cause initializers of other globals to be rewritten, // invalidating the Constant* pointers in CMap. - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ) { - GlobalVariable *GV = &*GVI++; - - if (isUnmergeableGlobal(GV, UsedGlobals)) + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { + if (isUnmergeableGlobal(&GV, UsedGlobals)) continue; // We can only replace constant with local linkage. - if (!GV->hasLocalLinkage()) + if (!GV.hasLocalLinkage()) continue; - Constant *Init = GV->getInitializer(); + Constant *Init = GV.getInitializer(); // Check to see if the initializer is already known. auto Found = CMap.find(Init); @@ -218,16 +212,16 @@ static bool mergeConstants(Module &M) { continue; GlobalVariable *Slot = Found->second; - if (Slot == GV) + if (Slot == &GV) continue; - if (makeMergeable(GV, Slot) == CanMerge::No) + if (makeMergeable(&GV, Slot) == CanMerge::No) continue; // Make all uses of the duplicate constant use the canonical version. 
- LLVM_DEBUG(dbgs() << "Will replace: @" << GV->getName() << " -> @" + LLVM_DEBUG(dbgs() << "Will replace: @" << GV.getName() << " -> @" << Slot->getName() << "\n"); - SameContentReplacements.push_back(std::make_pair(GV, Slot)); + SameContentReplacements.push_back(std::make_pair(&GV, Slot)); } // Now that we have figured out which replacements must be made, do them all diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d95fd55870f8..fb9ab7954e36 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -175,8 +175,8 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { // to pass in a smaller number of arguments into the new function. // std::vector<Value *> Args; - for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) { - CallBase *CB = dyn_cast<CallBase>(*I++); + for (User *U : llvm::make_early_inc_range(Fn.users())) { + CallBase *CB = dyn_cast<CallBase>(U); if (!CB) continue; @@ -188,9 +188,9 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { if (!PAL.isEmpty()) { SmallVector<AttributeSet, 8> ArgAttrs; for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo) - ArgAttrs.push_back(PAL.getParamAttributes(ArgNo)); - PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(), - PAL.getRetAttributes(), ArgAttrs); + ArgAttrs.push_back(PAL.getParamAttrs(ArgNo)); + PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttrs(), + PAL.getRetAttrs(), ArgAttrs); } SmallVector<OperandBundleDef, 1> OpBundles; @@ -762,8 +762,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (LiveValues.erase(Arg)) { Params.push_back(I->getType()); ArgAlive[ArgI] = true; - ArgAttrVec.push_back(PAL.getParamAttributes(ArgI)); - HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned); + ArgAttrVec.push_back(PAL.getParamAttrs(ArgI)); + HasLiveReturnedArg |= PAL.hasParamAttr(ArgI, Attribute::Returned); } else { ++NumArgumentsEliminated; LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " @@ -838,7 +838,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); // The existing function return attributes. - AttrBuilder RAttrs(PAL.getRetAttributes()); + AttrBuilder RAttrs(PAL.getRetAttrs()); // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes @@ -853,8 +853,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Strip allocsize attributes. They might refer to the deleted arguments. - AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute( - F->getContext(), Attribute::AllocSize); + AttributeSet FnAttrs = + PAL.getFnAttrs().removeAttribute(F->getContext(), Attribute::AllocSize); // Reconstruct the AttributesList based on the vector we constructed. assert(ArgAttrVec.size() == Params.size()); @@ -889,7 +889,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Adjust the call return attributes in case the function was changed to // return void. 
- AttrBuilder RAttrs(CallPAL.getRetAttributes()); + AttrBuilder RAttrs(CallPAL.getRetAttrs()); RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); @@ -903,7 +903,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[Pi]) { Args.push_back(*I); // Get original parameter attributes, but skip return attributes. - AttributeSet Attrs = CallPAL.getParamAttributes(Pi); + AttributeSet Attrs = CallPAL.getParamAttrs(Pi); if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) { // If the return type has changed, then get rid of 'returned' on the // call site. The alternative is to make all 'returned' attributes on @@ -922,7 +922,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Push any varargs arguments on the list. Don't forget their attributes. for (auto E = CB.arg_end(); I != E; ++I, ++Pi) { Args.push_back(*I); - ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(Pi)); } // Reconstruct the AttributesList based on the vector we constructed. @@ -930,7 +930,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Again, be sure to remove any allocsize attributes, since their indices // may now be incorrect. - AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute( + AttributeSet FnAttrs = CallPAL.getFnAttrs().removeAttribute( F->getContext(), Attribute::AllocSize); AttributeList NewCallPAL = AttributeList::get( @@ -1094,11 +1094,9 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M, // fused with the next loop, because deleting a function invalidates // information computed while surveying other functions. LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n"); - for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - Function &F = *I++; + for (Function &F : llvm::make_early_inc_range(M)) if (F.getFunctionType()->isVarArg()) Changed |= DeleteDeadVarargs(F); - } // Second phase:loop through the module, determining which arguments are live. // We assume all arguments are dead unless proven otherwise (allowing us to @@ -1109,13 +1107,10 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M, SurveyFunction(F); // Now, remove all dead arguments and return values from each function in - // turn. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - // Increment now, because the function will probably get removed (ie. - // replaced by a new one). - Function *F = &*I++; - Changed |= RemoveDeadStuffFromFunction(F); - } + // turn. We use make_early_inc_range here because functions will probably get + // removed (i.e. replaced by new ones). + for (Function &F : llvm::make_early_inc_range(M)) + Changed |= RemoveDeadStuffFromFunction(&F); // Finally, look for any unused parameters in functions with non-local // linkage and replace the passed in parameters with undef. diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp index ba0efd46af16..387f114f6ffa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -121,32 +121,27 @@ namespace { } // Visit the Aliases. 
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E;) { - Module::alias_iterator CurI = I; - ++I; - - bool Delete = deleteStuff == (bool)Named.count(&*CurI); - makeVisible(*CurI, Delete); + for (GlobalAlias &GA : llvm::make_early_inc_range(M.aliases())) { + bool Delete = deleteStuff == (bool)Named.count(&GA); + makeVisible(GA, Delete); if (Delete) { - Type *Ty = CurI->getValueType(); + Type *Ty = GA.getValueType(); - CurI->removeFromParent(); + GA.removeFromParent(); llvm::Value *Declaration; if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) { - Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage, - CurI->getAddressSpace(), - CurI->getName(), &M); + Declaration = + Function::Create(FTy, GlobalValue::ExternalLinkage, + GA.getAddressSpace(), GA.getName(), &M); } else { Declaration = - new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage, - nullptr, CurI->getName()); - + new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage, + nullptr, GA.getName()); } - CurI->replaceAllUsesWith(Declaration); - delete &*CurI; + GA.replaceAllUsesWith(Declaration); + delete &GA; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp index 47fdf042f9d4..16d00a0c89e1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -50,14 +50,14 @@ static void forceAttributes(Function &F) { return Kind; }; - for (auto &S : ForceAttributes) { + for (const auto &S : ForceAttributes) { auto Kind = ParseFunctionAndAttr(S); if (Kind == Attribute::None || F.hasFnAttribute(Kind)) continue; F.addFnAttr(Kind); } - for (auto &S : ForceRemoveAttributes) { + for (const auto &S : ForceRemoveAttributes) { auto Kind = ParseFunctionAndAttr(S); if (Kind == Attribute::None || !F.hasFnAttribute(Kind)) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index ca8660a98ded..cde78713b554 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -14,10 +14,12 @@ #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" @@ -82,6 +84,11 @@ STATISTIC(NumNoFree, "Number of functions marked as nofree"); STATISTIC(NumWillReturn, "Number of functions marked as willreturn"); STATISTIC(NumNoSync, "Number of functions marked as nosync"); +STATISTIC(NumThinLinkNoRecurse, + "Number of functions marked as norecurse during thinlink"); +STATISTIC(NumThinLinkNoUnwind, + "Number of functions marked as nounwind during thinlink"); + static cl::opt<bool> EnableNonnullArgPropagation( "enable-nonnull-arg-prop", cl::init(true), cl::Hidden, cl::desc("Try to propagate nonnull argument attributes from callsites to " @@ -95,6 +102,10 @@ static cl::opt<bool> DisableNoFreeInference( "disable-nofree-inference", cl::Hidden, cl::desc("Stop inferring nofree attribute during function-attrs pass")); +static cl::opt<bool> DisableThinLTOPropagation( + "disable-thinlto-funcattrs", cl::init(true), cl::Hidden, + 
cl::desc("Don't propagate function-attrs in thinLTO")); + namespace { using SCCNodeSet = SmallSetVector<Function *, 8>; @@ -131,12 +142,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, // Scan the function body for instructions that may read or write memory. bool ReadsMemory = false; bool WritesMemory = false; - for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { - Instruction *I = &*II; - + for (Instruction &I : instructions(F)) { // Some instructions can be ignored even if they read or write memory. // Detect these now, skipping to the next instruction if one is found. - if (auto *Call = dyn_cast<CallBase>(I)) { + if (auto *Call = dyn_cast<CallBase>(&I)) { // Ignore calls to functions in the same SCC, as long as the call sites // don't have operand bundles. Calls with operand bundles are allowed to // have memory effects not described by the memory effects of the call @@ -170,14 +179,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, // Check whether all pointer arguments point to local memory, and // ignore calls that only access local memory. - for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) { - Value *Arg = *CI; + for (const Use &U : Call->args()) { + const Value *Arg = U; if (!Arg->getType()->isPtrOrPtrVectorTy()) continue; - AAMDNodes AAInfo; - I->getAAMetadata(AAInfo); - MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(Arg, AAInfo); + MemoryLocation Loc = + MemoryLocation::getBeforeOrAfter(Arg, I.getAAMetadata()); // Skip accesses to local or constant memory as they don't impact the // externally visible mod/ref behavior. @@ -192,21 +200,21 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, ReadsMemory = true; } continue; - } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + } else if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { // Ignore non-volatile loads from local memory. (Atomic is okay here.) if (!LI->isVolatile()) { MemoryLocation Loc = MemoryLocation::get(LI); if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) continue; } - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { // Ignore non-volatile stores to local memory. (Atomic is okay here.) if (!SI->isVolatile()) { MemoryLocation Loc = MemoryLocation::get(SI); if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) continue; } - } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { + } else if (VAArgInst *VI = dyn_cast<VAArgInst>(&I)) { // Ignore vaargs on local memory. MemoryLocation Loc = MemoryLocation::get(VI); if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) @@ -217,10 +225,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, // read or write memory. // // Writes memory, remember that. - WritesMemory |= I->mayWriteToMemory(); + WritesMemory |= I.mayWriteToMemory(); // If this instruction may read memory, remember that. - ReadsMemory |= I->mayReadFromMemory(); + ReadsMemory |= I.mayReadFromMemory(); } if (WritesMemory) { @@ -240,7 +248,8 @@ MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F, /// Deduce readonly/readnone attributes for the SCC. template <typename AARGetterT> -static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { +static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter, + SmallSet<Function *, 8> &Changed) { // Check if any of the functions in the SCC read or write memory. 
If they // write memory then they can't be marked readnone or readonly. bool ReadsMemory = false; @@ -255,7 +264,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(), AAR, SCCNodes)) { case MAK_MayWrite: - return false; + return; case MAK_ReadOnly: ReadsMemory = true; break; @@ -271,11 +280,10 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { // If the SCC contains both functions that read and functions that write, then // we cannot add readonly attributes. if (ReadsMemory && WritesMemory) - return false; + return; // Success! Functions in this SCC do not access memory, or only read memory. // Give them the appropriate attribute. - bool MadeChange = false; for (Function *F : SCCNodes) { if (F->doesNotAccessMemory()) @@ -289,7 +297,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { if (F->doesNotReadMemory() && WritesMemory) continue; - MadeChange = true; + Changed.insert(F); // Clear out any existing attributes. AttrBuilder AttrsToRemove; @@ -303,7 +311,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { AttrsToRemove.addAttribute(Attribute::InaccessibleMemOnly); AttrsToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); } - F->removeAttributes(AttributeList::FunctionIndex, AttrsToRemove); + F->removeFnAttrs(AttrsToRemove); // Add in the new attribute. if (WritesMemory && !ReadsMemory) @@ -318,8 +326,195 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { else ++NumReadNone; } +} + +// Compute definitive function attributes for a function taking into account +// prevailing definitions and linkage types +static FunctionSummary *calculatePrevailingSummary( + ValueInfo VI, + DenseMap<ValueInfo, FunctionSummary *> &CachedPrevailingSummary, + function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> + IsPrevailing) { + + if (CachedPrevailingSummary.count(VI)) + return CachedPrevailingSummary[VI]; + + /// At this point, prevailing symbols have been resolved. The following leads + /// to returning a conservative result: + /// - Multiple instances with local linkage. Normally local linkage would be + /// unique per module + /// as the GUID includes the module path. We could have a guid alias if + /// there wasn't any distinguishing path when each file was compiled, but + /// that should be rare so we'll punt on those. + + /// These next 2 cases should not happen and will assert: + /// - Multiple instances with external linkage. This should be caught in + /// symbol resolution + /// - Non-existent FunctionSummary for Aliasee. This presents a hole in our + /// knowledge meaning we have to go conservative. + + /// Otherwise, we calculate attributes for a function as: + /// 1. If we have a local linkage, take its attributes. If there's somehow + /// multiple, bail and go conservative. + /// 2. If we have an external/WeakODR/LinkOnceODR linkage check that it is + /// prevailing, take its attributes. + /// 3. If we have a Weak/LinkOnce linkage the copies can have semantic + /// differences. However, if the prevailing copy is known it will be used + /// so take its attributes. If the prevailing copy is in a native file + /// all IR copies will be dead and propagation will go conservative. + /// 4. AvailableExternally summaries without a prevailing copy are known to + /// occur in a couple of circumstances: + /// a. 
An internal function gets imported due to its caller getting + /// imported, it becomes AvailableExternally but no prevailing + /// definition exists. Because it has to get imported along with its + /// caller the attributes will be captured by propagating on its + /// caller. + /// b. C++11 [temp.explicit]p10 can generate AvailableExternally + /// definitions of explicitly instanced template declarations + /// for inlining which are ultimately dropped from the TU. Since this + /// is localized to the TU the attributes will have already made it to + /// the callers. + /// These are edge cases and already captured by their callers so we + /// ignore these for now. If they become relevant to optimize in the + /// future this can be revisited. + /// 5. Otherwise, go conservative. + + CachedPrevailingSummary[VI] = nullptr; + FunctionSummary *Local = nullptr; + FunctionSummary *Prevailing = nullptr; + + for (const auto &GVS : VI.getSummaryList()) { + if (!GVS->isLive()) + continue; + + FunctionSummary *FS = dyn_cast<FunctionSummary>(GVS->getBaseObject()); + // Virtual and Unknown (e.g. indirect) calls require going conservative + if (!FS || FS->fflags().HasUnknownCall) + return nullptr; + + const auto &Linkage = GVS->linkage(); + if (GlobalValue::isLocalLinkage(Linkage)) { + if (Local) { + LLVM_DEBUG( + dbgs() + << "ThinLTO FunctionAttrs: Multiple Local Linkage, bailing on " + "function " + << VI.name() << " from " << FS->modulePath() << ". Previous module " + << Local->modulePath() << "\n"); + return nullptr; + } + Local = FS; + } else if (GlobalValue::isExternalLinkage(Linkage)) { + assert(IsPrevailing(VI.getGUID(), GVS.get())); + Prevailing = FS; + break; + } else if (GlobalValue::isWeakODRLinkage(Linkage) || + GlobalValue::isLinkOnceODRLinkage(Linkage) || + GlobalValue::isWeakAnyLinkage(Linkage) || + GlobalValue::isLinkOnceAnyLinkage(Linkage)) { + if (IsPrevailing(VI.getGUID(), GVS.get())) { + Prevailing = FS; + break; + } + } else if (GlobalValue::isAvailableExternallyLinkage(Linkage)) { + // TODO: Handle these cases if they become meaningful + continue; + } + } + + if (Local) { + assert(!Prevailing); + CachedPrevailingSummary[VI] = Local; + } else if (Prevailing) { + assert(!Local); + CachedPrevailingSummary[VI] = Prevailing; + } - return MadeChange; + return CachedPrevailingSummary[VI]; +} + +bool llvm::thinLTOPropagateFunctionAttrs( + ModuleSummaryIndex &Index, + function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> + IsPrevailing) { + // TODO: implement addNoAliasAttrs once + // there's more information about the return type in the summary + if (DisableThinLTOPropagation) + return false; + + DenseMap<ValueInfo, FunctionSummary *> CachedPrevailingSummary; + bool Changed = false; + + auto PropagateAttributes = [&](std::vector<ValueInfo> &SCCNodes) { + // Assume we can propagate unless we discover otherwise + FunctionSummary::FFlags InferredFlags; + InferredFlags.NoRecurse = (SCCNodes.size() == 1); + InferredFlags.NoUnwind = true; + + for (auto &V : SCCNodes) { + FunctionSummary *CallerSummary = + calculatePrevailingSummary(V, CachedPrevailingSummary, IsPrevailing); + + // Function summaries can fail to contain information such as declarations + if (!CallerSummary) + return; + + if (CallerSummary->fflags().MayThrow) + InferredFlags.NoUnwind = false; + + for (const auto &Callee : CallerSummary->calls()) { + FunctionSummary *CalleeSummary = calculatePrevailingSummary( + Callee.first, CachedPrevailingSummary, IsPrevailing); + + if (!CalleeSummary) + return; + + if 
(!CalleeSummary->fflags().NoRecurse) + InferredFlags.NoRecurse = false; + + if (!CalleeSummary->fflags().NoUnwind) + InferredFlags.NoUnwind = false; + + if (!InferredFlags.NoUnwind && !InferredFlags.NoRecurse) + break; + } + } + + if (InferredFlags.NoUnwind || InferredFlags.NoRecurse) { + Changed = true; + for (auto &V : SCCNodes) { + if (InferredFlags.NoRecurse) { + LLVM_DEBUG(dbgs() << "ThinLTO FunctionAttrs: Propagated NoRecurse to " + << V.name() << "\n"); + ++NumThinLinkNoRecurse; + } + + if (InferredFlags.NoUnwind) { + LLVM_DEBUG(dbgs() << "ThinLTO FunctionAttrs: Propagated NoUnwind to " + << V.name() << "\n"); + ++NumThinLinkNoUnwind; + } + + for (auto &S : V.getSummaryList()) { + if (auto *FS = dyn_cast<FunctionSummary>(S.get())) { + if (InferredFlags.NoRecurse) + FS->setNoRecurse(); + + if (InferredFlags.NoUnwind) + FS->setNoUnwind(); + } + } + } + } + }; + + // Call propagation functions on each SCC in the Index + for (scc_iterator<ModuleSummaryIndex *> I = scc_begin(&Index); !I.isAtEnd(); + ++I) { + std::vector<ValueInfo> Nodes(*I); + PropagateAttributes(Nodes); + } + return Changed; } namespace { @@ -395,7 +590,7 @@ struct ArgumentUsesTracker : public CaptureTracker { assert(UseIndex < CB->data_operands_size() && "Indirect function calls should have been filtered above!"); - if (UseIndex >= CB->getNumArgOperands()) { + if (UseIndex >= CB->arg_size()) { // Data operand, but not a argument operand -- must be a bundle operand assert(CB->hasOperandBundles() && "Must be!"); @@ -530,7 +725,7 @@ determinePointerReadAttrs(Argument *A, assert(UseIndex < CB.data_operands_size() && "Data operand use expected!"); - bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands(); + bool IsOperandBundleUse = UseIndex >= CB.arg_size(); if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { assert(F->isVarArg() && "More params than args in non-varargs call"); @@ -581,9 +776,8 @@ determinePointerReadAttrs(Argument *A, } /// Deduce returned attributes for the SCC. -static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Check each function in turn, determining if an argument is always returned. for (Function *F : SCCNodes) { // We can infer and propagate function attributes only when we know that the @@ -623,11 +817,9 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { auto *A = cast<Argument>(RetArg); A->addAttr(Attribute::Returned); ++NumReturned; - Changed = true; + Changed.insert(F); } } - - return Changed; } /// If a callsite has arguments that are also arguments to the parent function, @@ -693,9 +885,8 @@ static bool addReadAttr(Argument *A, Attribute::AttrKind R) { } /// Deduce nocapture attributes for the SCC. -static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addArgumentAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { ArgumentGraph AG; // Check each function in turn, determining which pointer arguments are not @@ -707,7 +898,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (!F->hasExactDefinition()) continue; - Changed |= addArgumentAttrsFromCallsites(*F); + if (addArgumentAttrsFromCallsites(*F)) + Changed.insert(F); // Functions that are readonly (or readnone) and nounwind and don't return // a value can't capture arguments. Don't analyze them. 
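// [Editor's illustration, plain C++ rather than patch content] "Capturing" an
// argument means a copy of the pointer may outlive the call. The shortcut in
// the hunk that follows relies on that: a readonly, nounwind function that
// returns no value has no way to publish a pointer, so every pointer argument
// can be marked nocapture without further analysis. All names below are
// hypothetical.
int *SavedPtr;                              // hypothetical global
void stash(int *P) { SavedPtr = P; }        // captures P: the pointer escapes
int  readValue(const int *P) { return *P; } // uses the pointee only; P does
                                            // not escape the call
// [End of editor's illustration.]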
@@ -718,7 +910,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(F); } } continue; @@ -737,7 +929,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { // If it's trivially not captured, mark it nocapture now. A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(F); } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save @@ -761,7 +953,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { Self.insert(&*A); Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); if (R != Attribute::None) - Changed = addReadAttr(A, R); + if (addReadAttr(A, R)) + Changed.insert(F); } } } @@ -785,7 +978,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { Argument *A = ArgumentSCC[0]->Definition; A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(A->getParent()); } continue; } @@ -827,7 +1020,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { Argument *A = ArgumentSCC[i]->Definition; A->addAttr(Attribute::NoCapture); ++NumNoCapture; - Changed = true; + Changed.insert(A->getParent()); } // We also want to compute readonly/readnone. With a small number of false @@ -858,12 +1051,11 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (ReadAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - Changed = addReadAttr(A, ReadAttr); + if (addReadAttr(A, ReadAttr)) + Changed.insert(A->getParent()); } } } - - return Changed; } /// Tests whether a function is "malloc-like". @@ -934,7 +1126,8 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { } /// Deduce noalias attributes for the SCC. -static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { +static void addNoAliasAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Check each function in turn, determining which functions return noalias // pointers. for (Function *F : SCCNodes) { @@ -946,7 +1139,7 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { // definition we'll get at link time is *exactly* the definition we see now. // For more details, see GlobalValue::mayBeDerefined. if (!F->hasExactDefinition()) - return false; + return; // We annotate noalias return values, which are only applicable to // pointer types. @@ -954,10 +1147,9 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { continue; if (!isFunctionMallocLike(F, SCCNodes)) - return false; + return; } - bool MadeChange = false; for (Function *F : SCCNodes) { if (F->returnDoesNotAlias() || !F->getReturnType()->isPointerTy()) @@ -965,10 +1157,8 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { F->setReturnDoesNotAlias(); ++NumNoAlias; - MadeChange = true; + Changed.insert(F); } - - return MadeChange; } /// Tests whether this function is known to not return null. @@ -1044,26 +1234,24 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, } /// Deduce nonnull attributes for the SCC. -static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { +static void addNonNullAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Speculative that all functions in the SCC return only nonnull // pointers. We may refute this as we analyze functions. 
bool SCCReturnsNonNull = true; - bool MadeChange = false; - // Check each function in turn, determining which functions return nonnull // pointers. for (Function *F : SCCNodes) { // Already nonnull. - if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull)) + if (F->getAttributes().hasRetAttr(Attribute::NonNull)) continue; // We can infer and propagate function attributes only when we know that the // definition we'll get at link time is *exactly* the definition we see now. // For more details, see GlobalValue::mayBeDerefined. if (!F->hasExactDefinition()) - return false; + return; // We annotate nonnull return values, which are only applicable to // pointer types. @@ -1077,9 +1265,9 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { // which prevents us from speculating about the entire SCC LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + F->addRetAttr(Attribute::NonNull); ++NumNonNullReturn; - MadeChange = true; + Changed.insert(F); } continue; } @@ -1090,19 +1278,16 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { if (SCCReturnsNonNull) { for (Function *F : SCCNodes) { - if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull) || + if (F->getAttributes().hasRetAttr(Attribute::NonNull) || !F->getReturnType()->isPointerTy()) continue; LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + F->addRetAttr(Attribute::NonNull); ++NumNonNullReturn; - MadeChange = true; + Changed.insert(F); } } - - return MadeChange; } namespace { @@ -1155,12 +1340,13 @@ public: InferenceDescriptors.push_back(AttrInference); } - bool run(const SCCNodeSet &SCCNodes); + void run(const SCCNodeSet &SCCNodes, SmallSet<Function *, 8> &Changed); }; /// Perform all the requested attribute inference actions according to the /// attribute predicates stored before. -bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { +void AttributeInferer::run(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors; // Go through all the functions in SCC and check corresponding attribute // assumptions for each of them. Attributes that are invalid for this SCC @@ -1169,7 +1355,7 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { // No attributes whose assumptions are still valid - done. if (InferInSCC.empty()) - return false; + return; // Check if our attributes ever need scanning/can be scanned. llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) { @@ -1212,9 +1398,8 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { } if (InferInSCC.empty()) - return false; + return; - bool Changed = false; for (Function *F : SCCNodes) // At this point InferInSCC contains only functions that were either: // - explicitly skipped from scan/inference, or @@ -1223,10 +1408,9 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) { for (auto &ID : InferInSCC) { if (ID.SkipFunction(*F)) continue; - Changed = true; + Changed.insert(F); ID.SetAttribute(*F); } - return Changed; } struct SCCNodesResult { @@ -1243,7 +1427,7 @@ static bool InstrBreaksNonConvergent(Instruction &I, // Breaks non-convergent assumption if CS is a convergent call to a function // not in the SCC. 
return CB && CB->isConvergent() && - SCCNodes.count(CB->getCalledFunction()) == 0; + !SCCNodes.contains(CB->getCalledFunction()); } /// Helper for NoUnwind inference predicate InstrBreaksAttribute. @@ -1282,7 +1466,8 @@ static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) { /// Attempt to remove convergent function attribute when possible. /// /// Returns true if any changes to function attributes were made. -static bool inferConvergent(const SCCNodeSet &SCCNodes) { +static void inferConvergent(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { AttributeInferer AI; // Request to remove the convergent attribute from all functions in the SCC @@ -1305,7 +1490,7 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) { }, /* RequiresExactDefinition= */ false}); // Perform all the requested attribute inference actions. - return AI.run(SCCNodes); + AI.run(SCCNodes, Changed); } /// Infer attributes from all functions in the SCC by scanning every @@ -1314,7 +1499,8 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) { /// - addition of NoUnwind attribute /// /// Returns true if any changes to function attributes were made. -static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) { +static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { AttributeInferer AI; if (!DisableNoUnwindInference) @@ -1363,19 +1549,20 @@ static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) { /* RequiresExactDefinition= */ true}); // Perform all the requested attribute inference actions. - return AI.run(SCCNodes); + AI.run(SCCNodes, Changed); } -static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) { +static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { // Try and identify functions that do not recurse. // If the SCC contains multiple nodes we know for sure there is recursion. if (SCCNodes.size() != 1) - return false; + return; Function *F = *SCCNodes.begin(); if (!F || !F->hasExactDefinition() || F->doesNotRecurse()) - return false; + return; // If all of the calls in F are identifiable and are to norecurse functions, F // is norecurse. This check also detects self-recursion as F is not currently @@ -1386,7 +1573,7 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) { Function *Callee = CB->getCalledFunction(); if (!Callee || Callee == F || !Callee->doesNotRecurse()) // Function calls a potentially recursive function. - return false; + return; } // Every call was to a non-recursive function other than this function, and @@ -1394,7 +1581,7 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) { // recurse. F->setDoesNotRecurse(); ++NumNoRecurse; - return true; + Changed.insert(F); } static bool instructionDoesNotReturn(Instruction &I) { @@ -1412,9 +1599,8 @@ static bool basicBlockCanReturn(BasicBlock &BB) { } // Set the noreturn function attribute if possible. -static bool addNoReturnAttrs(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->doesNotReturn()) @@ -1424,11 +1610,9 @@ static bool addNoReturnAttrs(const SCCNodeSet &SCCNodes) { // FIXME: this doesn't handle recursion or unreachable blocks. 
if (none_of(*F, basicBlockCanReturn)) { F->setDoesNotReturn(); - Changed = true; + Changed.insert(F); } } - - return Changed; } static bool functionWillReturn(const Function &F) { @@ -1461,19 +1645,16 @@ static bool functionWillReturn(const Function &F) { } // Set the willreturn function attribute if possible. -static bool addWillReturn(const SCCNodeSet &SCCNodes) { - bool Changed = false; - +static void addWillReturn(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || F->willReturn() || !functionWillReturn(*F)) continue; F->setWillReturn(); NumWillReturn++; - Changed = true; + Changed.insert(F); } - - return Changed; } // Return true if this is an atomic which has an ordering stronger than @@ -1532,7 +1713,8 @@ static bool InstrBreaksNoSync(Instruction &I, const SCCNodeSet &SCCNodes) { } // Infer the nosync attribute. -static bool addNoSyncAttr(const SCCNodeSet &SCCNodes) { +static void addNoSyncAttr(const SCCNodeSet &SCCNodes, + SmallSet<Function *, 8> &Changed) { AttributeInferer AI; AI.registerAttrInference(AttributeInferer::InferenceDescriptor{ Attribute::NoSync, @@ -1549,14 +1731,15 @@ static bool addNoSyncAttr(const SCCNodeSet &SCCNodes) { ++NumNoSync; }, /* RequiresExactDefinition= */ true}); - return AI.run(SCCNodes); + AI.run(SCCNodes, Changed); } static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { SCCNodesResult Res; Res.HasUnknownCall = false; for (Function *F : Functions) { - if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked)) { + if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) || + F->isPresplitCoroutine()) { // Treat any function we're trying not to optimize as if it were an // indirect call and omit it from the node set used below. Res.HasUnknownCall = true; @@ -1582,32 +1765,33 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { } template <typename AARGetterT> -static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions, - AARGetterT &&AARGetter) { +static SmallSet<Function *, 8> +deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter) { SCCNodesResult Nodes = createSCCNodeSet(Functions); - bool Changed = false; // Bail if the SCC only contains optnone functions. if (Nodes.SCCNodes.empty()) - return Changed; + return {}; + + SmallSet<Function *, 8> Changed; - Changed |= addArgumentReturnedAttrs(Nodes.SCCNodes); - Changed |= addReadAttrs(Nodes.SCCNodes, AARGetter); - Changed |= addArgumentAttrs(Nodes.SCCNodes); - Changed |= inferConvergent(Nodes.SCCNodes); - Changed |= addNoReturnAttrs(Nodes.SCCNodes); - Changed |= addWillReturn(Nodes.SCCNodes); + addArgumentReturnedAttrs(Nodes.SCCNodes, Changed); + addReadAttrs(Nodes.SCCNodes, AARGetter, Changed); + addArgumentAttrs(Nodes.SCCNodes, Changed); + inferConvergent(Nodes.SCCNodes, Changed); + addNoReturnAttrs(Nodes.SCCNodes, Changed); + addWillReturn(Nodes.SCCNodes, Changed); // If we have no external nodes participating in the SCC, we can deduce some // more precise attributes as well. 
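// [Editor's illustration, not from the patch] The inferences collected above
// act on simple shapes like the following plain C++ with hypothetical names.
// A call-free leaf that only reads through its argument is the typical
// candidate for readonly, nounwind, willreturn, nosync and, being alone in
// its SCC with no calls, norecurse; its caller can then inherit norecurse
// because its only callee is norecurse and is not the caller itself.
int leaf(const int *P) { return *P; }
int caller(const int *P) { return leaf(P); }
// [End of editor's illustration.]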
if (!Nodes.HasUnknownCall) { - Changed |= addNoAliasAttrs(Nodes.SCCNodes); - Changed |= addNonNullAttrs(Nodes.SCCNodes); - Changed |= inferAttrsFromFunctionBodies(Nodes.SCCNodes); - Changed |= addNoRecurseAttrs(Nodes.SCCNodes); + addNoAliasAttrs(Nodes.SCCNodes, Changed); + addNonNullAttrs(Nodes.SCCNodes, Changed); + inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); + addNoRecurseAttrs(Nodes.SCCNodes, Changed); } - Changed |= addNoSyncAttr(Nodes.SCCNodes); + addNoSyncAttr(Nodes.SCCNodes, Changed); // Finally, infer the maximal set of attributes from the ones we've inferred // above. This is handling the cases where one attribute on a signature @@ -1615,7 +1799,8 @@ static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions, // the later is missing (or simply less sophisticated). for (Function *F : Nodes.SCCNodes) if (F) - Changed |= inferAttributesFromOthers(*F); + if (inferAttributesFromOthers(*F)) + Changed.insert(F); return Changed; } @@ -1638,14 +1823,35 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C, Functions.push_back(&N.getFunction()); } - if (deriveAttrsInPostOrder(Functions, AARGetter)) { - // We have not changed the call graph or removed/added functions. - PreservedAnalyses PA; - PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); - return PA; + auto ChangedFunctions = deriveAttrsInPostOrder(Functions, AARGetter); + if (ChangedFunctions.empty()) + return PreservedAnalyses::all(); + + // Invalidate analyses for modified functions so that we don't have to + // invalidate all analyses for all functions in this SCC. + PreservedAnalyses FuncPA; + // We haven't changed the CFG for modified functions. + FuncPA.preserveSet<CFGAnalyses>(); + for (Function *Changed : ChangedFunctions) { + FAM.invalidate(*Changed, FuncPA); + // Also invalidate any direct callers of changed functions since analyses + // may care about attributes of direct callees. For example, MemorySSA cares + // about whether or not a call's callee modifies memory and queries that + // through function attributes. + for (auto *U : Changed->users()) { + if (auto *Call = dyn_cast<CallBase>(U)) { + if (Call->getCalledFunction() == Changed) + FAM.invalidate(*Call->getFunction(), FuncPA); + } + } } - return PreservedAnalyses::all(); + PreservedAnalyses PA; + // We have not added or removed functions. + PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); + // We already invalidated all relevant function analyses above. 
+ PA.preserveSet<AllAnalysesOn<Function>>(); + return PA; } namespace { @@ -1690,7 +1896,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) { Functions.push_back(I->getFunction()); } - return deriveAttrsInPostOrder(Functions, AARGetter); + return !deriveAttrsInPostOrder(Functions, AARGetter).empty(); } bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp index 2f6cf0ca7087..d9b43109f629 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" @@ -187,23 +188,6 @@ selectCallee(const ModuleSummaryIndex &Index, return false; } - // For SamplePGO, in computeImportForFunction the OriginalId - // may have been used to locate the callee summary list (See - // comment there). - // The mapping from OriginalId to GUID may return a GUID - // that corresponds to a static variable. Filter it out here. - // This can happen when - // 1) There is a call to a library function which is not defined - // in the index. - // 2) There is a static variable with the OriginalGUID identical - // to the GUID of the library function in 1); - // When this happens, the logic for SamplePGO kicks in and - // the static variable in 2) will be found, which needs to be - // filtered out. - if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) { - Reason = FunctionImporter::ImportFailureReason::GlobalVar; - return false; - } if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) { Reason = FunctionImporter::ImportFailureReason::InterposableLinkage; // There is no point in importing these, we can't inline them @@ -264,21 +248,6 @@ using EdgeInfo = } // anonymous namespace -static ValueInfo -updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) { - if (!VI.getSummaryList().empty()) - return VI; - // For SamplePGO, the indirect call targets for local functions will - // have its original name annotated in profile. We try to find the - // corresponding PGOFuncName as the GUID. - // FIXME: Consider updating the edges in the graph after building - // it, rather than needing to perform this mapping on each walk. - auto GUID = Index.getGUIDFromOriginalID(VI.getGUID()); - if (GUID == 0) - return ValueInfo(); - return Index.getValueInfo(GUID); -} - static bool shouldImportGlobal(const ValueInfo &VI, const GVSummaryMapTy &DefinedGVSummaries) { const auto &GVS = DefinedGVSummaries.find(VI.getGUID()); @@ -400,10 +369,6 @@ static void computeImportForFunction( continue; } - VI = updateValueInfoForIndirectCalls(Index, VI); - if (!VI) - continue; - if (DefinedGVSummaries.count(VI.getGUID())) { // FIXME: Consider not skipping import if the module contains // a non-prevailing def with interposable linkage. 
The prevailing copy @@ -496,7 +461,7 @@ static void computeImportForFunction( VI.name().str() + " due to " + getFailureName(Reason); auto Error = make_error<StringError>( - Msg, std::make_error_code(std::errc::operation_not_supported)); + Msg, make_error_code(errc::not_supported)); logAllUnhandledErrors(std::move(Error), errs(), "Error importing module: "); break; @@ -839,16 +804,61 @@ void llvm::ComputeCrossModuleImportForModuleFromIndex( #endif } -void llvm::computeDeadSymbols( +// For SamplePGO, the indirect call targets for local functions will +// have its original name annotated in profile. We try to find the +// corresponding PGOFuncName as the GUID, and fix up the edges +// accordingly. +void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index, + FunctionSummary *FS) { + for (auto &EI : FS->mutableCalls()) { + if (!EI.first.getSummaryList().empty()) + continue; + auto GUID = Index.getGUIDFromOriginalID(EI.first.getGUID()); + if (GUID == 0) + continue; + // Update the edge to point directly to the correct GUID. + auto VI = Index.getValueInfo(GUID); + if (llvm::any_of( + VI.getSummaryList(), + [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) { + // The mapping from OriginalId to GUID may return a GUID + // that corresponds to a static variable. Filter it out here. + // This can happen when + // 1) There is a call to a library function which is not defined + // in the index. + // 2) There is a static variable with the OriginalGUID identical + // to the GUID of the library function in 1); + // When this happens the static variable in 2) will be found, + // which needs to be filtered out. + return SummaryPtr->getSummaryKind() == + GlobalValueSummary::GlobalVarKind; + })) + continue; + EI.first = VI; + } +} + +void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) { + for (const auto &Entry : Index) { + for (auto &S : Entry.second.SummaryList) { + if (auto *FS = dyn_cast<FunctionSummary>(S.get())) + updateValueInfoForIndirectCalls(Index, FS); + } + } +} + +void llvm::computeDeadSymbolsAndUpdateIndirectCalls( ModuleSummaryIndex &Index, const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols, function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) { assert(!Index.withGlobalValueDeadStripping()); - if (!ComputeDead) - return; - if (GUIDPreservedSymbols.empty()) - // Don't do anything when nothing is live, this is friendly with tests. + if (!ComputeDead || + // Don't do anything when nothing is live, this is friendly with tests. + GUIDPreservedSymbols.empty()) { + // Still need to update indirect calls. + updateIndirectCalls(Index); return; + } unsigned LiveSymbols = 0; SmallVector<ValueInfo, 128> Worklist; Worklist.reserve(GUIDPreservedSymbols.size() * 2); @@ -863,13 +873,16 @@ void llvm::computeDeadSymbols( // Add values flagged in the index as live roots to the worklist. for (const auto &Entry : Index) { auto VI = Index.getValueInfo(Entry); - for (auto &S : Entry.second.SummaryList) + for (auto &S : Entry.second.SummaryList) { + if (auto *FS = dyn_cast<FunctionSummary>(S.get())) + updateValueInfoForIndirectCalls(Index, FS); if (S->isLive()) { LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n"); Worklist.push_back(VI); ++LiveSymbols; break; } + } } // Make value live and add it to the worklist if it was not live before. @@ -882,9 +895,6 @@ void llvm::computeDeadSymbols( // binary, which increases the binary size unnecessarily. Note that // if this code changes, the importer needs to change so that edges // to functions marked dead are skipped. 
- VI = updateValueInfoForIndirectCalls(Index, VI); - if (!VI) - return; if (llvm::any_of(VI.getSummaryList(), [](const std::unique_ptr<llvm::GlobalValueSummary> &S) { @@ -958,7 +968,8 @@ void llvm::computeDeadSymbolsWithConstProp( const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols, function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing, bool ImportEnabled) { - computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing); + computeDeadSymbolsAndUpdateIndirectCalls(Index, GUIDPreservedSymbols, + isPrevailing); if (ImportEnabled) Index.propagateAttributes(GUIDPreservedSymbols); } @@ -1040,13 +1051,33 @@ bool llvm::convertToDeclaration(GlobalValue &GV) { return true; } -void llvm::thinLTOResolvePrevailingInModule( - Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { - auto updateLinkage = [&](GlobalValue &GV) { +void llvm::thinLTOFinalizeInModule(Module &TheModule, + const GVSummaryMapTy &DefinedGlobals, + bool PropagateAttrs) { + auto FinalizeInModule = [&](GlobalValue &GV, bool Propagate = false) { // See if the global summary analysis computed a new resolved linkage. const auto &GS = DefinedGlobals.find(GV.getGUID()); if (GS == DefinedGlobals.end()) return; + + if (Propagate) + if (FunctionSummary *FS = dyn_cast<FunctionSummary>(GS->second)) { + if (Function *F = dyn_cast<Function>(&GV)) { + // TODO: propagate ReadNone and ReadOnly. + if (FS->fflags().ReadNone && !F->doesNotAccessMemory()) + F->setDoesNotAccessMemory(); + + if (FS->fflags().ReadOnly && !F->onlyReadsMemory()) + F->setOnlyReadsMemory(); + + if (FS->fflags().NoRecurse && !F->doesNotRecurse()) + F->setDoesNotRecurse(); + + if (FS->fflags().NoUnwind && !F->doesNotThrow()) + F->setDoesNotThrow(); + } + } + auto NewLinkage = GS->second->linkage(); if (GlobalValue::isLocalLinkage(GV.getLinkage()) || // Don't internalize anything here, because the code below @@ -1105,11 +1136,11 @@ void llvm::thinLTOResolvePrevailingInModule( // Process functions and global now for (auto &GV : TheModule) - updateLinkage(GV); + FinalizeInModule(GV, PropagateAttrs); for (auto &GV : TheModule.globals()) - updateLinkage(GV); + FinalizeInModule(GV); for (auto &GV : TheModule.aliases()) - updateLinkage(GV); + FinalizeInModule(GV); } /// Run internalization on \p TheModule based on symmary analysis. @@ -1153,7 +1184,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, /// Make alias a clone of its aliasee. static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) { - Function *Fn = cast<Function>(GA->getBaseObject()); + Function *Fn = cast<Function>(GA->getAliaseeObject()); ValueToValueMapTy VMap; Function *NewFn = CloneFunction(Fn, VMap); @@ -1259,12 +1290,12 @@ Expected<bool> FunctionImporter::importFunctions( if (Error Err = GA.materialize()) return std::move(Err); // Import alias as a copy of its aliasee. - GlobalObject *Base = GA.getBaseObject(); - if (Error Err = Base->materialize()) + GlobalObject *GO = GA.getAliaseeObject(); + if (Error Err = GO->materialize()) return std::move(Err); auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA); - LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID() - << " " << Base->getName() << " from " + LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " " + << GO->getName() << " from " << SrcModule->getSourceFileName() << "\n"); if (EnableImportMetadata) { // Add 'thinlto_src_module' metadata for statistics and debugging. 
@@ -1303,7 +1334,7 @@ Expected<bool> FunctionImporter::importFunctions( std::move(SrcModule), GlobalsToImport.getArrayRef(), [](GlobalValue &, IRMover::ValueAdder) {}, /*IsPerformingImport=*/true)) - report_fatal_error("Function Import: link error: " + + report_fatal_error(Twine("Function Import: link error: ") + toString(std::move(Err))); ImportedCount += GlobalsToImport.size(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index f61f4312b777..fbd083bb9bbf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -11,7 +11,6 @@ // are propagated to the callee by specializing the function. // // Current limitations: -// - It does not handle specialization of recursive functions, // - It does not yet handle integer ranges. // - Only 1 argument per function is specialised, // - The cost-model could be further looked into, @@ -22,6 +21,18 @@ // a direct way to steer function specialization, avoiding the cost-model, // and thus control compile-times / code-size. // +// Todos: +// - Specializing recursive functions relies on running the transformation a +// number of times, which is controlled by option +// `func-specialization-max-iters`. Thus, increasing this value and the +// number of iterations, will linearly increase the number of times recursive +// functions get specialized, see also the discussion in +// https://reviews.llvm.org/D106426 for details. Perhaps there is a +// compile-time friendlier way to control/limit the number of specialisations +// for recursive functions. +// - Don't transform the function if there is no function specialization +// happens. +// //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" @@ -59,20 +70,166 @@ static cl::opt<unsigned> MaxConstantsThreshold( "specialization"), cl::init(3)); +static cl::opt<unsigned> SmallFunctionThreshold( + "func-specialization-size-threshold", cl::Hidden, + cl::desc("Don't specialize functions that have less than this theshold " + "number of instructions"), + cl::init(100)); + static cl::opt<unsigned> AvgLoopIterationCount("func-specialization-avg-iters-cost", cl::Hidden, cl::desc("Average loop iteration count cost"), cl::init(10)); +static cl::opt<bool> SpecializeOnAddresses( + "func-specialization-on-address", cl::init(false), cl::Hidden, + cl::desc("Enable function specialization on the address of global values")); + +// TODO: This needs checking to see the impact on compile-times, which is why +// this is off by default for now. static cl::opt<bool> EnableSpecializationForLiteralConstant( "function-specialization-for-literal-constant", cl::init(false), cl::Hidden, - cl::desc("Make function specialization available for literal constant.")); + cl::desc("Enable specialization of functions that take a literal constant " + "as an argument.")); + +// Helper to check if \p LV is either a constant or a constant +// range with a single element. This should cover exactly the same cases as the +// old ValueLatticeElement::isConstant() and is intended to be used in the +// transition to ValueLatticeElement. +static bool isConstant(const ValueLatticeElement &LV) { + return LV.isConstant() || + (LV.isConstantRange() && LV.getConstantRange().isSingleElement()); +} // Helper to check if \p LV is either overdefined or a constant int. 
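// [Editor's sketch, not part of the patch] The isConstant() helper above also
// accepts a constant range that holds exactly one value. With LLVM's
// ConstantRange, a half-open range whose bounds differ by one is such a
// singleton; this small demo assumes current LLVM headers:
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"

static bool singletonRangeDemo() {
  llvm::ConstantRange R(llvm::APInt(32, 7), llvm::APInt(32, 8)); // [7, 8)
  return R.isSingleElement() &&                  // true: only the value 7
         R.getSingleElement()->getZExtValue() == 7;
}
// [End of editor's sketch.]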
static bool isOverdefined(const ValueLatticeElement &LV) { - return !LV.isUnknownOrUndef() && !LV.isConstant(); + return !LV.isUnknownOrUndef() && !isConstant(LV); +} + +static Constant *getPromotableAlloca(AllocaInst *Alloca, CallInst *Call) { + Value *StoreValue = nullptr; + for (auto *User : Alloca->users()) { + // We can't use llvm::isAllocaPromotable() as that would fail because of + // the usage in the CallInst, which is what we check here. + if (User == Call) + continue; + if (auto *Bitcast = dyn_cast<BitCastInst>(User)) { + if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call) + return nullptr; + continue; + } + + if (auto *Store = dyn_cast<StoreInst>(User)) { + // This is a duplicate store, bail out. + if (StoreValue || Store->isVolatile()) + return nullptr; + StoreValue = Store->getValueOperand(); + continue; + } + // Bail if there is any other unknown usage. + return nullptr; + } + return dyn_cast_or_null<Constant>(StoreValue); } +// A constant stack value is an AllocaInst that has a single constant +// value stored to it. Return this constant if such an alloca stack value +// is a function argument. +static Constant *getConstantStackValue(CallInst *Call, Value *Val, + SCCPSolver &Solver) { + if (!Val) + return nullptr; + Val = Val->stripPointerCasts(); + if (auto *ConstVal = dyn_cast<ConstantInt>(Val)) + return ConstVal; + auto *Alloca = dyn_cast<AllocaInst>(Val); + if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy()) + return nullptr; + return getPromotableAlloca(Alloca, Call); +} + +// To support specializing recursive functions, it is important to propagate +// constant arguments because after a first iteration of specialisation, a +// reduced example may look like this: +// +// define internal void @RecursiveFn(i32* arg1) { +// %temp = alloca i32, align 4 +// store i32 2 i32* %temp, align 4 +// call void @RecursiveFn.1(i32* nonnull %temp) +// ret void +// } +// +// Before a next iteration, we need to propagate the constant like so +// which allows further specialization in next iterations. +// +// @funcspec.arg = internal constant i32 2 +// +// define internal void @someFunc(i32* arg1) { +// call void @otherFunc(i32* nonnull @funcspec.arg) +// ret void +// } +// +static void constantArgPropagation(SmallVectorImpl<Function *> &WorkList, + Module &M, SCCPSolver &Solver) { + // Iterate over the argument tracked functions see if there + // are any new constant values for the call instruction via + // stack variables. + for (auto *F : WorkList) { + // TODO: Generalize for any read only arguments. + if (F->arg_size() != 1) + continue; + + auto &Arg = *F->arg_begin(); + if (!Arg.onlyReadsMemory() || !Arg.getType()->isPointerTy()) + continue; + + for (auto *User : F->users()) { + auto *Call = dyn_cast<CallInst>(User); + if (!Call) + break; + auto *ArgOp = Call->getArgOperand(0); + auto *ArgOpType = ArgOp->getType(); + auto *ConstVal = getConstantStackValue(Call, ArgOp, Solver); + if (!ConstVal) + break; + + Value *GV = new GlobalVariable(M, ConstVal->getType(), true, + GlobalValue::InternalLinkage, ConstVal, + "funcspec.arg"); + + if (ArgOpType != ConstVal->getType()) + GV = ConstantExpr::getBitCast(cast<Constant>(GV), ArgOp->getType()); + + Call->setArgOperand(0, GV); + + // Add the changed CallInst to Solver Worklist + Solver.visitCall(*Call); + } + } +} + +// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics +// interfere with the constantArgPropagation optimization. 
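// [Editor's illustration, hypothetical source rather than patch content] The
// pattern getConstantStackValue() above recognizes corresponds to source like
// the following: a local with a single constant store whose address is the
// call argument, so the pointee value reaching the callee is the known
// constant 2 (the same shape as the IR example in the comment further up).
void callee(int *P);
void demoCaller() {
  int Tmp = 2;   // the alloca receives exactly one constant store
  callee(&Tmp);  // its address is passed, so the tracked constant is 2
}
// [End of editor's illustration.]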
+static void removeSSACopy(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &Inst : llvm::make_early_inc_range(BB)) { + auto *II = dyn_cast<IntrinsicInst>(&Inst); + if (!II) + continue; + if (II->getIntrinsicID() != Intrinsic::ssa_copy) + continue; + Inst.replaceAllUsesWith(II->getOperand(0)); + Inst.eraseFromParent(); + } + } +} + +static void removeSSACopy(Module &M) { + for (Function &F : M) + removeSSACopy(F); +} + +namespace { class FunctionSpecializer { /// The IPSCCP Solver. @@ -115,9 +272,14 @@ public: for (auto *SpecializedFunc : CurrentSpecializations) { SpecializedFuncs.insert(SpecializedFunc); - // TODO: If we want to support specializing specialized functions, - // initialize here the state of the newly created functions, marking - // them argument-tracked and executable. + // Initialize the state of the newly created functions, marking them + // argument-tracked and executable. + if (SpecializedFunc->hasExactDefinition() && + !SpecializedFunc->hasFnAttribute(Attribute::Naked)) + Solver.addTrackedFunction(SpecializedFunc); + Solver.addArgumentTrackedFunction(SpecializedFunc); + FuncDecls.push_back(SpecializedFunc); + Solver.markBlockExecutable(&SpecializedFunc->front()); // Replace the function arguments for the specialized functions. for (Argument &Arg : SpecializedFunc->args()) @@ -138,12 +300,22 @@ public: const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); if (isOverdefined(IV)) return false; - auto *Const = IV.isConstant() ? Solver.getConstant(IV) - : UndefValue::get(V->getType()); + auto *Const = + isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType()); V->replaceAllUsesWith(Const); - // TODO: Update the solver here if we want to specialize specialized - // functions. + for (auto *U : Const->users()) + if (auto *I = dyn_cast<Instruction>(U)) + if (Solver.isBlockExecutable(I->getParent())) + Solver.visit(I); + + // Remove the instruction from Block and Solver. + if (auto *I = dyn_cast<Instruction>(V)) { + if (I->isSafeToRemove()) { + I->eraseFromParent(); + Solver.removeLatticeValueFor(I); + } + } return true; } @@ -152,6 +324,15 @@ private: // also in the cost model. unsigned NbFunctionsSpecialized = 0; + /// Clone the function \p F and remove the ssa_copy intrinsics added by + /// the SCCPSolver in the cloned version. + Function *cloneCandidateFunction(Function *F) { + ValueToValueMapTy EmptyMap; + Function *Clone = CloneFunction(F, EmptyMap); + removeSSACopy(*Clone); + return Clone; + } + /// This function decides whether to specialize function \p F based on the /// known constant values its arguments can take on. Specialization is /// performed on the first interesting argument. Specializations based on @@ -162,9 +343,8 @@ private: SmallVectorImpl<Function *> &Specializations) { // Do not specialize the cloned function again. - if (SpecializedFuncs.contains(F)) { + if (SpecializedFuncs.contains(F)) return false; - } // If we're optimizing the function for size, we shouldn't specialize it. if (F->hasOptSize() || @@ -176,8 +356,25 @@ private: if (!Solver.isBlockExecutable(&F->getEntryBlock())) return false; + // It wastes time to specialize a function which would get inlined finally. + if (F->hasFnAttribute(Attribute::AlwaysInline)) + return false; + LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName() << "\n"); + + // Determine if it would be profitable to create a specialization of the + // function where the argument takes on the given constant value. If so, + // add the constant to Constants. 
+ auto FnSpecCost = getSpecializationCost(F); + if (!FnSpecCost.isValid()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: "; + FnSpecCost.print(dbgs()); dbgs() << "\n"); + // Determine if we should specialize the function based on the values the // argument can take on. If specialization is not profitable, we continue // on to the next argument. @@ -195,7 +392,7 @@ private: // be set to false by isArgumentInteresting (that function only adds // values to the Constants list that are deemed profitable). SmallVector<Constant *, 4> Constants; - if (!isArgumentInteresting(&A, Constants, IsPartial)) { + if (!isArgumentInteresting(&A, Constants, FnSpecCost, IsPartial)) { LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); continue; } @@ -214,8 +411,7 @@ private: for (auto *C : Constants) { // Clone the function. We leave the ValueToValueMap empty to allow // IPSCCP to propagate the constant arguments. - ValueToValueMapTy EmptyMap; - Function *Clone = CloneFunction(F, EmptyMap); + Function *Clone = cloneCandidateFunction(F); Argument *ClonedArg = Clone->arg_begin() + A.getArgNo(); // Rewrite calls to the function so that they call the clone instead. @@ -231,9 +427,10 @@ private: NbFunctionsSpecialized++; } - // TODO: if we want to support specialize specialized functions, and if - // the function has been completely specialized, the original function is - // no longer needed, so we would need to mark it unreachable here. + // If the function has been completely specialized, the original function + // is no longer needed. Mark it unreachable. + if (!IsPartial) + Solver.markFunctionUnreachable(F); // FIXME: Only one argument per function. return true; @@ -253,7 +450,11 @@ private: // If the code metrics reveal that we shouldn't duplicate the function, we // shouldn't specialize it. Set the specialization cost to Invalid. - if (Metrics.notDuplicatable) { + // Or if the lines of codes implies that this function is easy to get + // inlined so that we shouldn't specialize it. + if (Metrics.notDuplicatable || + (!ForceFunctionSpecialization && + Metrics.NumInsts < SmallFunctionThreshold)) { InstructionCost C{}; C.setInvalid(); return C; @@ -379,9 +580,8 @@ private: /// argument. bool isArgumentInteresting(Argument *A, SmallVectorImpl<Constant *> &Constants, + const InstructionCost &FnSpecCost, bool &IsPartial) { - Function *F = A->getParent(); - // For now, don't attempt to specialize functions based on the values of // composite types. if (!A->getType()->isSingleValueType() || A->user_empty()) @@ -420,18 +620,6 @@ private: return false; } - // Determine if it would be profitable to create a specialization of the - // function where the argument takes on the given constant value. If so, - // add the constant to Constants. - auto FnSpecCost = getSpecializationCost(F); - if (!FnSpecCost.isValid()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: "; - FnSpecCost.print(dbgs()); dbgs() << "\n"); - for (auto *C : PossibleConstants) { LLVM_DEBUG(dbgs() << "FnSpecialization: Constant: " << *C << "\n"); if (ForceFunctionSpecialization) { @@ -475,6 +663,12 @@ private: if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) continue; auto &CS = *cast<CallBase>(U); + // If the call site has attribute minsize set, that callsite won't be + // specialized. 
+ if (CS.hasFnAttr(Attribute::MinSize)) { + AllConstant = false; + continue; + } // If the parent of the call site will never be executed, we don't need // to worry about the passed value. @@ -482,11 +676,25 @@ private: continue; auto *V = CS.getArgOperand(A->getArgNo()); + if (isa<PoisonValue>(V)) + return false; + + // For now, constant expressions are fine but only if they are function + // calls. + if (auto *CE = dyn_cast<ConstantExpr>(V)) + if (!isa<Function>(CE->getOperand(0))) + return false; + // TrackValueOfGlobalVariable only tracks scalar global variables. if (auto *GV = dyn_cast<GlobalVariable>(V)) { - if (!GV->getValueType()->isSingleValueType()) { + // Check if we want to specialize on the address of non-constant + // global values. + if (!GV->isConstant()) + if (!SpecializeOnAddresses) + return false; + + if (!GV->getValueType()->isSingleValueType()) return false; - } } if (isa<Constant>(V) && (Solver.getLatticeValueFor(V).isConstant() || @@ -506,6 +714,9 @@ private: /// This function modifies calls to function \p F whose argument at index \p /// ArgNo is equal to constant \p C. The calls are rewritten to call function /// \p Clone instead. + /// + /// Callsites that have been marked with the MinSize function attribute won't + /// be specialized and rewritten. void rewriteCallSites(Function *F, Function *Clone, Argument &Arg, Constant *C) { unsigned ArgNo = Arg.getArgNo(); @@ -527,24 +738,7 @@ private: } } }; - -/// Function to clean up the left over intrinsics from SCCP util. -static void cleanup(Module &M) { - for (Function &F : M) { - for (BasicBlock &BB : F) { - for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { - Instruction *Inst = &*BI++; - if (auto *II = dyn_cast<IntrinsicInst>(Inst)) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) { - Value *Op = II->getOperand(0); - Inst->replaceAllUsesWith(Op); - Inst->eraseFromParent(); - } - } - } - } - } -} +} // namespace bool llvm::runFunctionSpecialization( Module &M, const DataLayout &DL, @@ -597,12 +791,27 @@ bool llvm::runFunctionSpecialization( Solver.trackValueOfGlobalVariable(&G); } + auto &TrackedFuncs = Solver.getArgumentTrackedFunctions(); + SmallVector<Function *, 16> FuncDecls(TrackedFuncs.begin(), + TrackedFuncs.end()); + + // No tracked functions, so nothing to do: don't run the solver and remove + // the ssa_copy intrinsics that may have been introduced. + if (TrackedFuncs.empty()) { + removeSSACopy(M); + return false; + } + // Solve for constants. auto RunSCCPSolver = [&](auto &WorkList) { bool ResolvedUndefs = true; while (ResolvedUndefs) { + // Not running the solver unnecessary is checked in regression test + // nothing-to-do.ll, so if this debug message is changed, this regression + // test needs updating too. LLVM_DEBUG(dbgs() << "FnSpecialization: Running solver\n"); + Solver.solve(); LLVM_DEBUG(dbgs() << "FnSpecialization: Resolving undefs\n"); ResolvedUndefs = false; @@ -615,15 +824,14 @@ bool llvm::runFunctionSpecialization( for (BasicBlock &BB : *F) { if (!Solver.isBlockExecutable(&BB)) continue; + // FIXME: The solver may make changes to the function here, so set + // Changed, even if later function specialization does not trigger. 
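
The RunSCCPSolver helper defined just above alternates a solve step with an attempt to resolve leftover undefs, and stops only when an undef-resolution pass makes no further progress. A simplified, self-contained illustration of that fixed-point driver (toy solver type, not the LLVM SCCPSolver interface):

#include <cstdio>

// Toy stand-in for a dataflow solver: counts down "unresolved" facts.
struct ToySolver {
  int Unresolved = 3;
  void solve() { /* propagate known facts; no-op in this toy */ }
  // Returns true if it forced at least one unresolved fact to a default,
  // which may enable further propagation on the next iteration.
  bool resolvedUndefs() {
    if (Unresolved == 0)
      return false;
    --Unresolved;
    return true;
  }
};

// Same control flow as the RunSCCPSolver lambda: keep alternating the solve
// step and undef resolution until the latter makes no progress.
static void runToFixedPoint(ToySolver &S) {
  bool ResolvedUndefs = true;
  int Iterations = 0;
  while (ResolvedUndefs) {
    S.solve();
    ResolvedUndefs = S.resolvedUndefs();
    ++Iterations;
  }
  std::printf("reached fixed point after %d iterations\n", Iterations);
}

int main() {
  ToySolver S;
  runToFixedPoint(S); // prints: reached fixed point after 4 iterations
  return 0;
}
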
for (auto &I : make_early_inc_range(BB)) - FS.tryToReplaceWithConstant(&I); + Changed |= FS.tryToReplaceWithConstant(&I); } } }; - auto &TrackedFuncs = Solver.getArgumentTrackedFunctions(); - SmallVector<Function *, 16> FuncDecls(TrackedFuncs.begin(), - TrackedFuncs.end()); #ifndef NDEBUG LLVM_DEBUG(dbgs() << "FnSpecialization: Worklist fn decls:\n"); for (auto *F : FuncDecls) @@ -637,14 +845,18 @@ bool llvm::runFunctionSpecialization( unsigned I = 0; while (FuncSpecializationMaxIters != I++ && FS.specializeFunctions(FuncDecls, CurrentSpecializations)) { - // TODO: run the solver here for the specialized functions only if we want - // to specialize recursively. + + // Run the solver for the specialized functions. + RunSCCPSolver(CurrentSpecializations); + + // Replace some unresolved constant arguments. + constantArgPropagation(FuncDecls, M, Solver); CurrentSpecializations.clear(); Changed = true; } // Clean up the IR by removing ssa_copy intrinsics. - cleanup(M); + removeSSACopy(M); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp index fb4cb23b837e..5e5d2086adc2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -88,7 +88,7 @@ ModulePass *llvm::createGlobalDCEPass() { static bool isEmptyFunction(Function *F) { BasicBlock &Entry = F->getEntryBlock(); for (auto &I : Entry) { - if (isa<DbgInfoIntrinsic>(I)) + if (I.isDebugOrPseudoInst()) continue; if (auto *RI = dyn_cast<ReturnInst>(&I)) return !RI->getReturnValue(); @@ -210,7 +210,7 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, Constant *Ptr = getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset, - *Caller->getParent()); + *Caller->getParent(), VTable); if (!Ptr) { LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n"); VFESafeVTables.erase(VTable); @@ -416,6 +416,16 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // virtual function pointers with null, allowing us to remove the // function itself. ++NumVFuncs; + + // Detect vfuncs that are referenced as "relative pointers" which are used + // in Swift vtables, i.e. entries in the form of: + // + // i32 trunc (i64 sub (i64 ptrtoint @f, i64 ptrtoint ...)) to i32) + // + // In this case, replace the whole "sub" expression with constant 0 to + // avoid leaving a weird sub(0, symbol) expression behind. + replaceRelativePointerUsersWithZero(F); + F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType())); } EraseUnusedGlobalValue(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 8750eb9ecc4e..b2c2efed7db8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -208,9 +208,7 @@ CleanupPointerRootUsers(GlobalVariable *GV, SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead; // Constants can't be pointers to dynamically allocated memory. 
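
The GlobalDCE change above handles vtable entries stored as 32-bit relative pointers, the i32 trunc(sub(ptrtoint @f, ptrtoint anchor)) form used by Swift vtables, by zeroing them before the pointed-to function is erased. For readers unfamiliar with the encoding, here is a standalone sketch of how a relative pointer is formed and followed; it is plain C++ and does not touch the LLVM constant-expression APIs.

#include <cstdint>
#include <cstdio>

// A relative pointer stores a signed 32-bit offset from the slot that holds
// it to the thing it points at; following it means "address of slot + offset".

static int32_t encodeRelative(const void *Slot, const void *Target) {
  // Assumes the two symbols lie within +/-2GB of each other, which is also
  // what the i32 truncation in the IR relies on.
  return static_cast<int32_t>(reinterpret_cast<intptr_t>(Target) -
                              reinterpret_cast<intptr_t>(Slot));
}

static const void *decodeRelative(const int32_t *Slot) {
  return reinterpret_cast<const char *>(Slot) + *Slot;
}

static int TargetValue = 7;        // stand-in for a virtual function symbol
static int32_t Slot;               // one relative "vtable" entry

int main() {
  Slot = encodeRelative(&Slot, &TargetValue);
  const int *Resolved = static_cast<const int *>(decodeRelative(&Slot));
  std::printf("%d\n", *Resolved);  // prints 7
  // Erasing the target is only safe once the slot is rewritten (e.g. to 0),
  // which is what the replaceRelativePointerUsersWithZero call arranges.
  return 0;
}

Because the offset is measured from the slot itself, the entry stays valid wherever the containing object is placed, which is exactly why a dangling entry cannot simply be left behind when the callee is deleted.
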
- for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end(); - UI != E;) { - User *U = *UI++; + for (User *U : llvm::make_early_inc_range(GV->users())) { if (StoreInst *SI = dyn_cast<StoreInst>(U)) { Value *V = SI->getValueOperand(); if (isa<Constant>(V)) { @@ -703,8 +701,9 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, !ICmpInst::isSigned(cast<ICmpInst>(U)->getPredicate()) && isa<LoadInst>(U->getOperand(0)) && isa<ConstantPointerNull>(U->getOperand(1))) { - assert(isa<GlobalValue>( - cast<LoadInst>(U->getOperand(0))->getPointerOperand()) && + assert(isa<GlobalValue>(cast<LoadInst>(U->getOperand(0)) + ->getPointerOperand() + ->stripPointerCasts()) && "Should be GlobalVariable"); // This and only this kind of non-signed ICmpInst is to be replaced with // the comparing of the value of the created global init bool later in @@ -720,22 +719,55 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, /// Return true if all uses of any loads from GV will trap if the loaded value /// is null. Note that this also permits comparisons of the loaded value /// against null, as a special case. -static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { - for (const User *U : GV->users()) - if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { - SmallPtrSet<const PHINode*, 8> PHIs; - if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) +static bool allUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(GV); + while (!Worklist.empty()) { + const Value *P = Worklist.pop_back_val(); + for (auto *U : P->users()) { + if (auto *LI = dyn_cast<LoadInst>(U)) { + SmallPtrSet<const PHINode *, 8> PHIs; + if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) + return false; + } else if (auto *SI = dyn_cast<StoreInst>(U)) { + // Ignore stores to the global. + if (SI->getPointerOperand() != P) + return false; + } else if (auto *CE = dyn_cast<ConstantExpr>(U)) { + if (CE->stripPointerCasts() != GV) + return false; + // Check further the ConstantExpr. + Worklist.push_back(CE); + } else { + // We don't know or understand this user, bail out. return false; - } else if (isa<StoreInst>(U)) { - // Ignore stores to the global. - } else { - // We don't know or understand this user, bail out. - //cerr << "UNKNOWN USER OF GLOBAL!: " << *U; - return false; + } } + } + return true; } +/// Get all the loads/store uses for global variable \p GV. +static void allUsesOfLoadAndStores(GlobalVariable *GV, + SmallVector<Value *, 4> &Uses) { + SmallVector<Value *, 4> Worklist; + Worklist.push_back(GV); + while (!Worklist.empty()) { + auto *P = Worklist.pop_back_val(); + for (auto *U : P->users()) { + if (auto *CE = dyn_cast<ConstantExpr>(U)) { + Worklist.push_back(CE); + continue; + } + + assert((isa<LoadInst>(U) || isa<StoreInst>(U)) && + "Expect only load or store instructions"); + Uses.push_back(U); + } + } +} + static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { bool Changed = false; for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) { @@ -817,8 +849,7 @@ static bool OptimizeAwayTrappingUsesOfLoads( bool AllNonStoreUsesGone = true; // Replace all uses of loads with uses of uses of the stored value. 
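
Both new helpers above, allUsesOfLoadedValueWillTrapIfNull and allUsesOfLoadAndStores, replace a direct scan of GV->users() with an explicit worklist so that uses reached through ConstantExpr casts and GEPs are visited as if they used the global directly. A stripped-down version of that traversal over a toy use graph (plain C++, hypothetical node kinds, no LLVM types; the visited set is included here for generality):

#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

// Toy use graph: a node is either a "leaf" use (load/store-like) or a
// "transparent" wrapper (constant-expression cast/GEP-like) whose own users
// must be visited as if they used the original value directly.
struct UseNode {
  bool Transparent = false;
  std::string Name;
  std::vector<const UseNode *> Users;
};

// Collect every leaf use reachable from Root, looking through transparent
// wrappers. The visited set guards against revisiting shared nodes.
static std::vector<std::string> collectLeafUses(const UseNode &Root) {
  std::vector<std::string> Leaves;
  std::vector<const UseNode *> Worklist{&Root};
  std::unordered_set<const UseNode *> Visited;
  while (!Worklist.empty()) {
    const UseNode *N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue;
    for (const UseNode *U : N->Users) {
      if (U->Transparent)
        Worklist.push_back(U);   // step through the wrapper
      else
        Leaves.push_back(U->Name);
    }
  }
  return Leaves;
}

int main() {
  UseNode Load{false, "load", {}};
  UseNode Store{false, "store", {}};
  UseNode Cast{true, "bitcast", {&Store}};
  UseNode GV{false, "gv", {&Load, &Cast}};
  for (const std::string &S : collectLeafUses(GV))
    std::printf("%s\n", S.c_str());   // prints "load" then "store"
  return 0;
}
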
- for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){ - User *GlobalUser = *GUI++; + for (User *GlobalUser : llvm::make_early_inc_range(GV->users())) { if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) { Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV); // If we were able to delete all uses of the loads @@ -934,9 +965,8 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, } } - Constant *RepValue = NewGV; - if (NewGV->getType() != GV->getValueType()) - RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType()); + SmallPtrSet<Constant *, 1> RepValues; + RepValues.insert(NewGV); // If there is a comparison against null, we will insert a global bool to // keep track of whether the global was initialized yet or not. @@ -947,9 +977,11 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, GV->getName()+".init", GV->getThreadLocalMode()); bool InitBoolUsed = false; - // Loop over all uses of GV, processing them in turn. - while (!GV->use_empty()) { - if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) { + // Loop over all instruction uses of GV, processing them in turn. + SmallVector<Value *, 4> Guses; + allUsesOfLoadAndStores(GV, Guses); + for (auto *U : Guses) { + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // The global is initialized when the store to it occurs. If the stored // value is null value, the global bool is set to false, otherwise true. new StoreInst(ConstantInt::getBool( @@ -961,12 +993,14 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, continue; } - LoadInst *LI = cast<LoadInst>(GV->user_back()); + LoadInst *LI = cast<LoadInst>(U); while (!LI->use_empty()) { Use &LoadUse = *LI->use_begin(); ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser()); if (!ICI) { - LoadUse = RepValue; + auto *CE = ConstantExpr::getBitCast(NewGV, LI->getType()); + RepValues.insert(CE); + LoadUse.set(CE); continue; } @@ -1012,40 +1046,53 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // To further other optimizations, loop over all users of NewGV and try to // constant prop them. This will promote GEP instructions with constant // indices into GEP constant-exprs, which will allow global-opt to hack on it. - ConstantPropUsersOf(NewGV, DL, TLI); - if (RepValue != NewGV) - ConstantPropUsersOf(RepValue, DL, TLI); + for (auto *CE : RepValues) + ConstantPropUsersOf(CE, DL, TLI); return NewGV; } -/// Scan the use-list of V checking to make sure that there are no complex uses -/// of V. We permit simple things like dereferencing the pointer, but not +/// Scan the use-list of GV checking to make sure that there are no complex uses +/// of GV. We permit simple things like dereferencing the pointer, but not /// storing through the address, unless it is to the specified global. static bool -valueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, +valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI, const GlobalVariable *GV) { - for (const User *U : V->users()) { - const Instruction *Inst = cast<Instruction>(U); + SmallPtrSet<const Value *, 4> Visited; + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(CI); - if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) { - continue; // Fine, ignore. 
- } + while (!Worklist.empty()) { + const Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; - if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (SI->getOperand(0) == V && SI->getOperand(1) != GV) - return false; // Storing the pointer itself... bad. - continue; // Otherwise, storing through it, or storing into GV... fine. - } + for (const Use &VUse : V->uses()) { + const User *U = VUse.getUser(); + if (isa<LoadInst>(U) || isa<CmpInst>(U)) + continue; // Fine, ignore. - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) { - if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV)) - return false; - continue; - } + if (auto *SI = dyn_cast<StoreInst>(U)) { + if (SI->getValueOperand() == V && + SI->getPointerOperand()->stripPointerCasts() != GV) + return false; // Storing the pointer not into GV... bad. + continue; // Otherwise, storing through it, or storing into GV... fine. + } - return false; + if (auto *BCI = dyn_cast<BitCastInst>(U)) { + Worklist.push_back(BCI); + continue; + } + + if (auto *GEPI = dyn_cast<GetElementPtrInst>(U)) { + Worklist.push_back(GEPI); + continue; + } + + return false; + } } + return true; } @@ -1066,12 +1113,12 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, // been reached). To do this, we check to see if all uses of the global // would trap if the global were null: this proves that they must all // happen after the malloc. - if (!AllUsesOfLoadedValueWillTrapIfNull(GV)) + if (!allUsesOfLoadedValueWillTrapIfNull(GV)) return false; // We can't optimize this if the malloc itself is used in a complex way, // for example, being stored into multiple globals. This allows the - // malloc to be stored into the specified global, loaded icmp'd. + // malloc to be stored into the specified global, loaded, gep, icmp'd. // These are all things we could transform to using the global for. if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV)) return false; @@ -1112,6 +1159,7 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, // value was null. if (GV->getInitializer()->getType()->isPointerTy() && GV->getInitializer()->isNullValue() && + StoredOnceVal->getType()->isPointerTy() && !NullPointerIsDefined( nullptr /* F */, GV->getInitializer()->getType()->getPointerAddressSpace())) { @@ -1442,8 +1490,7 @@ static void makeAllConstantUsesInstructions(Constant *C) { append_range(UUsers, U->users()); for (auto *UU : UUsers) { Instruction *UI = cast<Instruction>(UU); - Instruction *NewU = U->getAsInstruction(); - NewU->insertBefore(UI); + Instruction *NewU = U->getAsInstruction(UI); UI->replaceUsesOfWith(U, NewU); } // We've replaced all the uses, so destroy the constant. (destroyConstant @@ -1456,6 +1503,7 @@ static void makeAllConstantUsesInstructions(Constant *C) { /// it if possible. If we make a change, return true. 
static bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, + function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree) { auto &DL = GV->getParent()->getDataLayout(); @@ -1554,43 +1602,57 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, if (SRAGlobal(GV, DL)) return true; } - if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) { + Value *StoredOnceValue = GS.getStoredOnceValue(); + if (GS.StoredType == GlobalStatus::StoredOnce && StoredOnceValue) { + // Avoid speculating constant expressions that might trap (div/rem). + auto *SOVConstant = dyn_cast<Constant>(StoredOnceValue); + if (SOVConstant && SOVConstant->canTrap()) + return Changed; + + Function &StoreFn = + const_cast<Function &>(*GS.StoredOnceStore->getFunction()); + bool CanHaveNonUndefGlobalInitializer = + GetTTI(StoreFn).canHaveNonUndefGlobalInitializerInAddressSpace( + GV->getType()->getAddressSpace()); // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the // global. This allows us to mark it constant. - if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) - if (isa<UndefValue>(GV->getInitializer())) { - // Change the initial value here. - GV->setInitializer(SOVConstant); - - // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); - - if (GV->use_empty()) { - LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " - << "simplify all users and delete global!\n"); - GV->eraseFromParent(); - ++NumDeleted; - } - ++NumSubstitute; - return true; + // This is restricted to address spaces that allow globals to have + // initializers. NVPTX, for example, does not support initializers for + // shared memory (AS 3). + if (SOVConstant && SOVConstant->getType() == GV->getValueType() && + isa<UndefValue>(GV->getInitializer()) && + CanHaveNonUndefGlobalInitializer) { + // Change the initial value here. + GV->setInitializer(SOVConstant); + + // Clean up any obviously simplifiable users now. + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + + if (GV->use_empty()) { + LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " + << "simplify all users and delete global!\n"); + GV->eraseFromParent(); + ++NumDeleted; } + ++NumSubstitute; + return true; + } // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, - GetTLI)) + if (optimizeOnceStoredGlobal(GV, StoredOnceValue, GS.Ordering, DL, GetTLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a - // boolean. - if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) { - if (GS.Ordering == AtomicOrdering::NotAtomic) { - if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { - ++NumShrunkToBool; - return true; - } + // boolean. Skip this optimization for AS that doesn't allow an initializer. 
+ if (SOVConstant && GS.Ordering == AtomicOrdering::NotAtomic && + (!isa<UndefValue>(GV->getInitializer()) || + CanHaveNonUndefGlobalInitializer)) { + if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { + ++NumShrunkToBool; + return true; } } } @@ -1602,6 +1664,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, /// make a change, return true. static bool processGlobal(GlobalValue &GV, + function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree) { if (GV.getName().startswith("llvm.")) @@ -1634,7 +1697,8 @@ processGlobal(GlobalValue &GV, if (GVar->isConstant() || !GVar->hasInitializer()) return Changed; - return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed; + return processInternalGlobal(GVar, GS, GetTTI, GetTLI, LookupDomTree) || + Changed; } /// Walk all of the direct calls of the specified function, changing them to @@ -1651,7 +1715,7 @@ static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs, Attribute::AttrKind A) { unsigned AttrIndex; if (Attrs.hasAttrSomewhere(A, &AttrIndex)) - return Attrs.removeAttribute(C, AttrIndex, A); + return Attrs.removeAttributeAtIndex(C, AttrIndex, A); return Attrs; } @@ -1864,10 +1928,8 @@ static void RemovePreallocated(Function *F) { Value *AllocaReplacement = ArgAllocas[AllocArgIndex]; if (!AllocaReplacement) { auto AddressSpace = UseCall->getType()->getPointerAddressSpace(); - auto *ArgType = UseCall - ->getAttribute(AttributeList::FunctionIndex, - Attribute::Preallocated) - .getValueAsType(); + auto *ArgType = + UseCall->getFnAttr(Attribute::Preallocated).getValueAsType(); auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction(); Builder.SetInsertPoint(InsertBefore); auto *Alloca = @@ -1897,26 +1959,22 @@ OptimizeFunctions(Module &M, bool Changed = false; std::vector<Function *> AllCallsCold; - for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) { - Function *F = &*FI++; - if (hasOnlyColdCalls(*F, GetBFI)) - AllCallsCold.push_back(F); - } + for (Function &F : llvm::make_early_inc_range(M)) + if (hasOnlyColdCalls(F, GetBFI)) + AllCallsCold.push_back(&F); // Optimize functions. - for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { - Function *F = &*FI++; - + for (Function &F : llvm::make_early_inc_range(M)) { // Don't perform global opt pass on naked functions; we don't want fast // calling conventions for naked functions. - if (F->hasFnAttribute(Attribute::Naked)) + if (F.hasFnAttribute(Attribute::Naked)) continue; // Functions without names cannot be referenced outside this module. - if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) - F->setLinkage(GlobalValue::InternalLinkage); + if (!F.hasName() && !F.isDeclaration() && !F.hasLocalLinkage()) + F.setLinkage(GlobalValue::InternalLinkage); - if (deleteIfDead(*F, NotDiscardableComdats)) { + if (deleteIfDead(F, NotDiscardableComdats)) { Changed = true; continue; } @@ -1931,17 +1989,17 @@ OptimizeFunctions(Module &M, // some more complicated logic to break these cycles. // Removing unreachable blocks might invalidate the dominator so we // recalculate it. 
- if (!F->isDeclaration()) { - if (removeUnreachableBlocks(*F)) { - auto &DT = LookupDomTree(*F); - DT.recalculate(*F); + if (!F.isDeclaration()) { + if (removeUnreachableBlocks(F)) { + auto &DT = LookupDomTree(F); + DT.recalculate(F); Changed = true; } } - Changed |= processGlobal(*F, GetTLI, LookupDomTree); + Changed |= processGlobal(F, GetTTI, GetTLI, LookupDomTree); - if (!F->hasLocalLinkage()) + if (!F.hasLocalLinkage()) continue; // If we have an inalloca parameter that we can safely remove the @@ -1949,56 +2007,55 @@ OptimizeFunctions(Module &M, // wouldn't be safe in the presence of inalloca. // FIXME: We should also hoist alloca affected by this to the entry // block if possible. - if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) && - !F->hasAddressTaken() && !hasMustTailCallers(F)) { - RemoveAttribute(F, Attribute::InAlloca); + if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) && + !F.hasAddressTaken() && !hasMustTailCallers(&F)) { + RemoveAttribute(&F, Attribute::InAlloca); Changed = true; } // FIXME: handle invokes // FIXME: handle musttail - if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) { - if (!F->hasAddressTaken() && !hasMustTailCallers(F) && - !hasInvokeCallers(F)) { - RemovePreallocated(F); + if (F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) { + if (!F.hasAddressTaken() && !hasMustTailCallers(&F) && + !hasInvokeCallers(&F)) { + RemovePreallocated(&F); Changed = true; } continue; } - if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) { + if (hasChangeableCC(&F) && !F.isVarArg() && !F.hasAddressTaken()) { NumInternalFunc++; - TargetTransformInfo &TTI = GetTTI(*F); + TargetTransformInfo &TTI = GetTTI(F); // Change the calling convention to coldcc if either stress testing is // enabled or the target would like to use coldcc on functions which are // cold at all call sites and the callers contain no other non coldcc // calls. if (EnableColdCCStressTest || - (TTI.useColdCCForColdCall(*F) && - isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) { - F->setCallingConv(CallingConv::Cold); - changeCallSitesToColdCC(F); + (TTI.useColdCCForColdCall(F) && + isValidCandidateForColdCC(F, GetBFI, AllCallsCold))) { + F.setCallingConv(CallingConv::Cold); + changeCallSitesToColdCC(&F); Changed = true; NumColdCC++; } } - if (hasChangeableCC(F) && !F->isVarArg() && - !F->hasAddressTaken()) { + if (hasChangeableCC(&F) && !F.isVarArg() && !F.hasAddressTaken()) { // If this function has a calling convention worth changing, is not a // varargs function, and is only called directly, promote it to use the // Fast calling convention. - F->setCallingConv(CallingConv::Fast); - ChangeCalleesToFastCall(F); + F.setCallingConv(CallingConv::Fast); + ChangeCalleesToFastCall(&F); ++NumFastCallFns; Changed = true; } - if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && - !F->hasAddressTaken()) { + if (F.getAttributes().hasAttrSomewhere(Attribute::Nest) && + !F.hasAddressTaken()) { // The function is not used by a trampoline intrinsic, so it is safe // to remove the 'nest' attribute. 
- RemoveAttribute(F, Attribute::Nest); + RemoveAttribute(&F, Attribute::Nest); ++NumNestRemoved; Changed = true; } @@ -2008,35 +2065,34 @@ OptimizeFunctions(Module &M, static bool OptimizeGlobalVars(Module &M, + function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) { bool Changed = false; - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ) { - GlobalVariable *GV = &*GVI++; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { // Global variables without names cannot be referenced outside this module. - if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) - GV->setLinkage(GlobalValue::InternalLinkage); + if (!GV.hasName() && !GV.isDeclaration() && !GV.hasLocalLinkage()) + GV.setLinkage(GlobalValue::InternalLinkage); // Simplify the initializer. - if (GV->hasInitializer()) - if (auto *C = dyn_cast<Constant>(GV->getInitializer())) { + if (GV.hasInitializer()) + if (auto *C = dyn_cast<Constant>(GV.getInitializer())) { auto &DL = M.getDataLayout(); // TLI is not used in the case of a Constant, so use default nullptr // for that optional parameter, since we don't have a Function to // provide GetTLI anyway. Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr); if (New != C) - GV->setInitializer(New); + GV.setInitializer(New); } - if (deleteIfDead(*GV, NotDiscardableComdats)) { + if (deleteIfDead(GV, NotDiscardableComdats)) { Changed = true; continue; } - Changed |= processGlobal(*GV, GetTLI, LookupDomTree); + Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree); } return Changed; } @@ -2425,24 +2481,21 @@ OptimizeGlobalAliases(Module &M, for (GlobalValue *GV : Used.used()) Used.compilerUsedErase(GV); - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E;) { - GlobalAlias *J = &*I++; - + for (GlobalAlias &J : llvm::make_early_inc_range(M.aliases())) { // Aliases without names cannot be referenced outside this module. - if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) - J->setLinkage(GlobalValue::InternalLinkage); + if (!J.hasName() && !J.isDeclaration() && !J.hasLocalLinkage()) + J.setLinkage(GlobalValue::InternalLinkage); - if (deleteIfDead(*J, NotDiscardableComdats)) { + if (deleteIfDead(J, NotDiscardableComdats)) { Changed = true; continue; } // If the alias can change at link time, nothing can be done - bail out. - if (J->isInterposable()) + if (J.isInterposable()) continue; - Constant *Aliasee = J->getAliasee(); + Constant *Aliasee = J.getAliasee(); GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts()); // We can't trivially replace the alias with the aliasee if the aliasee is // non-trivial in some way. We also can't replace the alias with the aliasee @@ -2455,31 +2508,31 @@ OptimizeGlobalAliases(Module &M, // Make all users of the alias use the aliasee instead. bool RenameTarget; - if (!hasUsesToReplace(*J, Used, RenameTarget)) + if (!hasUsesToReplace(J, Used, RenameTarget)) continue; - J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType())); + J.replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J.getType())); ++NumAliasesResolved; Changed = true; if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. 
- Target->takeName(&*J); - Target->setLinkage(J->getLinkage()); - Target->setDSOLocal(J->isDSOLocal()); - Target->setVisibility(J->getVisibility()); - Target->setDLLStorageClass(J->getDLLStorageClass()); + Target->takeName(&J); + Target->setLinkage(J.getLinkage()); + Target->setDSOLocal(J.isDSOLocal()); + Target->setVisibility(J.getVisibility()); + Target->setDLLStorageClass(J.getDLLStorageClass()); - if (Used.usedErase(&*J)) + if (Used.usedErase(&J)) Used.usedInsert(Target); - if (Used.compilerUsedErase(&*J)) + if (Used.compilerUsedErase(&J)) Used.compilerUsedInsert(Target); - } else if (mayHaveOtherReferences(*J, Used)) + } else if (mayHaveOtherReferences(J, Used)) continue; // Delete the alias. - M.getAliasList().erase(J); + M.getAliasList().erase(&J); ++NumAliasesRemoved; Changed = true; } @@ -2526,7 +2579,7 @@ static bool cxxDtorIsEmpty(const Function &Fn) { return false; for (auto &I : Fn.getEntryBlock()) { - if (isa<DbgInfoIntrinsic>(I)) + if (I.isDebugOrPseudoInst()) continue; if (isa<ReturnInst>(I)) return true; @@ -2552,12 +2605,11 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { // and remove them. bool Changed = false; - for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end(); - I != E;) { + for (User *U : llvm::make_early_inc_range(CXAAtExitFn->users())) { // We're only interested in calls. Theoretically, we could handle invoke // instructions as well, but neither llvm-gcc nor clang generate invokes // to __cxa_atexit. - CallInst *CI = dyn_cast<CallInst>(*I++); + CallInst *CI = dyn_cast<CallInst>(U); if (!CI) continue; @@ -2614,8 +2666,8 @@ static bool optimizeGlobalsInModule( }); // Optimize non-address-taken globals. - LocalChange |= - OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats); + LocalChange |= OptimizeGlobalVars(M, GetTTI, GetTLI, LookupDomTree, + NotDiscardableComdats); // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp index 365b269dc3bf..e7d698c42fcf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -154,11 +154,8 @@ static bool splitGlobals(Module &M) { return false; bool Changed = false; - for (auto I = M.global_begin(); I != M.global_end();) { - GlobalVariable &GV = *I; - ++I; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) Changed |= splitGlobal(GV); - } return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp index adf9ffba5780..b8a314c54f18 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DIBuilder.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" @@ -33,6 +34,10 @@ using namespace llvm; using namespace IRSimilarity; +// A command flag to be used for debugging to exclude branches from similarity +// matching and outlining. +extern cl::opt<bool> DisableBranches; + // Set to true if the user wants the ir outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr // functions. 
Since the outliner is confined to a single module (modulo LTO), @@ -71,8 +76,12 @@ struct OutlinableGroup { /// for extraction. bool IgnoreGroup = false; - /// The return block for the overall function. - BasicBlock *EndBB = nullptr; + /// The return blocks for the overall function. + DenseMap<Value *, BasicBlock *> EndBBs; + + /// The PHIBlocks with their corresponding return block based on the return + /// value as the key. + DenseMap<Value *, BasicBlock *> PHIBlocks; /// A set containing the different GVN store sets needed. Each array contains /// a sorted list of the different values that need to be stored into output @@ -87,6 +96,14 @@ struct OutlinableGroup { /// index in ArgumentTypes is an output argument. unsigned NumAggregateInputs = 0; + /// The mapping of the canonical numbering of the values in outlined sections + /// to specific arguments. + DenseMap<unsigned, unsigned> CanonicalNumberToAggArg; + + /// The number of branches in the region target a basic block that is outside + /// of the region. + unsigned BranchesToOutside = 0; + /// The number of instructions that will be outlined by extracting \ref /// Regions. InstructionCost Benefit = 0; @@ -118,20 +135,67 @@ struct OutlinableGroup { /// \param SourceBB - the BasicBlock to pull Instructions from. /// \param TargetBB - the BasicBlock to put Instruction into. static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB) { - BasicBlock::iterator BBCurr, BBEnd, BBNext; - for (BBCurr = SourceBB.begin(), BBEnd = SourceBB.end(); BBCurr != BBEnd; - BBCurr = BBNext) { - BBNext = std::next(BBCurr); - BBCurr->moveBefore(TargetBB, TargetBB.end()); - } + for (Instruction &I : llvm::make_early_inc_range(SourceBB)) + I.moveBefore(TargetBB, TargetBB.end()); +} + +/// A function to sort the keys of \p Map, which must be a mapping of constant +/// values to basic blocks and return it in \p SortedKeys +/// +/// \param SortedKeys - The vector the keys will be return in and sorted. +/// \param Map - The DenseMap containing keys to sort. +static void getSortedConstantKeys(std::vector<Value *> &SortedKeys, + DenseMap<Value *, BasicBlock *> &Map) { + for (auto &VtoBB : Map) + SortedKeys.push_back(VtoBB.first); + + stable_sort(SortedKeys, [](const Value *LHS, const Value *RHS) { + const ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS); + const ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS); + assert(RHSC && "Not a constant integer in return value?"); + assert(LHSC && "Not a constant integer in return value?"); + + return LHSC->getLimitedValue() < RHSC->getLimitedValue(); + }); +} + +Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other, + Value *V) { + Optional<unsigned> GVN = Candidate->getGVN(V); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional<unsigned> CanonNum = Candidate->getCanonicalNum(*GVN); + Optional<unsigned> FirstGVN = Other.Candidate->fromCanonicalNum(*CanonNum); + Optional<Value *> FoundValueOpt = Other.Candidate->fromGVN(*FirstGVN); + return FoundValueOpt.getValueOr(nullptr); } void OutlinableRegion::splitCandidate() { assert(!CandidateSplit && "Candidate already split!"); + Instruction *BackInst = Candidate->backInstruction(); + + Instruction *EndInst = nullptr; + // Check whether the last instruction is a terminator, if it is, we do + // not split on the following instruction. We leave the block as it is. 
We + // also check that this is not the last instruction in the Module, otherwise + // the check for whether the current following instruction matches the + // previously recorded instruction will be incorrect. + if (!BackInst->isTerminator() || + BackInst->getParent() != &BackInst->getFunction()->back()) { + EndInst = Candidate->end()->Inst; + assert(EndInst && "Expected an end instruction?"); + } + + // We check if the current instruction following the last instruction in the + // region is the same as the recorded instruction following the last + // instruction. If they do not match, there could be problems in rewriting + // the program after outlining, so we ignore it. + if (!BackInst->isTerminator() && + EndInst != BackInst->getNextNonDebugInstruction()) + return; + Instruction *StartInst = (*Candidate->begin()).Inst; - Instruction *EndInst = (*Candidate->end()).Inst; - assert(StartInst && EndInst && "Expected a start and end instruction?"); + assert(StartInst && "Expected a start instruction?"); StartBB = StartInst->getParent(); PrevBB = StartBB; @@ -153,13 +217,20 @@ void OutlinableRegion::splitCandidate() { std::string OriginalName = PrevBB->getName().str(); StartBB = PrevBB->splitBasicBlock(StartInst, OriginalName + "_to_outline"); - - // This is the case for the inner block since we do not have to include - // multiple blocks. - EndBB = StartBB; - FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + PrevBB->replaceSuccessorsPhiUsesWith(PrevBB, StartBB); CandidateSplit = true; + if (!BackInst->isTerminator()) { + EndBB = EndInst->getParent(); + FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + EndBB->replaceSuccessorsPhiUsesWith(EndBB, FollowBB); + FollowBB->replaceSuccessorsPhiUsesWith(PrevBB, FollowBB); + return; + } + + EndBB = BackInst->getParent(); + EndsInBranch = true; + FollowBB = nullptr; } void OutlinableRegion::reattachCandidate() { @@ -180,7 +251,6 @@ void OutlinableRegion::reattachCandidate() { // inst3 // inst4 assert(StartBB != nullptr && "StartBB for Candidate is not defined!"); - assert(FollowBB != nullptr && "StartBB for Candidate is not defined!"); // StartBB should only have one predecessor since we put an unconditional // branch at the end of PrevBB when we split the BasicBlock. @@ -189,21 +259,24 @@ void OutlinableRegion::reattachCandidate() { "No Predecessor for the region start basic block!"); assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!"); - assert(EndBB->getTerminator() && "Terminator removed from EndBB!"); PrevBB->getTerminator()->eraseFromParent(); - EndBB->getTerminator()->eraseFromParent(); moveBBContents(*StartBB, *PrevBB); BasicBlock *PlacementBB = PrevBB; if (StartBB != EndBB) PlacementBB = EndBB; - moveBBContents(*FollowBB, *PlacementBB); + if (!EndsInBranch && PlacementBB->getUniqueSuccessor() != nullptr) { + assert(FollowBB != nullptr && "FollowBB for Candidate is not defined!"); + assert(PlacementBB->getTerminator() && "Terminator removed from EndBB!"); + PlacementBB->getTerminator()->eraseFromParent(); + moveBBContents(*FollowBB, *PlacementBB); + PlacementBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); + FollowBB->eraseFromParent(); + } PrevBB->replaceSuccessorsPhiUsesWith(StartBB, PrevBB); - PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); StartBB->eraseFromParent(); - FollowBB->eraseFromParent(); // Make sure to save changes back to the StartBB. 
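
getSortedConstantKeys, added near the top of this IROutliner diff, exists to give the map of return values to blocks a deterministic ordering before blocks, names, or indices are generated from it, since hash-map iteration order is not stable. The same idea with standard containers (hypothetical key type, not the ConstantInt-based version above):

#include <algorithm>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Hash-map iteration order is unspecified, so derive a sorted key vector
// before doing anything order-sensitive (naming blocks, assigning slots).
static std::vector<int>
sortedKeys(const std::unordered_map<int, std::string> &Map) {
  std::vector<int> Keys;
  Keys.reserve(Map.size());
  for (const auto &KV : Map)
    Keys.push_back(KV.first);
  std::sort(Keys.begin(), Keys.end());
  return Keys;
}

int main() {
  std::unordered_map<int, std::string> RetValToBlock = {
      {2, "output_block_2"}, {0, "output_block_0"}, {1, "output_block_1"}};
  unsigned Idx = 0;
  for (int Key : sortedKeys(RetValToBlock))
    std::printf("slot %u -> %s\n", Idx++, RetValToBlock.at(Key).c_str());
  // Always prints slots 0, 1, 2 in the same order, regardless of hashing.
  return 0;
}
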
StartBB = PrevBB; @@ -261,8 +334,9 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) { // division instruction for targets that have a native division instruction. // To be overly conservative, we only add 1 to the number of instructions for // each division instruction. - for (Instruction &I : *StartBB) { - switch (I.getOpcode()) { + for (IRInstructionData &ID : *Candidate) { + Instruction *I = ID.Inst; + switch (I->getOpcode()) { case Instruction::FDiv: case Instruction::FRem: case Instruction::SDiv: @@ -272,7 +346,7 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) { Benefit += 1; break; default: - Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); + Benefit += TTI.getInstructionCost(I, TargetTransformInfo::TCK_CodeSize); break; } } @@ -373,8 +447,24 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, unsigned FunctionNameSuffix) { assert(!Group.OutlinedFunction && "Function is already defined!"); + Type *RetTy = Type::getVoidTy(M.getContext()); + // All extracted functions _should_ have the same return type at this point + // since the similarity identifier ensures that all branches outside of the + // region occur in the same place. + + // NOTE: Should we ever move to the model that uses a switch at every point + // needed, meaning that we could branch within the region or out, it is + // possible that we will need to switch to using the most general case all of + // the time. + for (OutlinableRegion *R : Group.Regions) { + Type *ExtractedFuncType = R->ExtractedFunction->getReturnType(); + if ((RetTy->isVoidTy() && !ExtractedFuncType->isVoidTy()) || + (RetTy->isIntegerTy(1) && ExtractedFuncType->isIntegerTy(16))) + RetTy = ExtractedFuncType; + } + Group.OutlinedFunctionType = FunctionType::get( - Type::getVoidTy(M.getContext()), Group.ArgumentTypes, false); + RetTy, Group.ArgumentTypes, false); // These functions will only be called from within the same module, so // we can set an internal linkage. @@ -430,21 +520,23 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, /// /// \param [in] Old - The function to move the basic blocks from. /// \param [in] New - The function to move the basic blocks to. -/// \returns the first return block for the function in New. -static BasicBlock *moveFunctionData(Function &Old, Function &New) { - Function::iterator CurrBB, NextBB, FinalBB; - BasicBlock *NewEnd = nullptr; - std::vector<Instruction *> DebugInsts; - for (CurrBB = Old.begin(), FinalBB = Old.end(); CurrBB != FinalBB; - CurrBB = NextBB) { - NextBB = std::next(CurrBB); - CurrBB->removeFromParent(); - CurrBB->insertInto(&New); - Instruction *I = CurrBB->getTerminator(); - if (isa<ReturnInst>(I)) - NewEnd = &(*CurrBB); - - for (Instruction &Val : *CurrBB) { +/// \param [out] NewEnds - The return blocks of the new overall function. +static void moveFunctionData(Function &Old, Function &New, + DenseMap<Value *, BasicBlock *> &NewEnds) { + for (BasicBlock &CurrBB : llvm::make_early_inc_range(Old)) { + CurrBB.removeFromParent(); + CurrBB.insertInto(&New); + Instruction *I = CurrBB.getTerminator(); + + // For each block we find a return instruction is, it is a potential exit + // path for the function. We keep track of each block based on the return + // value here. 
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) + NewEnds.insert(std::make_pair(RI->getReturnValue(), &CurrBB)); + + std::vector<Instruction *> DebugInsts; + + for (Instruction &Val : CurrBB) { // We must handle the scoping of called functions differently than // other outlined instructions. if (!isa<CallInst>(&Val)) { @@ -476,8 +568,7 @@ static BasicBlock *moveFunctionData(Function &Old, Function &New) { I->eraseFromParent(); } - assert(NewEnd && "No return instruction for new function?"); - return NewEnd; + assert(NewEnds.size() > 0 && "No return instruction for new function?"); } /// Find the the constants that will need to be lifted into arguments @@ -664,11 +755,22 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, // function to account for the extracted constants, we have two different // counters as we find extracted arguments, and as we come across overall // arguments. + + // Additionally, in our first pass, for the first extracted function, + // we find argument locations for the canonical value numbering. This + // numbering overrides any discovered location for the extracted code. for (unsigned InputVal : InputGVNs) { + Optional<unsigned> CanonicalNumberOpt = C.getCanonicalNum(InputVal); + assert(CanonicalNumberOpt.hasValue() && "Canonical number not found?"); + unsigned CanonicalNumber = CanonicalNumberOpt.getValue(); + Optional<Value *> InputOpt = C.fromGVN(InputVal); assert(InputOpt.hasValue() && "Global value number not found?"); Value *Input = InputOpt.getValue(); + DenseMap<unsigned, unsigned>::iterator AggArgIt = + Group.CanonicalNumberToAggArg.find(CanonicalNumber); + if (!Group.InputTypesSet) { Group.ArgumentTypes.push_back(Input->getType()); // If the input value has a swifterr attribute, make sure to mark the @@ -684,17 +786,34 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, // Check if we have a constant. If we do add it to the overall argument // number to Constant map for the region, and continue to the next input. if (Constant *CST = dyn_cast<Constant>(Input)) { - Region.AggArgToConstant.insert(std::make_pair(TypeIndex, CST)); + if (AggArgIt != Group.CanonicalNumberToAggArg.end()) + Region.AggArgToConstant.insert(std::make_pair(AggArgIt->second, CST)); + else { + Group.CanonicalNumberToAggArg.insert( + std::make_pair(CanonicalNumber, TypeIndex)); + Region.AggArgToConstant.insert(std::make_pair(TypeIndex, CST)); + } TypeIndex++; continue; } // It is not a constant, we create the mapping from extracted argument list - // to the overall argument list. + // to the overall argument list, using the canonical location, if it exists. 
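
The input-mapping logic above gives each canonical value number a fixed slot in the aggregate argument list the first time it is seen; later regions must reuse that slot and record, via ChangedArgOrder, when their own extraction order disagrees. A compact standalone model of that first-assignment-wins slot allocation (plain C++, hypothetical names):

#include <cstdio>
#include <unordered_map>
#include <vector>

// Assign each canonical number a stable aggregate-argument slot. The first
// region to mention a number fixes its slot; later regions just look it up.
struct SlotAssigner {
  std::unordered_map<unsigned, unsigned> CanonToSlot;

  // Returns the slot for this canonical number and reports (via Reordered)
  // whether it differs from where the caller's own extraction put the value.
  unsigned slotFor(unsigned CanonicalNumber, unsigned LocalIndex,
                   bool &Reordered) {
    auto It = CanonToSlot.find(CanonicalNumber);
    if (It == CanonToSlot.end())
      It = CanonToSlot.emplace(CanonicalNumber, CanonToSlot.size()).first;
    Reordered |= (It->second != LocalIndex);
    return It->second;
  }
};

int main() {
  SlotAssigner Slots;
  bool R1 = false, R2 = false;
  // Region 1 sees canonical numbers 7 and 9 in that order: slots 0 and 1.
  std::vector<unsigned> Region1 = {Slots.slotFor(7, 0, R1),
                                   Slots.slotFor(9, 1, R1)};
  // Region 2 extracted them in the opposite order, so it is "reordered".
  std::vector<unsigned> Region2 = {Slots.slotFor(9, 0, R2),
                                   Slots.slotFor(7, 1, R2)};
  std::printf("region1 reordered: %d, region2 reordered: %d\n", R1, R2);
  std::printf("region2 arg0 -> slot %u, arg1 -> slot %u\n", Region2[0],
              Region2[1]);
  return 0;
}
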
assert(ArgInputs.count(Input) && "Input cannot be found!"); - Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, TypeIndex)); - Region.AggArgToExtracted.insert(std::make_pair(TypeIndex, OriginalIndex)); + if (AggArgIt != Group.CanonicalNumberToAggArg.end()) { + if (OriginalIndex != AggArgIt->second) + Region.ChangedArgOrder = true; + Region.ExtractedArgToAgg.insert( + std::make_pair(OriginalIndex, AggArgIt->second)); + Region.AggArgToExtracted.insert( + std::make_pair(AggArgIt->second, OriginalIndex)); + } else { + Group.CanonicalNumberToAggArg.insert( + std::make_pair(CanonicalNumber, TypeIndex)); + Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, TypeIndex)); + Region.AggArgToExtracted.insert(std::make_pair(TypeIndex, OriginalIndex)); + } OriginalIndex++; TypeIndex++; } @@ -718,10 +837,41 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, /// \param [in] Outputs - The values found by the code extractor. static void findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, - ArrayRef<Value *> Outputs) { + SetVector<Value *> &Outputs) { OutlinableGroup &Group = *Region.Parent; IRSimilarityCandidate &C = *Region.Candidate; + SmallVector<BasicBlock *> BE; + DenseSet<BasicBlock *> BBSet; + C.getBasicBlocks(BBSet, BE); + + // Find the exits to the region. + SmallPtrSet<BasicBlock *, 1> Exits; + for (BasicBlock *Block : BE) + for (BasicBlock *Succ : successors(Block)) + if (!BBSet.contains(Succ)) + Exits.insert(Succ); + + // After determining which blocks exit to PHINodes, we add these PHINodes to + // the set of outputs to be processed. We also check the incoming values of + // the PHINodes for whether they should no longer be considered outputs. + for (BasicBlock *ExitBB : Exits) { + for (PHINode &PN : ExitBB->phis()) { + // Find all incoming values from the outlining region. + SmallVector<unsigned, 2> IncomingVals; + for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx) + if (BBSet.contains(PN.getIncomingBlock(Idx))) + IncomingVals.push_back(Idx); + + // Do not process PHI if there is one (or fewer) predecessor from region. + if (IncomingVals.size() <= 1) + continue; + + Region.IgnoreRegion = true; + return; + } + } + // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -797,7 +947,7 @@ void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region, // Map the outputs found by the CodeExtractor to the arguments found for // the overall function. - findExtractedOutputToOverallOutputMapping(Region, Outputs.getArrayRef()); + findExtractedOutputToOverallOutputMapping(Region, Outputs); } /// Replace the extracted function in the Region with a call to the overall @@ -820,9 +970,10 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { assert(AggFunc && "Function to replace with is nullptr?"); // If the arguments are the same size, there are not values that need to be - // made argument, or different output registers to handle. We can simply - // replace the called function in this case. - if (AggFunc->arg_size() == Call->arg_size()) { + // made into an argument, the argument ordering has not been change, or + // different output registers to handle. We can simply replace the called + // function in this case. 
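
findExtractedOutputToOverallOutputMapping, earlier in this hunk, starts by computing the exits of the outlined region: successors of region blocks that are not themselves in the region, whose PHI nodes are then inspected. The exit computation itself is a small generic graph step; a standalone sketch over an adjacency list (plain C++, hypothetical block IDs):

#include <cstdio>
#include <set>
#include <vector>

// Successor lists for a tiny CFG: block i branches to the blocks in Succ[i].
static const std::vector<std::vector<int>> Succ = {
    /*0*/ {1, 2}, /*1*/ {3}, /*2*/ {3}, /*3*/ {}};

// Exits of a region = successors of region blocks that lie outside it.
static std::set<int> regionExits(const std::set<int> &Region) {
  std::set<int> Exits;
  for (int B : Region)
    for (int S : Succ[B])
      if (!Region.count(S))
        Exits.insert(S);
  return Exits;
}

int main() {
  std::set<int> Region = {0, 1, 2}; // block 3 is outside the region
  for (int E : regionExits(Region))
    std::printf("exit block: %d\n", E); // prints: exit block: 3
  return 0;
}
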
+ if (!Region.ChangedArgOrder && AggFunc->arg_size() == Call->arg_size()) { LLVM_DEBUG(dbgs() << "Replace call to " << *Call << " with call to " << *AggFunc << " with same number of arguments\n"); Call->setCalledFunction(AggFunc); @@ -895,6 +1046,9 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { // Transfer any debug information. Call->setDebugLoc(Region.Call->getDebugLoc()); + // Since our output may determine which branch we go to, we make sure to + // propogate this new call value through the module. + OldCall->replaceAllUsesWith(Call); // Remove the old instruction. OldCall->eraseFromParent(); @@ -913,13 +1067,23 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { // region with the arguments of the function for an OutlinableGroup. // /// \param [in] Region - The region of extracted code to be changed. -/// \param [in,out] OutputBB - The BasicBlock for the output stores for this +/// \param [in,out] OutputBBs - The BasicBlock for the output stores for this /// region. -static void replaceArgumentUses(OutlinableRegion &Region, - BasicBlock *OutputBB) { +/// \param [in] FirstFunction - A flag to indicate whether we are using this +/// function to define the overall outlined function for all the regions, or +/// if we are operating on one of the following regions. +static void +replaceArgumentUses(OutlinableRegion &Region, + DenseMap<Value *, BasicBlock *> &OutputBBs, + bool FirstFunction = false) { OutlinableGroup &Group = *Region.Parent; assert(Region.ExtractedFunction && "Region has no extracted function?"); + Function *DominatingFunction = Region.ExtractedFunction; + if (FirstFunction) + DominatingFunction = Group.OutlinedFunction; + DominatorTree DT(*DominatingFunction); + for (unsigned ArgIdx = 0; ArgIdx < Region.ExtractedFunction->arg_size(); ArgIdx++) { assert(Region.ExtractedArgToAgg.find(ArgIdx) != @@ -946,11 +1110,53 @@ static void replaceArgumentUses(OutlinableRegion &Region, assert(InstAsUser && "User is nullptr!"); Instruction *I = cast<Instruction>(InstAsUser); - I->setDebugLoc(DebugLoc()); - LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " - << *OutputBB << "\n"); + BasicBlock *BB = I->getParent(); + SmallVector<BasicBlock *, 4> Descendants; + DT.getDescendants(BB, Descendants); + bool EdgeAdded = false; + if (Descendants.size() == 0) { + EdgeAdded = true; + DT.insertEdge(&DominatingFunction->getEntryBlock(), BB); + DT.getDescendants(BB, Descendants); + } + + // Iterate over the following blocks, looking for return instructions, + // if we find one, find the corresponding output block for the return value + // and move our store instruction there. + for (BasicBlock *DescendBB : Descendants) { + ReturnInst *RI = dyn_cast<ReturnInst>(DescendBB->getTerminator()); + if (!RI) + continue; + Value *RetVal = RI->getReturnValue(); + auto VBBIt = OutputBBs.find(RetVal); + assert(VBBIt != OutputBBs.end() && "Could not find output value!"); + + // If this is storing a PHINode, we must make sure it is included in the + // overall function. 
+ StoreInst *SI = cast<StoreInst>(I); + + Value *ValueOperand = SI->getValueOperand(); + + StoreInst *NewI = cast<StoreInst>(I->clone()); + NewI->setDebugLoc(DebugLoc()); + BasicBlock *OutputBB = VBBIt->second; + OutputBB->getInstList().push_back(NewI); + LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " + << *OutputBB << "\n"); - I->moveBefore(*OutputBB, OutputBB->end()); + if (FirstFunction) + continue; + Value *CorrVal = + Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand); + assert(CorrVal && "Value is nullptr?"); + NewI->setOperand(0, CorrVal); + } + + // If we added an edge for basic blocks without a predecessor, we remove it + // here. + if (EdgeAdded) + DT.deleteEdge(&DominatingFunction->getEntryBlock(), BB); + I->eraseFromParent(); LLVM_DEBUG(dbgs() << "Replacing uses of output " << *Arg << " in function " << *Region.ExtractedFunction << " with " << *AggArg @@ -990,69 +1196,53 @@ void replaceConstants(OutlinableRegion &Region) { } } -/// For the given function, find all the nondebug or lifetime instructions, -/// and return them as a vector. Exclude any blocks in \p ExludeBlocks. -/// -/// \param [in] F - The function we collect the instructions from. -/// \param [in] ExcludeBlocks - BasicBlocks to ignore. -/// \returns the list of instructions extracted. -static std::vector<Instruction *> -collectRelevantInstructions(Function &F, - DenseSet<BasicBlock *> &ExcludeBlocks) { - std::vector<Instruction *> RelevantInstructions; - - for (BasicBlock &BB : F) { - if (ExcludeBlocks.contains(&BB)) - continue; - - for (Instruction &Inst : BB) { - if (Inst.isLifetimeStartOrEnd()) - continue; - if (isa<DbgInfoIntrinsic>(Inst)) - continue; - - RelevantInstructions.push_back(&Inst); - } - } - - return RelevantInstructions; -} - /// It is possible that there is a basic block that already performs the same /// stores. This returns a duplicate block, if it exists /// -/// \param OutputBB [in] the block we are looking for a duplicate of. +/// \param OutputBBs [in] the blocks we are looking for a duplicate of. /// \param OutputStoreBBs [in] The existing output blocks. /// \returns an optional value with the number output block if there is a match. -Optional<unsigned> -findDuplicateOutputBlock(BasicBlock *OutputBB, - ArrayRef<BasicBlock *> OutputStoreBBs) { +Optional<unsigned> findDuplicateOutputBlock( + DenseMap<Value *, BasicBlock *> &OutputBBs, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) { - bool WrongInst = false; - bool WrongSize = false; + bool Mismatch = false; unsigned MatchingNum = 0; - for (BasicBlock *CompBB : OutputStoreBBs) { - WrongInst = false; - if (CompBB->size() - 1 != OutputBB->size()) { - WrongSize = true; - MatchingNum++; - continue; - } - - WrongSize = false; - BasicBlock::iterator NIt = OutputBB->begin(); - for (Instruction &I : *CompBB) { - if (isa<BranchInst>(&I)) - continue; + // We compare the new set output blocks to the other sets of output blocks. + // If they are the same number, and have identical instructions, they are + // considered to be the same. 
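
As the comment just above says, findDuplicateOutputBlock now compares whole sets of output blocks keyed by return value: two schemes match only when every key appears in both and the store sequences behind matching keys are identical. A reduced standalone model of that comparison with standard containers (hypothetical instruction strings, not the isIdenticalTo walk):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

using OutputScheme = std::map<int, std::vector<std::string>>;

// Two schemes are duplicates when they have the same keys and, per key,
// identical store sequences.
static bool sameScheme(const OutputScheme &A, const OutputScheme &B) {
  if (A.size() != B.size())
    return false;
  for (const auto &KV : A) {
    auto It = B.find(KV.first);
    if (It == B.end() || It->second != KV.second)
      return false;
  }
  return true;
}

// Return the index of an existing matching scheme, or -1 to signal "new".
static int findDuplicateScheme(const OutputScheme &New,
                               const std::vector<OutputScheme> &Existing) {
  for (size_t I = 0; I < Existing.size(); ++I)
    if (sameScheme(New, Existing[I]))
      return static_cast<int>(I);
  return -1;
}

int main() {
  std::vector<OutputScheme> Existing = {
      {{0, {"store a"}}, {1, {"store b"}}}};
  OutputScheme Duplicate = {{0, {"store a"}}, {1, {"store b"}}};
  OutputScheme Fresh = {{0, {"store c"}}};
  std::printf("%d %d\n", findDuplicateScheme(Duplicate, Existing),
              findDuplicateScheme(Fresh, Existing)); // prints: 0 -1
  return 0;
}
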
+ for (DenseMap<Value *, BasicBlock *> &CompBBs : OutputStoreBBs) { + Mismatch = false; + for (std::pair<Value *, BasicBlock *> &VToB : CompBBs) { + DenseMap<Value *, BasicBlock *>::iterator OutputBBIt = + OutputBBs.find(VToB.first); + if (OutputBBIt == OutputBBs.end()) { + Mismatch = true; + break; + } - if (!I.isIdenticalTo(&(*NIt))) { - WrongInst = true; + BasicBlock *CompBB = VToB.second; + BasicBlock *OutputBB = OutputBBIt->second; + if (CompBB->size() - 1 != OutputBB->size()) { + Mismatch = true; break; } - NIt++; + BasicBlock::iterator NIt = OutputBB->begin(); + for (Instruction &I : *CompBB) { + if (isa<BranchInst>(&I)) + continue; + + if (!I.isIdenticalTo(&(*NIt))) { + Mismatch = true; + break; + } + + NIt++; + } } - if (!WrongInst && !WrongSize) + + if (!Mismatch) return MatchingNum; MatchingNum++; @@ -1061,95 +1251,130 @@ findDuplicateOutputBlock(BasicBlock *OutputBB, return None; } +/// Remove empty output blocks from the outlined region. +/// +/// \param BlocksToPrune - Mapping of return values output blocks for the \p +/// Region. +/// \param Region - The OutlinableRegion we are analyzing. +static bool +analyzeAndPruneOutputBlocks(DenseMap<Value *, BasicBlock *> &BlocksToPrune, + OutlinableRegion &Region) { + bool AllRemoved = true; + Value *RetValueForBB; + BasicBlock *NewBB; + SmallVector<Value *, 4> ToRemove; + // Iterate over the output blocks created in the outlined section. + for (std::pair<Value *, BasicBlock *> &VtoBB : BlocksToPrune) { + RetValueForBB = VtoBB.first; + NewBB = VtoBB.second; + + // If there are no instructions, we remove it from the module, and also + // mark the value for removal from the return value to output block mapping. + if (NewBB->size() == 0) { + NewBB->eraseFromParent(); + ToRemove.push_back(RetValueForBB); + continue; + } + + // Mark that we could not remove all the blocks since they were not all + // empty. + AllRemoved = false; + } + + // Remove the return value from the mapping. + for (Value *V : ToRemove) + BlocksToPrune.erase(V); + + // Mark the region as having the no output scheme. + if (AllRemoved) + Region.OutputBlockNum = -1; + + return AllRemoved; +} + /// For the outlined section, move needed the StoreInsts for the output /// registers into their own block. Then, determine if there is a duplicate /// output block already created. /// /// \param [in] OG - The OutlinableGroup of regions to be outlined. /// \param [in] Region - The OutlinableRegion that is being analyzed. -/// \param [in,out] OutputBB - the block that stores for this region will be +/// \param [in,out] OutputBBs - the blocks that stores for this region will be /// placed in. -/// \param [in] EndBB - the final block of the extracted function. +/// \param [in] EndBBs - the final blocks of the extracted function. /// \param [in] OutputMappings - OutputMappings the mapping of values that have /// been replaced by a new output value. /// \param [in,out] OutputStoreBBs - The existing output blocks. -static void -alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region, - BasicBlock *OutputBB, BasicBlock *EndBB, - const DenseMap<Value *, Value *> &OutputMappings, - std::vector<BasicBlock *> &OutputStoreBBs) { - DenseSet<unsigned> ValuesToFind(Region.GVNStores.begin(), - Region.GVNStores.end()); - - // We iterate over the instructions in the extracted function, and find the - // global value number of the instructions. 
If we find a value that should - // be contained in a store, we replace the uses of the value with the value - // from the overall function, so that the store is storing the correct - // value from the overall function. - DenseSet<BasicBlock *> ExcludeBBs(OutputStoreBBs.begin(), - OutputStoreBBs.end()); - ExcludeBBs.insert(OutputBB); - std::vector<Instruction *> ExtractedFunctionInsts = - collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs); - std::vector<Instruction *> OverallFunctionInsts = - collectRelevantInstructions(*OG.OutlinedFunction, ExcludeBBs); - - assert(ExtractedFunctionInsts.size() == OverallFunctionInsts.size() && - "Number of relevant instructions not equal!"); - - unsigned NumInstructions = ExtractedFunctionInsts.size(); - for (unsigned Idx = 0; Idx < NumInstructions; Idx++) { - Value *V = ExtractedFunctionInsts[Idx]; - - if (OutputMappings.find(V) != OutputMappings.end()) - V = OutputMappings.find(V)->second; - Optional<unsigned> GVN = Region.Candidate->getGVN(V); - - // If we have found one of the stored values for output, replace the value - // with the corresponding one from the overall function. - if (GVN.hasValue() && ValuesToFind.erase(GVN.getValue())) { - V->replaceAllUsesWith(OverallFunctionInsts[Idx]); - if (ValuesToFind.size() == 0) - break; - } - - if (ValuesToFind.size() == 0) - break; - } - - assert(ValuesToFind.size() == 0 && "Not all store values were handled!"); - - // If the size of the block is 0, then there are no stores, and we do not - // need to save this block. - if (OutputBB->size() == 0) { - Region.OutputBlockNum = -1; - OutputBB->eraseFromParent(); +static void alignOutputBlockWithAggFunc( + OutlinableGroup &OG, OutlinableRegion &Region, + DenseMap<Value *, BasicBlock *> &OutputBBs, + DenseMap<Value *, BasicBlock *> &EndBBs, + const DenseMap<Value *, Value *> &OutputMappings, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) { + // If none of the output blocks have any instructions, this means that we do + // not have to determine if it matches any of the other output schemes, and we + // don't have to do anything else. + if (analyzeAndPruneOutputBlocks(OutputBBs, Region)) return; - } - // Determine is there is a duplicate block. + // Determine is there is a duplicate set of blocks. Optional<unsigned> MatchingBB = - findDuplicateOutputBlock(OutputBB, OutputStoreBBs); + findDuplicateOutputBlock(OutputBBs, OutputStoreBBs); - // If there is, we remove the new output block. If it does not, - // we add it to our list of output blocks. + // If there is, we remove the new output blocks. If it does not, + // we add it to our list of sets of output blocks. 
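
The OutputBlockNum recorded below is the index that later selects this region's stores in the switch built by createSwitchStatement. A condensed restatement of the two outcomes, using the surrounding function's variables (DuplicateIdx is a hypothetical stand-in for the result of findDuplicateOutputBlock, and the patch additionally wires every newly kept block to its end block with a branch):

    if (DuplicateIdx) {
      // Identical stores already exist: reuse that scheme, drop the new blocks.
      Region.OutputBlockNum = *DuplicateIdx;
      for (std::pair<Value *, BasicBlock *> &VToBB : OutputBBs)
        VToBB.second->eraseFromParent();
    } else {
      // New scheme: it gets the next index and its blocks are recorded.
      Region.OutputBlockNum = OutputStoreBBs.size();
      OutputStoreBBs.push_back(OutputBBs);
    }
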
if (MatchingBB.hasValue()) { LLVM_DEBUG(dbgs() << "Set output block for region in function" << Region.ExtractedFunction << " to " << MatchingBB.getValue()); Region.OutputBlockNum = MatchingBB.getValue(); - OutputBB->eraseFromParent(); + for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) + VtoBB.second->eraseFromParent(); return; } Region.OutputBlockNum = OutputStoreBBs.size(); - LLVM_DEBUG(dbgs() << "Create output block for region in" - << Region.ExtractedFunction << " to " - << *OutputBB); - OutputStoreBBs.push_back(OutputBB); - BranchInst::Create(EndBB, OutputBB); + Value *RetValueForBB; + BasicBlock *NewBB; + OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>()); + for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) { + RetValueForBB = VtoBB.first; + NewBB = VtoBB.second; + DenseMap<Value *, BasicBlock *>::iterator VBBIt = + EndBBs.find(RetValueForBB); + LLVM_DEBUG(dbgs() << "Create output block for region in" + << Region.ExtractedFunction << " to " + << *NewBB); + BranchInst::Create(VBBIt->second, NewBB); + OutputStoreBBs.back().insert(std::make_pair(RetValueForBB, NewBB)); + } +} + +/// Takes in a mapping, \p OldMap of ConstantValues to BasicBlocks, sorts keys, +/// before creating a basic block for each \p NewMap, and inserting into the new +/// block. Each BasicBlock is named with the scheme "<basename>_<key_idx>". +/// +/// \param OldMap [in] - The mapping to base the new mapping off of. +/// \param NewMap [out] - The output mapping using the keys of \p OldMap. +/// \param ParentFunc [in] - The function to put the new basic block in. +/// \param BaseName [in] - The start of the BasicBlock names to be appended to +/// by an index value. +static void createAndInsertBasicBlocks(DenseMap<Value *, BasicBlock *> &OldMap, + DenseMap<Value *, BasicBlock *> &NewMap, + Function *ParentFunc, Twine BaseName) { + unsigned Idx = 0; + std::vector<Value *> SortedKeys; + + getSortedConstantKeys(SortedKeys, OldMap); + + for (Value *RetVal : SortedKeys) { + BasicBlock *NewBB = BasicBlock::Create( + ParentFunc->getContext(), + Twine(BaseName) + Twine("_") + Twine(static_cast<unsigned>(Idx++)), + ParentFunc); + NewMap.insert(std::make_pair(RetVal, NewBB)); + } } /// Create the switch statement for outlined function to differentiate between @@ -1159,50 +1384,74 @@ alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region, /// matches the needed stores for the extracted section. /// \param [in] M - The module we are outlining from. /// \param [in] OG - The group of regions to be outlined. -/// \param [in] EndBB - The final block of the extracted function. +/// \param [in] EndBBs - The final blocks of the extracted function. /// \param [in,out] OutputStoreBBs - The existing output blocks. -void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB, - ArrayRef<BasicBlock *> OutputStoreBBs) { +void createSwitchStatement( + Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) { // We only need the switch statement if there is more than one store // combination. 
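
For intuition, the aggregate function this builds behaves like the following hand-written C++ analogue: every deduplicated region calls one shared body, and a trailing selector argument picks which set of output stores runs (all names and the arithmetic are purely illustrative):

    // Rough analogue of the generated aggregate function: the shared,
    // outlined computation runs once, and the switch on the final parameter
    // replays only the stores that the chosen output scheme needs.
    void outlined_aggregate(int A, int B, int *Out0, int *Out1, int Scheme) {
      int Sum = A + B;      // the outlined, shared computation
      switch (Scheme) {
      case 0:               // output scheme 0: one live output
        *Out0 = Sum;
        break;
      case 1:               // output scheme 1: two live outputs
        *Out0 = Sum;
        *Out1 = Sum;
        break;
      }
    }
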
if (OG.OutputGVNCombinations.size() > 1) { Function *AggFunc = OG.OutlinedFunction; - // Create a final block - BasicBlock *ReturnBlock = - BasicBlock::Create(M.getContext(), "final_block", AggFunc); - Instruction *Term = EndBB->getTerminator(); - Term->moveBefore(*ReturnBlock, ReturnBlock->end()); - // Put the switch statement in the old end basic block for the function with - // a fall through to the new return block - LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " - << OutputStoreBBs.size() << "\n"); - SwitchInst *SwitchI = - SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), - ReturnBlock, OutputStoreBBs.size(), EndBB); - - unsigned Idx = 0; - for (BasicBlock *BB : OutputStoreBBs) { - SwitchI->addCase(ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), - BB); - Term = BB->getTerminator(); - Term->setSuccessor(0, ReturnBlock); - Idx++; + // Create a final block for each different return block. + DenseMap<Value *, BasicBlock *> ReturnBBs; + createAndInsertBasicBlocks(OG.EndBBs, ReturnBBs, AggFunc, "final_block"); + + for (std::pair<Value *, BasicBlock *> &RetBlockPair : ReturnBBs) { + std::pair<Value *, BasicBlock *> &OutputBlock = + *OG.EndBBs.find(RetBlockPair.first); + BasicBlock *ReturnBlock = RetBlockPair.second; + BasicBlock *EndBB = OutputBlock.second; + Instruction *Term = EndBB->getTerminator(); + // Move the return value to the final block instead of the original exit + // stub. + Term->moveBefore(*ReturnBlock, ReturnBlock->end()); + // Put the switch statement in the old end basic block for the function + // with a fall through to the new return block. + LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " + << OutputStoreBBs.size() << "\n"); + SwitchInst *SwitchI = + SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), + ReturnBlock, OutputStoreBBs.size(), EndBB); + + unsigned Idx = 0; + for (DenseMap<Value *, BasicBlock *> &OutputStoreBB : OutputStoreBBs) { + DenseMap<Value *, BasicBlock *>::iterator OSBBIt = + OutputStoreBB.find(OutputBlock.first); + + if (OSBBIt == OutputStoreBB.end()) + continue; + + BasicBlock *BB = OSBBIt->second; + SwitchI->addCase( + ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), BB); + Term = BB->getTerminator(); + Term->setSuccessor(0, ReturnBlock); + Idx++; + } } return; } - // If there needs to be stores, move them from the output block to the end - // block to save on branching instructions. + // If there needs to be stores, move them from the output blocks to their + // corresponding ending block. 
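
When only one store scheme exists, routing through a separate output block buys nothing, so its contents are spliced straight into the ending block. A minimal sketch of that splice for one pair of blocks, in this file's context (moveBBContents is the helper already used here; the standalone wrapper is illustrative):

    // Fold a lone output block into its ending block: drop the branch that
    // linked them, splice the stores over, and keep the ending block's original
    // terminator as the last instruction.
    static void foldOutputIntoEnd(BasicBlock &OutputBB, BasicBlock &EndBB) {
      OutputBB.getTerminator()->eraseFromParent();  // branch to EndBB, now dead
      Instruction *EndTerm = EndBB.getTerminator();
      moveBBContents(OutputBB, EndBB);              // stores land after EndTerm
      EndTerm->moveBefore(EndBB, EndBB.end());      // restore EndTerm to the end
      OutputBB.eraseFromParent();
    }
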
if (OutputStoreBBs.size() == 1) { LLVM_DEBUG(dbgs() << "Move store instructions to the end block in " << *OG.OutlinedFunction << "\n"); - BasicBlock *OutputBlock = OutputStoreBBs[0]; - Instruction *Term = OutputBlock->getTerminator(); - Term->eraseFromParent(); - Term = EndBB->getTerminator(); - moveBBContents(*OutputBlock, *EndBB); - Term->moveBefore(*EndBB, EndBB->end()); - OutputBlock->eraseFromParent(); + DenseMap<Value *, BasicBlock *> OutputBlocks = OutputStoreBBs[0]; + for (std::pair<Value *, BasicBlock *> &VBPair : OutputBlocks) { + DenseMap<Value *, BasicBlock *>::iterator EndBBIt = + EndBBs.find(VBPair.first); + assert(EndBBIt != EndBBs.end() && "Could not find end block"); + BasicBlock *EndBB = EndBBIt->second; + BasicBlock *OutputBB = VBPair.second; + Instruction *Term = OutputBB->getTerminator(); + Term->eraseFromParent(); + Term = EndBB->getTerminator(); + moveBBContents(*OutputBB, *EndBB); + Term->moveBefore(*EndBB, EndBB->end()); + OutputBB->eraseFromParent(); + } } } @@ -1217,42 +1466,44 @@ void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB, /// set of stores needed for the different functions. /// \param [in,out] FuncsToRemove - Extracted functions to erase from module /// once outlining is complete. -static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup, - std::vector<BasicBlock *> &OutputStoreBBs, - std::vector<Function *> &FuncsToRemove) { +static void fillOverallFunction( + Module &M, OutlinableGroup &CurrentGroup, + std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs, + std::vector<Function *> &FuncsToRemove) { OutlinableRegion *CurrentOS = CurrentGroup.Regions[0]; // Move first extracted function's instructions into new function. LLVM_DEBUG(dbgs() << "Move instructions from " << *CurrentOS->ExtractedFunction << " to instruction " << *CurrentGroup.OutlinedFunction << "\n"); - - CurrentGroup.EndBB = moveFunctionData(*CurrentOS->ExtractedFunction, - *CurrentGroup.OutlinedFunction); + moveFunctionData(*CurrentOS->ExtractedFunction, + *CurrentGroup.OutlinedFunction, CurrentGroup.EndBBs); // Transfer the attributes from the function to the new function. - for (Attribute A : - CurrentOS->ExtractedFunction->getAttributes().getFnAttributes()) + for (Attribute A : CurrentOS->ExtractedFunction->getAttributes().getFnAttrs()) CurrentGroup.OutlinedFunction->addFnAttr(A); - // Create an output block for the first extracted function. - BasicBlock *NewBB = BasicBlock::Create( - M.getContext(), Twine("output_block_") + Twine(static_cast<unsigned>(0)), - CurrentGroup.OutlinedFunction); + // Create a new set of output blocks for the first extracted function. + DenseMap<Value *, BasicBlock *> NewBBs; + createAndInsertBasicBlocks(CurrentGroup.EndBBs, NewBBs, + CurrentGroup.OutlinedFunction, "output_block_0"); CurrentOS->OutputBlockNum = 0; - replaceArgumentUses(*CurrentOS, NewBB); + replaceArgumentUses(*CurrentOS, NewBBs, true); replaceConstants(*CurrentOS); - // If the new basic block has no new stores, we can erase it from the module. - // It it does, we create a branch instruction to the last basic block from the - // new one. - if (NewBB->size() == 0) { - CurrentOS->OutputBlockNum = -1; - NewBB->eraseFromParent(); - } else { - BranchInst::Create(CurrentGroup.EndBB, NewBB); - OutputStoreBBs.push_back(NewBB); + // We first identify if any output blocks are empty, if they are we remove + // them. We then create a branch instruction to the basic block to the return + // block for the function for each non empty output block. 
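
analyzeAndPruneOutputBlocks, defined earlier in this patch, both erases the empty blocks and trims their entries out of the map, so its return value doubles as "nothing left to store". A minimal usage sketch of that contract, in the surrounding function's context:

    // After the call, NewBBs holds only blocks that still contain stores. A
    // true return means every block was empty and erased, and the region was
    // switched to the no-output scheme (OutputBlockNum of -1).
    bool NothingToStore = analyzeAndPruneOutputBlocks(NewBBs, *CurrentOS);
    if (NothingToStore)
      assert(NewBBs.empty() && "pruning removed every output block");
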
+ if (!analyzeAndPruneOutputBlocks(NewBBs, *CurrentOS)) { + OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>()); + for (std::pair<Value *, BasicBlock *> &VToBB : NewBBs) { + DenseMap<Value *, BasicBlock *>::iterator VBBIt = + CurrentGroup.EndBBs.find(VToBB.first); + BasicBlock *EndBB = VBBIt->second; + BranchInst::Create(EndBB, VToBB.second); + OutputStoreBBs.back().insert(VToBB); + } } // Replace the call to the extracted function with the outlined function. @@ -1268,25 +1519,28 @@ void IROutliner::deduplicateExtractedSections( std::vector<Function *> &FuncsToRemove, unsigned &OutlinedFunctionNum) { createFunction(M, CurrentGroup, OutlinedFunctionNum); - std::vector<BasicBlock *> OutputStoreBBs; + std::vector<DenseMap<Value *, BasicBlock *>> OutputStoreBBs; OutlinableRegion *CurrentOS; fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove); + std::vector<Value *> SortedKeys; for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) { CurrentOS = CurrentGroup.Regions[Idx]; AttributeFuncs::mergeAttributesForOutlining(*CurrentGroup.OutlinedFunction, *CurrentOS->ExtractedFunction); - // Create a new BasicBlock to hold the needed store instructions. - BasicBlock *NewBB = BasicBlock::Create( - M.getContext(), "output_block_" + std::to_string(Idx), - CurrentGroup.OutlinedFunction); - replaceArgumentUses(*CurrentOS, NewBB); + // Create a set of BasicBlocks, one for each return block, to hold the + // needed store instructions. + DenseMap<Value *, BasicBlock *> NewBBs; + createAndInsertBasicBlocks( + CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, + "output_block_" + Twine(static_cast<unsigned>(Idx))); - alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBB, - CurrentGroup.EndBB, OutputMappings, + replaceArgumentUses(*CurrentOS, NewBBs); + alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, + CurrentGroup.EndBBs, OutputMappings, OutputStoreBBs); CurrentOS->Call = replaceCalledFunction(M, *CurrentOS); @@ -1294,11 +1548,78 @@ void IROutliner::deduplicateExtractedSections( } // Create a switch statement to handle the different output schemes. - createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBB, OutputStoreBBs); + createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBBs, OutputStoreBBs); OutlinedFunctionNum++; } +/// Checks that the next instruction in the InstructionDataList matches the +/// next instruction in the module. If they do not, there could be the +/// possibility that extra code has been inserted, and we must ignore it. +/// +/// \param ID - The IRInstructionData to check the next instruction of. +/// \returns true if the InstructionDataList and actual instruction match. +static bool nextIRInstructionDataMatchesNextInst(IRInstructionData &ID) { + // We check if there is a discrepancy between the InstructionDataList + // and the actual next instruction in the module. If there is, it means + // that an extra instruction was added, likely by the CodeExtractor. + + // Since we do not have any similarity data about this particular + // instruction, we cannot confidently outline it, and must discard this + // candidate. 
+ IRInstructionDataList::iterator NextIDIt = std::next(ID.getIterator()); + Instruction *NextIDLInst = NextIDIt->Inst; + Instruction *NextModuleInst = nullptr; + if (!ID.Inst->isTerminator()) + NextModuleInst = ID.Inst->getNextNonDebugInstruction(); + else if (NextIDLInst != nullptr) + NextModuleInst = + &*NextIDIt->Inst->getParent()->instructionsWithoutDebug().begin(); + + if (NextIDLInst && NextIDLInst != NextModuleInst) + return false; + + return true; +} + +bool IROutliner::isCompatibleWithAlreadyOutlinedCode( + const OutlinableRegion &Region) { + IRSimilarityCandidate *IRSC = Region.Candidate; + unsigned StartIdx = IRSC->getStartIdx(); + unsigned EndIdx = IRSC->getEndIdx(); + + // A check to make sure that we are not about to attempt to outline something + // that has already been outlined. + for (unsigned Idx = StartIdx; Idx <= EndIdx; Idx++) + if (Outlined.contains(Idx)) + return false; + + // We check if the recorded instruction matches the actual next instruction, + // if it does not, we fix it in the InstructionDataList. + if (!Region.Candidate->backInstruction()->isTerminator()) { + Instruction *NewEndInst = + Region.Candidate->backInstruction()->getNextNonDebugInstruction(); + assert(NewEndInst && "Next instruction is a nullptr?"); + if (Region.Candidate->end()->Inst != NewEndInst) { + IRInstructionDataList *IDL = Region.Candidate->front()->IDL; + IRInstructionData *NewEndIRID = new (InstDataAllocator.Allocate()) + IRInstructionData(*NewEndInst, + InstructionClassifier.visit(*NewEndInst), *IDL); + + // Insert the first IRInstructionData of the new region after the + // last IRInstructionData of the IRSimilarityCandidate. + IDL->insert(Region.Candidate->end(), *NewEndIRID); + } + } + + return none_of(*IRSC, [this](IRInstructionData &ID) { + if (!nextIRInstructionDataMatchesNextInst(ID)) + return true; + + return !this->InstructionClassifier.visit(ID.Inst); + }); +} + void IROutliner::pruneIncompatibleRegions( std::vector<IRSimilarityCandidate> &CandidateVec, OutlinableGroup &CurrentGroup) { @@ -1310,6 +1631,15 @@ void IROutliner::pruneIncompatibleRegions( return LHS.getStartIdx() < RHS.getStartIdx(); }); + IRSimilarityCandidate &FirstCandidate = CandidateVec[0]; + // Since outlining a call and a branch instruction will be the same as only + // outlinining a call instruction, we ignore it as a space saving. + if (FirstCandidate.getLength() == 2) { + if (isa<CallInst>(FirstCandidate.front()->Inst) && + isa<BranchInst>(FirstCandidate.back()->Inst)) + return; + } + unsigned CurrentEndIdx = 0; for (IRSimilarityCandidate &IRSC : CandidateVec) { PreviouslyOutlined = false; @@ -1325,9 +1655,13 @@ void IROutliner::pruneIncompatibleRegions( if (PreviouslyOutlined) continue; - // TODO: If in the future we can outline across BasicBlocks, we will need to - // check all BasicBlocks contained in the region. - if (IRSC.getStartBB()->hasAddressTaken()) + // Check over the instructions, and if the basic block has its address + // taken for use somewhere else, we do not outline that block. + bool BBHasAddressTaken = any_of(IRSC, [](IRInstructionData &ID){ + return ID.Inst->getParent()->hasAddressTaken(); + }); + + if (BBHasAddressTaken) continue; if (IRSC.front()->Inst->getFunction()->hasLinkOnceODRLinkage() && @@ -1340,16 +1674,9 @@ void IROutliner::pruneIncompatibleRegions( continue; bool BadInst = any_of(IRSC, [this](IRInstructionData &ID) { - // We check if there is a discrepancy between the InstructionDataList - // and the actual next instruction in the module. 
If there is, it means - // that an extra instruction was added, likely by the CodeExtractor. - - // Since we do not have any similarity data about this particular - // instruction, we cannot confidently outline it, and must discard this - // candidate. - if (std::next(ID.getIterator())->Inst != - ID.Inst->getNextNonDebugInstruction()) + if (!nextIRInstructionDataMatchesNextInst(ID)) return true; + return !this->InstructionClassifier.visit(ID.Inst); }); @@ -1416,10 +1743,33 @@ static InstructionCost findCostForOutputBlocks(Module &M, OutlinableGroup &CurrentGroup, TargetTransformInfo &TTI) { InstructionCost OutputCost = 0; + unsigned NumOutputBranches = 0; + + IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; + DenseSet<BasicBlock *> CandidateBlocks; + Candidate.getBasicBlocks(CandidateBlocks); + + // Count the number of different output branches that point to blocks outside + // of the region. + DenseSet<BasicBlock *> FoundBlocks; + for (IRInstructionData &ID : Candidate) { + if (!isa<BranchInst>(ID.Inst)) + continue; + + for (Value *V : ID.OperVals) { + BasicBlock *BB = static_cast<BasicBlock *>(V); + DenseSet<BasicBlock *>::iterator CBIt = CandidateBlocks.find(BB); + if (CBIt != CandidateBlocks.end() || FoundBlocks.contains(BB)) + continue; + FoundBlocks.insert(BB); + NumOutputBranches++; + } + } + + CurrentGroup.BranchesToOutside = NumOutputBranches; for (const ArrayRef<unsigned> &OutputUse : CurrentGroup.OutputGVNCombinations) { - IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; for (unsigned GVN : OutputUse) { Optional<Value *> OV = Candidate.fromGVN(GVN); assert(OV.hasValue() && "Could not find value for GVN?"); @@ -1434,14 +1784,14 @@ static InstructionCost findCostForOutputBlocks(Module &M, LLVM_DEBUG(dbgs() << "Adding: " << StoreCost << " instructions to cost for output of type " << *V->getType() << "\n"); - OutputCost += StoreCost; + OutputCost += StoreCost * NumOutputBranches; } InstructionCost BranchCost = TTI.getCFInstrCost(Instruction::Br, TargetTransformInfo::TCK_CodeSize); LLVM_DEBUG(dbgs() << "Adding " << BranchCost << " to the current cost for" << " a branch instruction\n"); - OutputCost += BranchCost; + OutputCost += BranchCost * NumOutputBranches; } // If there is more than one output scheme, we must have a comparison and @@ -1460,7 +1810,7 @@ static InstructionCost findCostForOutputBlocks(Module &M, LLVM_DEBUG(dbgs() << "Adding: " << TotalCost << " instructions for each switch case for each different" << " output path in a function\n"); - OutputCost += TotalCost; + OutputCost += TotalCost * NumOutputBranches; } return OutputCost; @@ -1548,13 +1898,12 @@ void IROutliner::updateOutputMapping(OutlinableRegion &Region, bool IROutliner::extractSection(OutlinableRegion &Region) { SetVector<Value *> ArgInputs, Outputs, SinkCands; - Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands); - assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!"); - assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!"); + BasicBlock *InitialStart = Region.StartBB; Function *OrigF = Region.StartBB->getParent(); CodeExtractorAnalysisCache CEAC(*OrigF); - Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC); + Region.ExtractedFunction = + Region.CE->extractCodeRegion(CEAC, ArgInputs, Outputs); // If the extraction was successful, find the BasicBlock, and reassign the // OutlinableRegion blocks @@ -1565,7 +1914,23 @@ bool IROutliner::extractSection(OutlinableRegion &Region) { return false; } - BasicBlock 
*RewrittenBB = Region.FollowBB->getSinglePredecessor(); + // Get the block containing the called branch, and reassign the blocks as + // necessary. If the original block still exists, it is because we ended on + // a branch instruction, and so we move the contents into the block before + // and assign the previous block correctly. + User *InstAsUser = Region.ExtractedFunction->user_back(); + BasicBlock *RewrittenBB = cast<Instruction>(InstAsUser)->getParent(); + Region.PrevBB = RewrittenBB->getSinglePredecessor(); + assert(Region.PrevBB && "PrevBB is nullptr?"); + if (Region.PrevBB == InitialStart) { + BasicBlock *NewPrev = InitialStart->getSinglePredecessor(); + Instruction *BI = NewPrev->getTerminator(); + BI->eraseFromParent(); + moveBBContents(*InitialStart, *NewPrev); + Region.PrevBB = NewPrev; + InitialStart->eraseFromParent(); + } + Region.StartBB = RewrittenBB; Region.EndBB = RewrittenBB; @@ -1608,6 +1973,7 @@ bool IROutliner::extractSection(OutlinableRegion &Region) { unsigned IROutliner::doOutline(Module &M) { // Find the possible similarity sections. + InstructionClassifier.EnableBranches = !DisableBranches; IRSimilarityIdentifier &Identifier = getIRSI(M); SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity(); @@ -1622,12 +1988,17 @@ unsigned IROutliner::doOutline(Module &M) { return LHS[0].getLength() * LHS.size() > RHS[0].getLength() * RHS.size(); }); + // Creating OutlinableGroups for each SimilarityCandidate to be used in + // each of the following for loops to avoid making an allocator. + std::vector<OutlinableGroup> PotentialGroups(SimilarityCandidates.size()); DenseSet<unsigned> NotSame; - std::vector<Function *> FuncsToRemove; + std::vector<OutlinableGroup *> NegativeCostGroups; + std::vector<OutlinableRegion *> OutlinedRegions; // Iterate over the possible sets of similarity. + unsigned PotentialGroupIdx = 0; for (SimilarityGroup &CandidateVec : SimilarityCandidates) { - OutlinableGroup CurrentGroup; + OutlinableGroup &CurrentGroup = PotentialGroups[PotentialGroupIdx++]; // Remove entries that were previously outlined pruneIncompatibleRegions(CandidateVec, CurrentGroup); @@ -1649,20 +2020,31 @@ unsigned IROutliner::doOutline(Module &M) { // Create a CodeExtractor for each outlinable region. Identify inputs and // outputs for each section using the code extractor and create the argument // types for the Aggregate Outlining Function. - std::vector<OutlinableRegion *> OutlinedRegions; + OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { // Break the outlinable region out of its parent BasicBlock into its own // BasicBlocks (see function implementation). OS->splitCandidate(); - std::vector<BasicBlock *> BE = {OS->StartBB}; + + // There's a chance that when the region is split, extra instructions are + // added to the region. This makes the region no longer viable + // to be split, so we ignore it for outlining. + if (!OS->CandidateSplit) + continue; + + SmallVector<BasicBlock *> BE; + DenseSet<BasicBlock *> BBSet; + OS->Candidate->getBasicBlocks(BBSet, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); findAddInputsOutputs(M, *OS, NotSame); if (!OS->IgnoreRegion) OutlinedRegions.push_back(OS); - else - OS->reattachCandidate(); + + // We recombine the blocks together now that we have gathered all the + // needed information. 
+ OS->reattachCandidate(); } CurrentGroup.Regions = std::move(OutlinedRegions); @@ -1675,12 +2057,11 @@ unsigned IROutliner::doOutline(Module &M) { if (CostModel) findCostBenefit(M, CurrentGroup); - // If we are adhering to the cost model, reattach all the candidates + // If we are adhering to the cost model, skip those groups where the cost + // outweighs the benefits. if (CurrentGroup.Cost >= CurrentGroup.Benefit && CostModel) { - for (OutlinableRegion *OS : CurrentGroup.Regions) - OS->reattachCandidate(); - OptimizationRemarkEmitter &ORE = getORE( - *CurrentGroup.Regions[0]->Candidate->getFunction()); + OptimizationRemarkEmitter &ORE = + getORE(*CurrentGroup.Regions[0]->Candidate->getFunction()); ORE.emit([&]() { IRSimilarityCandidate *C = CurrentGroup.Regions[0]->Candidate; OptimizationRemarkMissed R(DEBUG_TYPE, "WouldNotDecreaseSize", @@ -1704,12 +2085,70 @@ unsigned IROutliner::doOutline(Module &M) { continue; } + NegativeCostGroups.push_back(&CurrentGroup); + } + + ExtractorAllocator.DestroyAll(); + + if (NegativeCostGroups.size() > 1) + stable_sort(NegativeCostGroups, + [](const OutlinableGroup *LHS, const OutlinableGroup *RHS) { + return LHS->Benefit - LHS->Cost > RHS->Benefit - RHS->Cost; + }); + + std::vector<Function *> FuncsToRemove; + for (OutlinableGroup *CG : NegativeCostGroups) { + OutlinableGroup &CurrentGroup = *CG; + + OutlinedRegions.clear(); + for (OutlinableRegion *Region : CurrentGroup.Regions) { + // We check whether our region is compatible with what has already been + // outlined, and whether we need to ignore this item. + if (!isCompatibleWithAlreadyOutlinedCode(*Region)) + continue; + OutlinedRegions.push_back(Region); + } + + if (OutlinedRegions.size() < 2) + continue; + + // Reestimate the cost and benefit of the OutlinableGroup. Continue only if + // we are still outlining enough regions to make up for the added cost. + CurrentGroup.Regions = std::move(OutlinedRegions); + if (CostModel) { + CurrentGroup.Benefit = 0; + CurrentGroup.Cost = 0; + findCostBenefit(M, CurrentGroup); + if (CurrentGroup.Cost >= CurrentGroup.Benefit) + continue; + } + OutlinedRegions.clear(); + for (OutlinableRegion *Region : CurrentGroup.Regions) { + Region->splitCandidate(); + if (!Region->CandidateSplit) + continue; + OutlinedRegions.push_back(Region); + } + + CurrentGroup.Regions = std::move(OutlinedRegions); + if (CurrentGroup.Regions.size() < 2) { + for (OutlinableRegion *R : CurrentGroup.Regions) + R->reattachCandidate(); + continue; + } + LLVM_DEBUG(dbgs() << "Outlining regions with cost " << CurrentGroup.Cost << " and benefit " << CurrentGroup.Benefit << "\n"); // Create functions out of all the sections, and mark them as outlined. 
OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { + SmallVector<BasicBlock *> BE; + DenseSet<BasicBlock *> BBSet; + OS->Candidate->getBasicBlocks(BBSet, BE); + OS->CE = new (ExtractorAllocator.Allocate()) + CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, + false, "outlined"); bool FunctionOutlined = extractSection(*OS); if (FunctionOutlined) { unsigned StartIdx = OS->Candidate->getStartIdx(); @@ -1767,6 +2206,7 @@ bool IROutliner::run(Module &M) { } // Pass Manager Boilerplate +namespace { class IROutlinerLegacyPass : public ModulePass { public: static char ID; @@ -1782,6 +2222,7 @@ public: bool runOnModule(Module &M) override; }; +} // namespace bool IROutlinerLegacyPass::runOnModule(Module &M) { if (skipModule(M)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp index 59260af88832..992c2b292e1e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp @@ -31,9 +31,11 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InlineOrder.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/ReplayInlineAdvisor.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h" @@ -96,9 +98,53 @@ static cl::opt<std::string> CGSCCInlineReplayFile( "cgscc-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc( "Optimization remarks file containing inline remarks to be replayed " - "by inlining from cgscc inline remarks."), + "by cgscc inlining."), cl::Hidden); +static cl::opt<ReplayInlinerSettings::Scope> CGSCCInlineReplayScope( + "cgscc-inline-replay-scope", + cl::init(ReplayInlinerSettings::Scope::Function), + cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", + "Replay on functions that have remarks associated " + "with them (default)"), + clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", + "Replay on the entire module")), + cl::desc("Whether inline replay should be applied to the entire " + "Module or just the Functions (default) that are present as " + "callers in remarks during cgscc inlining."), + cl::Hidden); + +static cl::opt<ReplayInlinerSettings::Fallback> CGSCCInlineReplayFallback( + "cgscc-inline-replay-fallback", + cl::init(ReplayInlinerSettings::Fallback::Original), + cl::values( + clEnumValN( + ReplayInlinerSettings::Fallback::Original, "Original", + "All decisions not in replay send to original advisor (default)"), + clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, + "AlwaysInline", "All decisions not in replay are inlined"), + clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", + "All decisions not in replay are not inlined")), + cl::desc( + "How cgscc inline replay treats sites that don't come from the replay. 
" + "Original: defers to original advisor, AlwaysInline: inline all sites " + "not in replay, NeverInline: inline no sites not in replay"), + cl::Hidden); + +static cl::opt<CallSiteFormat::Format> CGSCCInlineReplayFormat( + "cgscc-inline-replay-format", + cl::init(CallSiteFormat::Format::LineColumnDiscriminator), + cl::values( + clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), + clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", + "<Line Number>:<Column Number>"), + clEnumValN(CallSiteFormat::Format::LineDiscriminator, + "LineDiscriminator", "<Line Number>.<Discriminator>"), + clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, + "LineColumnDiscriminator", + "<Line Number>:<Column Number>.<Discriminator> (default)")), + cl::desc("How cgscc inline replay file is formatted"), cl::Hidden); + static cl::opt<bool> InlineEnablePriorityOrder( "inline-enable-priority-order", cl::Hidden, cl::init(false), cl::desc("Enable the priority inline order for the inliner")); @@ -463,7 +509,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, } ++NumInlined; - emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC); + emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC); // If inlining this function gave us any new call sites, throw them // onto our worklist to process. They are useful inline candidates. @@ -661,9 +707,12 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM, std::make_unique<DefaultInlineAdvisor>(M, FAM, getInlineParams()); if (!CGSCCInlineReplayFile.empty()) - OwnedAdvisor = std::make_unique<ReplayInlineAdvisor>( + OwnedAdvisor = getReplayInlineAdvisor( M, FAM, M.getContext(), std::move(OwnedAdvisor), - CGSCCInlineReplayFile, + ReplayInlinerSettings{CGSCCInlineReplayFile, + CGSCCInlineReplayScope, + CGSCCInlineReplayFallback, + {CGSCCInlineReplayFormat}}, /*EmitRemarks=*/true); return *OwnedAdvisor; @@ -674,153 +723,6 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM, return *IAA->getAdvisor(); } -template <typename T> class InlineOrder { -public: - using reference = T &; - using const_reference = const T &; - - virtual ~InlineOrder() {} - - virtual size_t size() = 0; - - virtual void push(const T &Elt) = 0; - - virtual T pop() = 0; - - virtual const_reference front() = 0; - - virtual void erase_if(function_ref<bool(T)> Pred) = 0; - - bool empty() { return !size(); } -}; - -template <typename T, typename Container = SmallVector<T, 16>> -class DefaultInlineOrder : public InlineOrder<T> { - using reference = T &; - using const_reference = const T &; - -public: - size_t size() override { return Calls.size() - FirstIndex; } - - void push(const T &Elt) override { Calls.push_back(Elt); } - - T pop() override { - assert(size() > 0); - return Calls[FirstIndex++]; - } - - const_reference front() override { - assert(size() > 0); - return Calls[FirstIndex]; - } - - void erase_if(function_ref<bool(T)> Pred) override { - Calls.erase(std::remove_if(Calls.begin() + FirstIndex, Calls.end(), Pred), - Calls.end()); - } - -private: - Container Calls; - size_t FirstIndex = 0; -}; - -class Priority { -public: - Priority(int Size) : Size(Size) {} - - static bool isMoreDesirable(const Priority &S1, const Priority &S2) { - return S1.Size < S2.Size; - } - - static Priority evaluate(CallBase *CB) { - Function *Callee = CB->getCalledFunction(); - return Priority(Callee->getInstructionCount()); - } - - int Size; -}; - -template <typename PriorityT> -class PriorityInlineOrder : public 
InlineOrder<std::pair<CallBase *, int>> { - using T = std::pair<CallBase *, int>; - using HeapT = std::pair<CallBase *, PriorityT>; - using reference = T &; - using const_reference = const T &; - - static bool cmp(const HeapT &P1, const HeapT &P2) { - return PriorityT::isMoreDesirable(P2.second, P1.second); - } - - // A call site could become less desirable for inlining because of the size - // growth from prior inlining into the callee. This method is used to lazily - // update the desirability of a call site if it's decreasing. It is only - // called on pop() or front(), not every time the desirability changes. When - // the desirability of the front call site decreases, an updated one would be - // pushed right back into the heap. For simplicity, those cases where - // the desirability of a call site increases are ignored here. - void adjust() { - bool Changed = false; - do { - CallBase *CB = Heap.front().first; - const PriorityT PreviousGoodness = Heap.front().second; - const PriorityT CurrentGoodness = PriorityT::evaluate(CB); - Changed = PriorityT::isMoreDesirable(PreviousGoodness, CurrentGoodness); - if (Changed) { - std::pop_heap(Heap.begin(), Heap.end(), cmp); - Heap.pop_back(); - Heap.push_back({CB, CurrentGoodness}); - std::push_heap(Heap.begin(), Heap.end(), cmp); - } - } while (Changed); - } - -public: - size_t size() override { return Heap.size(); } - - void push(const T &Elt) override { - CallBase *CB = Elt.first; - const int InlineHistoryID = Elt.second; - const PriorityT Goodness = PriorityT::evaluate(CB); - - Heap.push_back({CB, Goodness}); - std::push_heap(Heap.begin(), Heap.end(), cmp); - InlineHistoryMap[CB] = InlineHistoryID; - } - - T pop() override { - assert(size() > 0); - adjust(); - - CallBase *CB = Heap.front().first; - T Result = std::make_pair(CB, InlineHistoryMap[CB]); - InlineHistoryMap.erase(CB); - std::pop_heap(Heap.begin(), Heap.end(), cmp); - Heap.pop_back(); - return Result; - } - - const_reference front() override { - assert(size() > 0); - adjust(); - - CallBase *CB = Heap.front().first; - return *InlineHistoryMap.find(CB); - } - - void erase_if(function_ref<bool(T)> Pred) override { - auto PredWrapper = [=](HeapT P) -> bool { - return Pred(std::make_pair(P.first, 0)); - }; - Heap.erase(std::remove_if(Heap.begin(), Heap.end(), PredWrapper), - Heap.end()); - std::make_heap(Heap.begin(), Heap.end(), cmp); - } - -private: - SmallVector<HeapT, 16> Heap; - DenseMap<CallBase *, int> InlineHistoryMap; -}; - PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR) { @@ -868,7 +770,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // incrementally maknig a single function grow in a super linear fashion. std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls; if (InlineEnablePriorityOrder) - Calls = std::make_unique<PriorityInlineOrder<Priority>>(); + Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>(); else Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>(); assert(Calls != nullptr && "Expected an initialized InlineOrder"); @@ -972,8 +874,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, continue; } - auto Advice = Advisor.getAdvice(*CB, OnlyMandatory); + std::unique_ptr<InlineAdvice> Advice = + Advisor.getAdvice(*CB, OnlyMandatory); + // Check whether we want to inline this callsite. 
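
The advice object returned here follows a small ask-act-record protocol that both this pass and the new module inliner below use. A condensed sketch of that lifecycle in the surrounding loop's context (the InlineFunctionInfo plumbing and remark emission are omitted):

    // One call site, one advice object: ask the advisor, act on the
    // recommendation, then record what actually happened so the advisor and
    // any replay tooling can learn the outcome.
    std::unique_ptr<InlineAdvice> Advice = Advisor.getAdvice(*CB, OnlyMandatory);
    if (!Advice || !Advice->isInliningRecommended()) {
      if (Advice)
        Advice->recordUnattemptedInlining();        // advised against, not tried
    } else {
      InlineResult Res = InlineFunction(*CB, IFI);
      if (!Res.isSuccess())
        Advice->recordUnsuccessfulInlining(Res);    // attempted, but it failed
      else if (CalleeWasDeleted)
        Advice->recordInliningWithCalleeDeleted();  // inlined and callee dropped
      else
        Advice->recordInlining();                   // inlined, callee kept
    }
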
+ if (!Advice) + continue; + if (!Advice->isInliningRecommended()) { Advice->recordUnattemptedInlining(); continue; @@ -1104,6 +1011,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, UR.InlinedInternalEdges.insert({&N, OldC}); } InlinedCallees.clear(); + + // Invalidate analyses for this function now so that we don't have to + // invalidate analyses for all functions in this SCC later. + FAM.invalidate(F, PreservedAnalyses::none()); } // Now that we've finished inlining all of the calls across this SCC, delete @@ -1147,10 +1058,12 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!Changed) return PreservedAnalyses::all(); + PreservedAnalyses PA; // Even if we change the IR, we update the core CGSCC data structures and so // can preserve the proxy to the function analysis manager. - PreservedAnalyses PA; PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); + // We have already invalidated all analyses on modified functions. + PA.preserveSet<AllAnalysesOn<Function>>(); return PA; } @@ -1173,7 +1086,11 @@ ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params, PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, ModuleAnalysisManager &MAM) { auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M); - if (!IAA.tryCreate(Params, Mode, CGSCCInlineReplayFile)) { + if (!IAA.tryCreate(Params, Mode, + {CGSCCInlineReplayFile, + CGSCCInlineReplayScope, + CGSCCInlineReplayFallback, + {CGSCCInlineReplayFormat}})) { M.getContext().emitError( "Could not setup Inlining Advisor for the requested " "mode and/or options"); @@ -1192,10 +1109,39 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, else MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations))); + + MPM.addPass(std::move(AfterCGMPM)); MPM.run(M, MAM); - IAA.clear(); + // Discard the InlineAdvisor, a subsequent inlining session should construct + // its own. + auto PA = PreservedAnalyses::all(); + PA.abandon<InlineAdvisorAnalysis>(); + return PA; +} - // The ModulePassManager has already taken care of invalidating analyses. - return PreservedAnalyses::all(); +void InlinerPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<InlinerPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + if (OnlyMandatory) + OS << "<only-mandatory>"; +} + +void ModuleInlinerWrapperPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + // Print some info about passes added to the wrapper. This is however + // incomplete as InlineAdvisorAnalysis part isn't included (which also depends + // on Params and Mode). + if (!MPM.isEmpty()) { + MPM.printPipeline(OS, MapClassName2PassName); + OS << ","; + } + OS << "cgscc("; + if (MaxDevirtIterations != 0) + OS << "devirt<" << MaxDevirtIterations << ">("; + PM.printPipeline(OS, MapClassName2PassName); + if (MaxDevirtIterations != 0) + OS << ")"; + OS << ")"; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp index db3b4384ce67..692e445cb7cb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp @@ -201,21 +201,6 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) { AlwaysPreserved.insert(V->getName()); } - // Mark all functions not in the api as internal. 
- IsWasm = Triple(M.getTargetTriple()).isOSBinFormatWasm(); - for (Function &I : M) { - if (!maybeInternalize(I, ComdatMap)) - continue; - Changed = true; - - if (ExternalNode) - // Remove a callgraph edge from the external node to this function. - ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); - - ++NumFunctions; - LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n"); - } - // Never internalize the llvm.used symbol. It is used to implement // attribute((used)). // FIXME: Shouldn't this just filter on llvm.metadata section?? @@ -237,6 +222,21 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) { else AlwaysPreserved.insert("__stack_chk_guard"); + // Mark all functions not in the api as internal. + IsWasm = Triple(M.getTargetTriple()).isOSBinFormatWasm(); + for (Function &I : M) { + if (!maybeInternalize(I, ComdatMap)) + continue; + Changed = true; + + if (ExternalNode) + // Remove a callgraph edge from the external node to this function. + ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); + + ++NumFunctions; + LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n"); + } + // Mark all global variables with initializers that are not in the api as // internal as well. for (auto &GV : M.globals()) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp index a497c0390bce..d9a59dd35fde 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -283,3 +283,13 @@ PreservedAnalyses LoopExtractorPass::run(Module &M, ModuleAnalysisManager &AM) { PA.preserve<LoopAnalysis>(); return PA; } + +void LoopExtractorPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<LoopExtractorPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << "<"; + if (NumLoops == 1) + OS << "single"; + OS << ">"; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index b492b200c6d5..f78971f0e586 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -342,7 +342,8 @@ private: struct ScopedSaveAliaseesAndUsed { Module &M; SmallVector<GlobalValue *, 4> Used, CompilerUsed; - std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases; + std::vector<std::pair<GlobalAlias *, Function *>> FunctionAliases; + std::vector<std::pair<GlobalIFunc *, Function *>> ResolverIFuncs; ScopedSaveAliaseesAndUsed(Module &M) : M(M) { // The users of this class want to replace all function references except @@ -362,13 +363,16 @@ struct ScopedSaveAliaseesAndUsed { if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true)) GV->eraseFromParent(); - for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) { + for (auto &GA : M.aliases()) { // FIXME: This should look past all aliases not just interposable ones, // see discussion on D65118. 
- if (auto *F = - dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts())) - FunctionAliases.push_back({&GIS, F}); + if (auto *F = dyn_cast<Function>(GA.getAliasee()->stripPointerCasts())) + FunctionAliases.push_back({&GA, F}); } + + for (auto &GI : M.ifuncs()) + if (auto *F = dyn_cast<Function>(GI.getResolver()->stripPointerCasts())) + ResolverIFuncs.push_back({&GI, F}); } ~ScopedSaveAliaseesAndUsed() { @@ -376,8 +380,15 @@ struct ScopedSaveAliaseesAndUsed { appendToCompilerUsed(M, CompilerUsed); for (auto P : FunctionAliases) - P.first->setIndirectSymbol( + P.first->setAliasee( ConstantExpr::getBitCast(P.second, P.first->getType())); + + for (auto P : ResolverIFuncs) { + // This does not preserve pointer casts that may have been stripped by the + // constructor, but the resolver's type is different from that of the + // ifunc anyway. + P.first->setResolver(P.second); + } } }; @@ -1550,17 +1561,28 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), ConstantInt::get(IntPtrTy, I)}), F->getType()); - if (Functions[I]->isExported()) { - if (IsJumpTableCanonical) { - ExportSummary->cfiFunctionDefs().insert(std::string(F->getName())); - } else { - GlobalAlias *JtAlias = GlobalAlias::create( - F->getValueType(), 0, GlobalValue::ExternalLinkage, - F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); + + const bool IsExported = Functions[I]->isExported(); + if (!IsJumpTableCanonical) { + GlobalValue::LinkageTypes LT = IsExported + ? GlobalValue::ExternalLinkage + : GlobalValue::InternalLinkage; + GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, + F->getName() + ".cfi_jt", + CombinedGlobalElemPtr, &M); + if (IsExported) JtAlias->setVisibility(GlobalValue::HiddenVisibility); + else + appendToUsed(M, {JtAlias}); + } + + if (IsExported) { + if (IsJumpTableCanonical) + ExportSummary->cfiFunctionDefs().insert(std::string(F->getName())); + else ExportSummary->cfiFunctionDecls().insert(std::string(F->getName())); - } } + if (!IsJumpTableCanonical) { if (F->hasExternalWeakLinkage()) replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, @@ -1751,11 +1773,7 @@ static bool isDirectCall(Use& U) { void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical) { SmallSetVector<Constant *, 4> Constants; - auto UI = Old->use_begin(), E = Old->use_end(); - for (; UI != E;) { - Use &U = *UI; - ++UI; - + for (Use &U : llvm::make_early_inc_range(Old->uses())) { // Skip block addresses if (isa<BlockAddress>(U.getUser())) continue; @@ -1792,12 +1810,11 @@ bool LowerTypeTestsModule::lower() { M.getFunction(Intrinsic::getName(Intrinsic::type_test)); if (DropTypeTests && TypeTestFunc) { - for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end(); - UI != UE;) { - auto *CI = cast<CallInst>((*UI++).getUser()); + for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) { + auto *CI = cast<CallInst>(U.getUser()); // Find and erase llvm.assume intrinsics for this llvm.type.test call. - for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;) - if (auto *Assume = dyn_cast<AssumeInst>((*CIU++).getUser())) + for (Use &CIU : llvm::make_early_inc_range(CI->uses())) + if (auto *Assume = dyn_cast<AssumeInst>(CIU.getUser())) Assume->eraseFromParent(); // If the assume was merged with another assume, we might have a use on a // phi (which will feed the assume). 
Simply replace the use on the phi @@ -1835,13 +1852,9 @@ bool LowerTypeTestsModule::lower() { return false; if (ImportSummary) { - if (TypeTestFunc) { - for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end(); - UI != UE;) { - auto *CI = cast<CallInst>((*UI++).getUser()); - importTypeTest(CI); - } - } + if (TypeTestFunc) + for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) + importTypeTest(cast<CallInst>(U.getUser())); if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty()) report_fatal_error( @@ -2100,11 +2113,11 @@ bool LowerTypeTestsModule::lower() { auto CI = cast<CallInst>(U.getUser()); std::vector<GlobalTypeMember *> Targets; - if (CI->getNumArgOperands() % 2 != 1) + if (CI->arg_size() % 2 != 1) report_fatal_error("number of arguments should be odd"); GlobalClassesTy::member_iterator CurSet; - for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) { + for (unsigned I = 1; I != CI->arg_size(); I += 2) { int64_t Offset; auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset( CI->getOperand(I), Offset, M.getDataLayout())); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 9e6dd879ac01..97ef872c5499 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -463,17 +463,15 @@ bool MergeFunctions::runOnModule(Module &M) { // Replace direct callers of Old with New. void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); - for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) { - Use *U = &*UI; - ++UI; - CallBase *CB = dyn_cast<CallBase>(U->getUser()); - if (CB && CB->isCallee(U)) { + for (Use &U : llvm::make_early_inc_range(Old->uses())) { + CallBase *CB = dyn_cast<CallBase>(U.getUser()); + if (CB && CB->isCallee(&U)) { // Do not copy attributes from the called function to the call-site. // Function comparison ensures that the attributes are the same up to // type congruences in byval(), in which case we need to keep the byval // type of the call-site, not the callee function. remove(CB->getFunction()); - U->set(BitcastNew); + U.set(BitcastNew); } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ModuleInliner.cpp new file mode 100644 index 000000000000..ebf080e87c3b --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ModuleInliner.cpp @@ -0,0 +1,354 @@ +//===- ModuleInliner.cpp - Code related to module inliner -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the mechanics required to implement inlining without +// missing any calls in the module level. It doesn't need any infromation about +// SCC or call graph, which is different from the SCC inliner. The decisions of +// which calls are profitable to inline are implemented elsewhere. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ModuleInliner.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InlineAdvisor.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InlineOrder.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/CallPromotionUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include <cassert> +#include <functional> + +using namespace llvm; + +#define DEBUG_TYPE "module-inline" + +STATISTIC(NumInlined, "Number of functions inlined"); +STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); + +static cl::opt<bool> InlineEnablePriorityOrder( + "module-inline-enable-priority-order", cl::Hidden, cl::init(true), + cl::desc("Enable the priority inline order for the module inliner")); + +/// Return true if the specified inline history ID +/// indicates an inline history that includes the specified function. +static bool inlineHistoryIncludes( + Function *F, int InlineHistoryID, + const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) { + while (InlineHistoryID != -1) { + assert(unsigned(InlineHistoryID) < InlineHistory.size() && + "Invalid inline history ID"); + if (InlineHistory[InlineHistoryID].first == F) + return true; + InlineHistoryID = InlineHistory[InlineHistoryID].second; + } + return false; +} + +InlineAdvisor &ModuleInlinerPass::getAdvisor(const ModuleAnalysisManager &MAM, + FunctionAnalysisManager &FAM, + Module &M) { + if (OwnedAdvisor) + return *OwnedAdvisor; + + auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M); + if (!IAA) { + // It should still be possible to run the inliner as a stand-alone module + // pass, for test scenarios. In that case, we default to the + // DefaultInlineAdvisor, which doesn't need to keep state between module + // pass runs. It also uses just the default InlineParams. In this case, we + // need to use the provided FAM, which is valid for the duration of the + // inliner pass, and thus the lifetime of the owned advisor. The one we + // would get from the MAM can be invalidated as a result of the inliner's + // activity. 
+ OwnedAdvisor = std::make_unique<DefaultInlineAdvisor>(M, FAM, Params); + + return *OwnedAdvisor; + } + assert(IAA->getAdvisor() && + "Expected a present InlineAdvisorAnalysis also have an " + "InlineAdvisor initialized"); + return *IAA->getAdvisor(); +} + +static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) { + LibFunc LF; + + // Either this is a normal library function or a "vectorizable" + // function. Not using the VFDatabase here because this query + // is related only to libraries handled via the TLI. + return TLI.getLibFunc(F, LF) || + TLI.isKnownVectorFunctionInLibrary(F.getName()); +} + +PreservedAnalyses ModuleInlinerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + LLVM_DEBUG(dbgs() << "---- Module Inliner is Running ---- \n"); + + auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M); + if (!IAA.tryCreate(Params, Mode, {})) { + M.getContext().emitError( + "Could not setup Inlining Advisor for the requested " + "mode and/or options"); + return PreservedAnalyses::all(); + } + + bool Changed = false; + + ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M); + + FunctionAnalysisManager &FAM = + MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; + + InlineAdvisor &Advisor = getAdvisor(MAM, FAM, M); + Advisor.onPassEntry(); + + auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); }); + + // In the module inliner, a priority-based worklist is used for calls across + // the entire Module. With this module inliner, the inline order is not + // limited to bottom-up order. More globally scope inline order is enabled. + // Also, the inline deferral logic become unnecessary in this module inliner. + // It is possible to use other priority heuristics, e.g. profile-based + // heuristic. + // + // TODO: Here is a huge amount duplicate code between the module inliner and + // the SCC inliner, which need some refactoring. + std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls; + if (InlineEnablePriorityOrder) + Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>(); + else + Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>(); + assert(Calls != nullptr && "Expected an initialized InlineOrder"); + + // Populate the initial list of calls in this module. + for (Function &F : M) { + auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + // We want to generally process call sites top-down in order for + // simplifications stemming from replacing the call with the returned value + // after inlining to be visible to subsequent inlining decisions. + // FIXME: Using instructions sequence is a really bad way to do this. + // Instead we should do an actual RPO walk of the function body. 
+ for (Instruction &I : instructions(F)) + if (auto *CB = dyn_cast<CallBase>(&I)) + if (Function *Callee = CB->getCalledFunction()) { + if (!Callee->isDeclaration()) + Calls->push({CB, -1}); + else if (!isa<IntrinsicInst>(I)) { + using namespace ore; + setInlineRemark(*CB, "unavailable definition"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I) + << NV("Callee", Callee) << " will not be inlined into " + << NV("Caller", CB->getCaller()) + << " because its definition is unavailable" + << setIsVerbose(); + }); + } + } + } + if (Calls->empty()) + return PreservedAnalyses::all(); + + // When inlining a callee produces new call sites, we want to keep track of + // the fact that they were inlined from the callee. This allows us to avoid + // infinite inlining in some obscure cases. To represent this, we use an + // index into the InlineHistory vector. + SmallVector<std::pair<Function *, int>, 16> InlineHistory; + + // Track a set vector of inlined callees so that we can augment the caller + // with all of their edges in the call graph before pruning out the ones that + // got simplified away. + SmallSetVector<Function *, 4> InlinedCallees; + + // Track the dead functions to delete once finished with inlining calls. We + // defer deleting these to make it easier to handle the call graph updates. + SmallVector<Function *, 4> DeadFunctions; + + // Loop forward over all of the calls. + while (!Calls->empty()) { + // We expect the calls to typically be batched with sequences of calls that + // have the same caller, so we first set up some shared infrastructure for + // this caller. We also do any pruning we can at this layer on the caller + // alone. + Function &F = *Calls->front().first->getCaller(); + + LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n" + << " Function size: " << F.getInstructionCount() + << "\n"); + + auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { + return FAM.getResult<AssumptionAnalysis>(F); + }; + + // Now process as many calls as we have within this caller in the sequence. + // We bail out as soon as the caller has to change so we can + // prepare the context of that new caller. + bool DidInline = false; + while (!Calls->empty() && Calls->front().first->getCaller() == &F) { + auto P = Calls->pop(); + CallBase *CB = P.first; + const int InlineHistoryID = P.second; + Function &Callee = *CB->getCalledFunction(); + + if (InlineHistoryID != -1 && + inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) { + setInlineRemark(*CB, "recursive"); + continue; + } + + auto Advice = Advisor.getAdvice(*CB, /*OnlyMandatory*/ false); + // Check whether we want to inline this callsite. + if (!Advice->isInliningRecommended()) { + Advice->recordUnattemptedInlining(); + continue; + } + + // Setup the data structure used to plumb customization into the + // `InlineFunction` routine. + InlineFunctionInfo IFI( + /*cg=*/nullptr, GetAssumptionCache, PSI, + &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())), + &FAM.getResult<BlockFrequencyAnalysis>(Callee)); + + InlineResult IR = + InlineFunction(*CB, IFI, &FAM.getResult<AAManager>(*CB->getCaller())); + if (!IR.isSuccess()) { + Advice->recordUnsuccessfulInlining(IR); + continue; + } + + DidInline = true; + InlinedCallees.insert(&Callee); + ++NumInlined; + + LLVM_DEBUG(dbgs() << " Size after inlining: " + << F.getInstructionCount() << "\n"); + + // Add any new callsites to defined functions to the worklist. 
+ if (!IFI.InlinedCallSites.empty()) { + int NewHistoryID = InlineHistory.size(); + InlineHistory.push_back({&Callee, InlineHistoryID}); + + for (CallBase *ICB : reverse(IFI.InlinedCallSites)) { + Function *NewCallee = ICB->getCalledFunction(); + if (!NewCallee) { + // Try to promote an indirect (virtual) call without waiting for + // the post-inline cleanup and the next DevirtSCCRepeatedPass + // iteration because the next iteration may not happen and we may + // miss inlining it. + if (tryPromoteCall(*ICB)) + NewCallee = ICB->getCalledFunction(); + } + if (NewCallee) + if (!NewCallee->isDeclaration()) + Calls->push({ICB, NewHistoryID}); + } + } + + // Merge the attributes based on the inlining. + AttributeFuncs::mergeAttributesForInlining(F, Callee); + + // For local functions, check whether this makes the callee trivially + // dead. In that case, we can drop the body of the function eagerly + // which may reduce the number of callers of other functions to one, + // changing inline cost thresholds. + bool CalleeWasDeleted = false; + if (Callee.hasLocalLinkage()) { + // To check this we also need to nuke any dead constant uses (perhaps + // made dead by this operation on other functions). + Callee.removeDeadConstantUsers(); + // if (Callee.use_empty() && !CG.isLibFunction(Callee)) { + if (Callee.use_empty() && !isKnownLibFunction(Callee, GetTLI(Callee))) { + Calls->erase_if([&](const std::pair<CallBase *, int> &Call) { + return Call.first->getCaller() == &Callee; + }); + // Clear the body and queue the function itself for deletion when we + // finish inlining. + // Note that after this point, it is an error to do anything other + // than use the callee's address or delete it. + Callee.dropAllReferences(); + assert(!is_contained(DeadFunctions, &Callee) && + "Cannot put cause a function to become dead twice!"); + DeadFunctions.push_back(&Callee); + CalleeWasDeleted = true; + } + } + if (CalleeWasDeleted) + Advice->recordInliningWithCalleeDeleted(); + else + Advice->recordInlining(); + } + + if (!DidInline) + continue; + Changed = true; + + InlinedCallees.clear(); + } + + // Now that we've finished inlining all of the calls across this module, + // delete all of the trivially dead functions. + // + // Note that this walks a pointer set which has non-deterministic order but + // that is OK as all we do is delete things and add pointers to unordered + // sets. + for (Function *DeadF : DeadFunctions) { + // Clear out any cached analyses. + FAM.clear(*DeadF, DeadF->getName()); + + // And delete the actual function from the module. + // The Advisor may use Function pointers to efficiently index various + // internal maps, e.g. for memoization. Function cleanup passes like + // argument promotion create new functions. It is possible for a new + // function to be allocated at the address of a deleted function. We could + // index using names, but that's inefficient. Alternatively, we let the + // Advisor free the functions when it sees fit. 
+ DeadF->getBasicBlockList().clear(); + M.getFunctionList().remove(DeadF); + + ++NumDeleted; + } + + if (!Changed) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 75eec25f5807..f342c35fa283 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -33,6 +34,8 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -41,6 +44,8 @@ #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/CodeExtractor.h" +#include <algorithm> + using namespace llvm; using namespace omp; @@ -72,6 +77,46 @@ static cl::opt<bool> HideMemoryTransferLatency( " transfers"), cl::Hidden, cl::init(false)); +static cl::opt<bool> DisableOpenMPOptDeglobalization( + "openmp-opt-disable-deglobalization", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations involving deglobalization."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> DisableOpenMPOptSPMDization( + "openmp-opt-disable-spmdization", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations involving SPMD-ization."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> DisableOpenMPOptFolding( + "openmp-opt-disable-folding", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> DisableOpenMPOptStateMachineRewrite( + "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore, + cl::desc("Disable OpenMP optimizations that replace the state machine."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> PrintModuleAfterOptimizations( + "openmp-opt-print-module", cl::ZeroOrMore, + cl::desc("Print the current module after OpenMP optimizations."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> AlwaysInlineDeviceFunctions( + "openmp-opt-inline-device", cl::ZeroOrMore, + cl::desc("Inline all applicible functions on the device."), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> + EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore, + cl::desc("Enables more verbose remarks."), cl::Hidden, + cl::init(false)); + +static cl::opt<unsigned> + SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, + cl::desc("Maximal number of attributor iterations."), + cl::init(256)); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -328,7 +373,7 @@ struct OMPInformationCache : public InformationCache { if (F->arg_size() != RTFArgTypes.size()) return false; - auto RTFTyIt = RTFArgTypes.begin(); + auto *RTFTyIt = RTFArgTypes.begin(); for (Argument &Arg : F->args()) { if (Arg.getType() != *RTFTyIt) return false; @@ -503,7 +548,7 @@ struct KernelInfoState : AbstractState { /// State to track if we are in SPMD-mode, assumed or know, and why we decided /// we cannot be. 
If it is assumed, then RequiresFullRuntime should also be /// false. - BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker; + BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker; /// The __kmpc_target_init call in this kernel, if any. If we find more than /// one we abort as the kernel is malformed. @@ -542,7 +587,9 @@ struct KernelInfoState : AbstractState { /// See AbstractState::indicatePessimisticFixpoint(...) ChangeStatus indicatePessimisticFixpoint() override { IsAtFixpoint = true; + ReachingKernelEntries.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + ReachedKnownParallelRegions.indicatePessimisticFixpoint(); ReachedUnknownParallelRegions.indicatePessimisticFixpoint(); return ChangeStatus::CHANGED; } @@ -550,6 +597,10 @@ struct KernelInfoState : AbstractState { /// See AbstractState::indicateOptimisticFixpoint(...) ChangeStatus indicateOptimisticFixpoint() override { IsAtFixpoint = true; + ReachingKernelEntries.indicateOptimisticFixpoint(); + SPMDCompatibilityTracker.indicateOptimisticFixpoint(); + ReachedKnownParallelRegions.indicateOptimisticFixpoint(); + ReachedUnknownParallelRegions.indicateOptimisticFixpoint(); return ChangeStatus::UNCHANGED; } @@ -569,6 +620,12 @@ struct KernelInfoState : AbstractState { return true; } + /// Returns true if this kernel contains any OpenMP parallel regions. + bool mayContainParallelRegion() { + return !ReachedKnownParallelRegions.empty() || + !ReachedUnknownParallelRegions.empty(); + } + /// Return empty set as the best state of potential values. static KernelInfoState getBestState() { return KernelInfoState(true); } @@ -584,12 +641,14 @@ struct KernelInfoState : AbstractState { // Do not merge two different _init and _deinit call sites. if (KIS.KernelInitCB) { if (KernelInitCB && KernelInitCB != KIS.KernelInitCB) - indicatePessimisticFixpoint(); + llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt " + "assumptions."); KernelInitCB = KIS.KernelInitCB; } if (KIS.KernelDeinitCB) { if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB) - indicatePessimisticFixpoint(); + llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt " + "assumptions."); KernelDeinitCB = KIS.KernelDeinitCB; } SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker; @@ -1032,8 +1091,8 @@ private: Args.clear(); Args.push_back(OutlinedFn->getArg(0)); Args.push_back(OutlinedFn->getArg(1)); - for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); - U < E; ++U) + for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E; + ++U) Args.push_back(CI->getArgOperand(U)); CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); @@ -1041,9 +1100,9 @@ private: NewCI->setDebugLoc(CI->getDebugLoc()); // Forward parameter attributes from the callback to the callee. - for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); - U < E; ++U) - for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) + for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E; + ++U) + for (const Attribute &A : CI->getAttributes().getParamAttrs(U)) NewCI->addParamAttr( U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); @@ -1563,13 +1622,13 @@ private: // TODO: Use dominance to find a good position instead. 
auto CanBeMoved = [this](CallBase &CB) { - unsigned NumArgs = CB.getNumArgOperands(); + unsigned NumArgs = CB.arg_size(); if (NumArgs == 0) return true; if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr) return false; - for (unsigned u = 1; u < NumArgs; ++u) - if (isa<Instruction>(CB.getArgOperand(u))) + for (unsigned U = 1; U < NumArgs; ++U) + if (isa<Instruction>(CB.getArgOperand(U))) return false; return true; }; @@ -1612,7 +1671,7 @@ private: // valid at the new location. For now we just pick a global one, either // existing and used by one of the calls, or created from scratch. if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) { - if (CI->getNumArgOperands() > 0 && + if (!CI->arg_empty() && CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) { Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F, /* GlobalOnly */ true); @@ -1695,8 +1754,8 @@ private: // Transitively search for more arguments by looking at the users of the // ones we know already. During the search the GTIdArgs vector is extended // so we cannot cache the size nor can we use a range based for. - for (unsigned u = 0; u < GTIdArgs.size(); ++u) - AddUserArgs(*GTIdArgs[u]); + for (unsigned U = 0; U < GTIdArgs.size(); ++U) + AddUserArgs(*GTIdArgs[U]); } /// Kernel (=GPU) optimizations and utility functions @@ -1822,6 +1881,10 @@ private: OMPRTL___kmpc_kernel_end_parallel); ExternalizationRAII BarrierSPMD(OMPInfoCache, OMPRTL___kmpc_barrier_simple_spmd); + ExternalizationRAII BarrierGeneric(OMPInfoCache, + OMPRTL___kmpc_barrier_simple_generic); + ExternalizationRAII ThreadId(OMPInfoCache, + OMPRTL___kmpc_get_hardware_thread_id_in_block); registerAAs(IsModulePass); @@ -1918,6 +1981,10 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { if (!KernelParallelRFI) return Changed; + // If we have disabled state machine changes, exit + if (DisableOpenMPOptStateMachineRewrite) + return Changed; + for (Function *F : SCC) { // Check if the function is a use in a __kmpc_parallel_51 call at @@ -2509,9 +2576,8 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; - // Check if the edge into the successor block compares the __kmpc_target_init - // result with -1. If we are in non-SPMD-mode that signals only the main - // thread will execute the edge. + // Check if the edge into the successor block contains a condition that only + // lets the main thread execute it. auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { if (!Edge || !Edge->isConditional()) return false; @@ -2526,16 +2592,27 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { if (!C) return false; - // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) + // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) if (C->isAllOnesValue()) { auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0)); CB = CB ? 
OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr; if (!CB) return false; - const int InitIsSPMDArgNo = 1; - auto *IsSPMDModeCI = - dyn_cast<ConstantInt>(CB->getOperand(InitIsSPMDArgNo)); - return IsSPMDModeCI && IsSPMDModeCI->isZero(); + const int InitModeArgNo = 1; + auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo)); + return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC); + } + + if (C->isZero()) { + // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x() + if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0))) + if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x) + return true; + + // Match: 0 == llvm.amdgcn.workitem.id.x() + if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0))) + if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x) + return true; } return false; @@ -2544,15 +2621,14 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // Merge all the predecessor states into the current basic block. A basic // block is executed by a single thread if all of its predecessors are. auto MergePredecessorStates = [&](BasicBlock *BB) { - if (pred_begin(BB) == pred_end(BB)) + if (pred_empty(BB)) return SingleThreadedBBs.contains(BB); bool IsInitialThread = true; - for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB); - PredBB != PredEndBB; ++PredBB) { - if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()), + for (BasicBlock *PredBB : predecessors(BB)) { + if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()), BB)) - IsInitialThread &= SingleThreadedBBs.contains(*PredBB); + IsInitialThread &= SingleThreadedBBs.contains(PredBB); } return IsInitialThread; @@ -2684,9 +2760,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared { ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0)); - LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in " - << CB->getCaller()->getName() << " with " - << AllocSize->getZExtValue() + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB + << " with " << AllocSize->getZExtValue() << " bytes of shared memory\n"); // Create a new shared memory buffer of the same size as the allocation @@ -2735,7 +2810,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { const auto &ED = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*F), DepClassTy::REQUIRED); if (CallBase *CB = dyn_cast<CallBase>(U)) - if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) || + if (!isa<ConstantInt>(CB->getArgOperand(0)) || !ED.isExecutedByInitialThreadOnly(*CB)) MallocCalls.erase(CB); } @@ -2770,9 +2845,17 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]" : "") + std::string(" #PRs: ") + - std::to_string(ReachedKnownParallelRegions.size()) + + (ReachedKnownParallelRegions.isValidState() + ? std::to_string(ReachedKnownParallelRegions.size()) + : "<invalid>") + ", #Unknown PRs: " + - std::to_string(ReachedUnknownParallelRegions.size()); + (ReachedUnknownParallelRegions.isValidState() + ? std::to_string(ReachedUnknownParallelRegions.size()) + : "<invalid>") + + ", #Reaching Kernels: " + + (ReachingKernelEntries.isValidState() + ? std::to_string(ReachingKernelEntries.size()) + : "<invalid>"); } /// Create an abstract attribute biew for the position \p IRP. 
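For reference, the matching above relies on the second __kmpc_target_init argument now being an 8-bit execution-mode bitmask rather than a boolean IsSPMD flag. A minimal standalone sketch of that scheme follows; the flag values are assumed to mirror the OMP_TGT_EXEC_MODE_* constants and are not quoted from OMPConstants.h.

  #include <cassert>
  #include <cstdint>

  // Assumed layout of the execution-mode flags carried by the kernel
  // init/deinit calls and the <kernel>_exec_mode global.
  enum ExecModeFlags : int8_t {
    EXEC_MODE_GENERIC      = 1 << 0,
    EXEC_MODE_SPMD         = 1 << 1,
    EXEC_MODE_GENERIC_SPMD = EXEC_MODE_GENERIC | EXEC_MODE_SPMD,
  };

  int main() {
    int8_t Mode = EXEC_MODE_GENERIC;     // kernel starts in generic mode
    assert(Mode & EXEC_MODE_GENERIC);    // the IsInitialThreadOnly-style test
    // SPMDization keeps the generic bit in the global flag (GENERIC_SPMD)
    // while passing plain SPMD to __kmpc_target_init/_deinit.
    int8_t GlobalFlag = int8_t(Mode | EXEC_MODE_GENERIC_SPMD);
    int8_t InitArg = EXEC_MODE_SPMD;
    assert((GlobalFlag & EXEC_MODE_SPMD) && (InitArg & EXEC_MODE_SPMD));
    return 0;
  }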
@@ -2798,6 +2881,12 @@ struct AAKernelInfoFunction : AAKernelInfo { AAKernelInfoFunction(const IRPosition &IRP, Attributor &A) : AAKernelInfo(IRP, A) {} + SmallPtrSet<Instruction *, 4> GuardedInstructions; + + SmallPtrSetImpl<Instruction *> &getGuardedInstructions() { + return GuardedInstructions; + } + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { // This is a high-level transform that might change the constant arguments @@ -2844,8 +2933,11 @@ struct AAKernelInfoFunction : AAKernelInfo { }, Fn); - assert((KernelInitCB && KernelDeinitCB) && - "Kernel without __kmpc_target_init or __kmpc_target_deinit!"); + // Ignore kernels without initializers such as global constructors. + if (!KernelInitCB || !KernelDeinitCB) { + indicateOptimisticFixpoint(); + return; + } // For kernels we might need to initialize/finalize the IsSPMD state and // we need to register a simplification callback so that the Attributor @@ -2860,7 +2952,10 @@ struct AAKernelInfoFunction : AAKernelInfo { // state. As long as we are not in an invalid state, we will create a // custom state machine so the value should be a `i1 false`. If we are // in an invalid state, we won't change the value that is in the IR. - if (!isValidState()) + if (!ReachedKnownParallelRegions.isValidState()) + return nullptr; + // If we have disabled state machine rewrites, don't make a custom one. + if (DisableOpenMPOptStateMachineRewrite) return nullptr; if (AA) A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); @@ -2870,7 +2965,7 @@ struct AAKernelInfoFunction : AAKernelInfo { return FalseVal; }; - Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB = + Attributor::SimplifictionCallbackTy ModeSimplifyCB = [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> Optional<Value *> { // IRP represents the "SPMDCompatibilityTracker" argument of an @@ -2886,8 +2981,10 @@ struct AAKernelInfoFunction : AAKernelInfo { } else { UsedAssumedInformation = false; } - auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(), - SPMDCompatibilityTracker.isAssumed()); + auto *Val = ConstantInt::getSigned( + IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()), + SPMDCompatibilityTracker.isAssumed() ? 
OMP_TGT_EXEC_MODE_SPMD + : OMP_TGT_EXEC_MODE_GENERIC); return Val; }; @@ -2912,8 +3009,8 @@ struct AAKernelInfoFunction : AAKernelInfo { return Val; }; - constexpr const int InitIsSPMDArgNo = 1; - constexpr const int DeinitIsSPMDArgNo = 1; + constexpr const int InitModeArgNo = 1; + constexpr const int DeinitModeArgNo = 1; constexpr const int InitUseStateMachineArgNo = 2; constexpr const int InitRequiresFullRuntimeArgNo = 3; constexpr const int DeinitRequiresFullRuntimeArgNo = 2; @@ -2921,11 +3018,11 @@ struct AAKernelInfoFunction : AAKernelInfo { IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), StateMachineSimplifyCB); A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo), - IsSPMDModeSimplifyCB); + IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo), + ModeSimplifyCB); A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo), - IsSPMDModeSimplifyCB); + IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo), + ModeSimplifyCB); A.registerSimplificationCallback( IRPosition::callsite_argument(*KernelInitCB, InitRequiresFullRuntimeArgNo), @@ -2936,10 +3033,25 @@ struct AAKernelInfoFunction : AAKernelInfo { IsGenericModeSimplifyCB); // Check if we know we are in SPMD-mode already. - ConstantInt *IsSPMDArg = - dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); - if (IsSPMDArg && !IsSPMDArg->isZero()) + ConstantInt *ModeArg = + dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo)); + if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) SPMDCompatibilityTracker.indicateOptimisticFixpoint(); + // This is a generic region but SPMDization is disabled so stop tracking. + else if (DisableOpenMPOptSPMDization) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + } + + /// Sanitize the string \p S such that it is a suitable global symbol name. + static std::string sanitizeForGlobalName(std::string S) { + std::replace_if( + S.begin(), S.end(), + [](const char C) { + return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') || + (C >= '0' && C <= '9') || C == '_'); + }, + '.'); + return S; } /// Modify the IR based on the KernelInfoState as the fixpoint iteration is @@ -2950,19 +3062,16 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!KernelInitCB || !KernelDeinitCB) return ChangeStatus::UNCHANGED; - // Known SPMD-mode kernels need no manifest changes. - if (SPMDCompatibilityTracker.isKnown()) - return ChangeStatus::UNCHANGED; - // If we can we change the execution mode to SPMD-mode otherwise we build a // custom state machine. - if (!changeToSPMDMode(A)) - buildCustomStateMachine(A); + ChangeStatus Changed = ChangeStatus::UNCHANGED; + if (!changeToSPMDMode(A, Changed)) + return buildCustomStateMachine(A); - return ChangeStatus::CHANGED; + return Changed; } - bool changeToSPMDMode(Attributor &A) { + bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); if (!SPMDCompatibilityTracker.isAssumed()) { @@ -2994,38 +3103,259 @@ struct AAKernelInfoFunction : AAKernelInfo { return false; } - // Adjust the global exec mode flag that tells the runtime what mode this - // kernel is executed in. + // Check if the kernel is already in SPMD mode, if so, return success. 
Function *Kernel = getAnchorScope(); GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( (Kernel->getName() + "_exec_mode").str()); assert(ExecMode && "Kernel without exec mode?"); - assert(ExecMode->getInitializer() && - ExecMode->getInitializer()->isOneValue() && - "Initially non-SPMD kernel has SPMD exec mode!"); + assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!"); // Set the global exec mode flag to indicate SPMD-Generic mode. - constexpr int SPMDGeneric = 2; - if (!ExecMode->getInitializer()->isZeroValue()) - ExecMode->setInitializer( - ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric)); + assert(isa<ConstantInt>(ExecMode->getInitializer()) && + "ExecMode is not an integer!"); + const int8_t ExecModeVal = + cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue(); + if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC) + return true; + + // We will now unconditionally modify the IR, indicate a change. + Changed = ChangeStatus::CHANGED; + + auto CreateGuardedRegion = [&](Instruction *RegionStartI, + Instruction *RegionEndI) { + LoopInfo *LI = nullptr; + DominatorTree *DT = nullptr; + MemorySSAUpdater *MSU = nullptr; + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + + BasicBlock *ParentBB = RegionStartI->getParent(); + Function *Fn = ParentBB->getParent(); + Module &M = *Fn->getParent(); + + // Create all the blocks and logic. + // ParentBB: + // goto RegionCheckTidBB + // RegionCheckTidBB: + // Tid = __kmpc_hardware_thread_id() + // if (Tid != 0) + // goto RegionBarrierBB + // RegionStartBB: + // <execute instructions guarded> + // goto RegionEndBB + // RegionEndBB: + // <store escaping values to shared mem> + // goto RegionBarrierBB + // RegionBarrierBB: + // __kmpc_simple_barrier_spmd() + // // second barrier is omitted if lacking escaping values. + // <load escaping values from shared mem> + // __kmpc_simple_barrier_spmd() + // goto RegionExitBB + // RegionExitBB: + // <execute rest of instructions> + + BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(), + DT, LI, MSU, "region.guarded.end"); + BasicBlock *RegionBarrierBB = + SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI, + MSU, "region.barrier"); + BasicBlock *RegionExitBB = + SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(), + DT, LI, MSU, "region.exit"); + BasicBlock *RegionStartBB = + SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded"); + + assert(ParentBB->getUniqueSuccessor() == RegionStartBB && + "Expected a different CFG"); + + BasicBlock *RegionCheckTidBB = SplitBlock( + ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid"); + + // Register basic blocks with the Attributor. + A.registerManifestAddedBasicBlock(*RegionEndBB); + A.registerManifestAddedBasicBlock(*RegionBarrierBB); + A.registerManifestAddedBasicBlock(*RegionExitBB); + A.registerManifestAddedBasicBlock(*RegionStartBB); + A.registerManifestAddedBasicBlock(*RegionCheckTidBB); + + bool HasBroadcastValues = false; + // Find escaping outputs from the guarded region to outside users and + // broadcast their values to them. 
+ for (Instruction &I : *RegionStartBB) { + SmallPtrSet<Instruction *, 4> OutsideUsers; + for (User *Usr : I.users()) { + Instruction &UsrI = *cast<Instruction>(Usr); + if (UsrI.getParent() != RegionStartBB) + OutsideUsers.insert(&UsrI); + } + + if (OutsideUsers.empty()) + continue; + + HasBroadcastValues = true; + + // Emit a global variable in shared memory to store the broadcasted + // value. + auto *SharedMem = new GlobalVariable( + M, I.getType(), /* IsConstant */ false, + GlobalValue::InternalLinkage, UndefValue::get(I.getType()), + sanitizeForGlobalName( + (I.getName() + ".guarded.output.alloc").str()), + nullptr, GlobalValue::NotThreadLocal, + static_cast<unsigned>(AddressSpace::Shared)); + + // Emit a store instruction to update the value. + new StoreInst(&I, SharedMem, RegionEndBB->getTerminator()); + + LoadInst *LoadI = new LoadInst(I.getType(), SharedMem, + I.getName() + ".guarded.output.load", + RegionBarrierBB->getTerminator()); + + // Emit a load instruction and replace uses of the output value. + for (Instruction *UsrI : OutsideUsers) + UsrI->replaceUsesOfWith(&I, LoadI); + } + + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + + // Go to tid check BB in ParentBB. + const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); + ParentBB->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(ParentBB, ParentBB->end()), DL); + OMPInfoCache.OMPBuilder.updateToLocation(Loc); + auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc); + Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr); + BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL); + + // Add check for Tid in RegionCheckTidBB + RegionCheckTidBB->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription LocRegionCheckTid( + InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL); + OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid); + FunctionCallee HardwareTidFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_hardware_thread_id_in_block); + Value *Tid = + OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {}); + Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid); + OMPInfoCache.OMPBuilder.Builder + .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB) + ->setDebugLoc(DL); + + // First barrier for synchronization, ensures main thread has updated + // values. + FunctionCallee BarrierFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_barrier_simple_spmd); + OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy( + RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt())); + OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}) + ->setDebugLoc(DL); + + // Second barrier ensures workers have read broadcast values. 
+ if (HasBroadcastValues) + CallInst::Create(BarrierFn, {Ident, Tid}, "", + RegionBarrierBB->getTerminator()) + ->setDebugLoc(DL); + }; + + auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + SmallPtrSet<BasicBlock *, 8> Visited; + for (Instruction *GuardedI : SPMDCompatibilityTracker) { + BasicBlock *BB = GuardedI->getParent(); + if (!Visited.insert(BB).second) + continue; + + SmallVector<std::pair<Instruction *, Instruction *>> Reorders; + Instruction *LastEffect = nullptr; + BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend(); + while (++IP != IPEnd) { + if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory()) + continue; + Instruction *I = &*IP; + if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI)) + continue; + if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) { + LastEffect = nullptr; + continue; + } + if (LastEffect) + Reorders.push_back({I, LastEffect}); + LastEffect = &*IP; + } + for (auto &Reorder : Reorders) + Reorder.first->moveBefore(Reorder.second); + } + + SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions; + + for (Instruction *GuardedI : SPMDCompatibilityTracker) { + BasicBlock *BB = GuardedI->getParent(); + auto *CalleeAA = A.lookupAAFor<AAKernelInfo>( + IRPosition::function(*GuardedI->getFunction()), nullptr, + DepClassTy::NONE); + assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo"); + auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA); + // Continue if instruction is already guarded. + if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI)) + continue; + + Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr; + for (Instruction &I : *BB) { + // If instruction I needs to be guarded update the guarded region + // bounds. + if (SPMDCompatibilityTracker.contains(&I)) { + CalleeAAFunction.getGuardedInstructions().insert(&I); + if (GuardedRegionStart) + GuardedRegionEnd = &I; + else + GuardedRegionStart = GuardedRegionEnd = &I; + + continue; + } + + // Instruction I does not need guarding, store + // any region found and reset bounds. + if (GuardedRegionStart) { + GuardedRegions.push_back( + std::make_pair(GuardedRegionStart, GuardedRegionEnd)); + GuardedRegionStart = nullptr; + GuardedRegionEnd = nullptr; + } + } + } + + for (auto &GR : GuardedRegions) + CreateGuardedRegion(GR.first, GR.second); + + // Adjust the global exec mode flag that tells the runtime what mode this + // kernel is executed in. + assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && + "Initially non-SPMD kernel has SPMD exec mode!"); + ExecMode->setInitializer( + ConstantInt::get(ExecMode->getInitializer()->getType(), + ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD)); // Next rewrite the init and deinit calls to indicate we use SPMD-mode now. 
- const int InitIsSPMDArgNo = 1; - const int DeinitIsSPMDArgNo = 1; + const int InitModeArgNo = 1; + const int DeinitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; const int InitRequiresFullRuntimeArgNo = 3; const int DeinitRequiresFullRuntimeArgNo = 2; auto &Ctx = getAnchorValue().getContext(); - A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo), - *ConstantInt::getBool(Ctx, 1)); + A.changeUseAfterManifest( + KernelInitCB->getArgOperandUse(InitModeArgNo), + *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), + OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *ConstantInt::getBool(Ctx, 0)); A.changeUseAfterManifest( - KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), - *ConstantInt::getBool(Ctx, 1)); + KernelDeinitCB->getArgOperandUse(DeinitModeArgNo), + *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), + OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo), *ConstantInt::getBool(Ctx, 0)); @@ -3043,10 +3373,15 @@ struct AAKernelInfoFunction : AAKernelInfo { }; ChangeStatus buildCustomStateMachine(Attributor &A) { - assert(ReachedKnownParallelRegions.isValidState() && - "Custom state machine with invalid parallel region states?"); + // If we have disabled state machine rewrites, don't make a custom one + if (DisableOpenMPOptStateMachineRewrite) + return ChangeStatus::UNCHANGED; + + // Don't rewrite the state machine if we are not in a valid state. + if (!ReachedKnownParallelRegions.isValidState()) + return ChangeStatus::UNCHANGED; - const int InitIsSPMDArgNo = 1; + const int InitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; // Check if the current configuration is non-SPMD and generic state machine. @@ -3055,14 +3390,14 @@ struct AAKernelInfoFunction : AAKernelInfo { // we give up. ConstantInt *UseStateMachine = dyn_cast<ConstantInt>( KernelInitCB->getArgOperand(InitUseStateMachineArgNo)); - ConstantInt *IsSPMD = - dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); + ConstantInt *Mode = + dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo)); // If we are stuck with generic mode, try to create a custom device (=GPU) // state machine which is specialized for the parallel regions that are // reachable by the kernel. - if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD || - !IsSPMD->isZero()) + if (!UseStateMachine || UseStateMachine->isZero() || !Mode || + (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) return ChangeStatus::UNCHANGED; // If not SPMD mode, indicate we use a custom state machine now. @@ -3075,8 +3410,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // happen if there simply are no parallel regions. In the resulting kernel // all worker threads will simply exit right away, leaving the main thread // to do the work alone. - if (ReachedKnownParallelRegions.empty() && - ReachedUnknownParallelRegions.empty()) { + if (!mayContainParallelRegion()) { ++NumOpenMPTargetRegionKernelsWithoutStateMachine; auto Remark = [&](OptimizationRemark OR) { @@ -3122,9 +3456,14 @@ struct AAKernelInfoFunction : AAKernelInfo { // Create all the blocks: // // InitCB = __kmpc_target_init(...) 
- // bool IsWorker = InitCB >= 0; + // BlockHwSize = + // __kmpc_get_hardware_num_threads_in_block(); + // WarpSize = __kmpc_get_warp_size(); + // BlockSize = BlockHwSize - WarpSize; + // if (InitCB >= BlockSize) return; + // IsWorkerCheckBB: bool IsWorker = InitCB >= 0; // if (IsWorker) { - // SMBeginBB: __kmpc_barrier_simple_spmd(...); + // SMBeginBB: __kmpc_barrier_simple_generic(...); // void *WorkFn; // bool Active = __kmpc_kernel_parallel(&WorkFn); // if (!WorkFn) return; @@ -3138,7 +3477,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // ((WorkFnTy*)WorkFn)(...); // SMEndParallelBB: __kmpc_kernel_end_parallel(...); // } - // SMDoneBB: __kmpc_barrier_simple_spmd(...); + // SMDoneBB: __kmpc_barrier_simple_generic(...); // goto SMBeginBB; // } // UserCodeEntryBB: // user code @@ -3150,6 +3489,8 @@ struct AAKernelInfoFunction : AAKernelInfo { BasicBlock *InitBB = KernelInitCB->getParent(); BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock( KernelInitCB->getNextNode(), "thread.user_code.check"); + BasicBlock *IsWorkerCheckBB = + BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB); BasicBlock *StateMachineBeginBB = BasicBlock::Create( Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB); BasicBlock *StateMachineFinishedBB = BasicBlock::Create( @@ -3166,6 +3507,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB); A.registerManifestAddedBasicBlock(*InitBB); A.registerManifestAddedBasicBlock(*UserCodeEntryBB); + A.registerManifestAddedBasicBlock(*IsWorkerCheckBB); A.registerManifestAddedBasicBlock(*StateMachineBeginBB); A.registerManifestAddedBasicBlock(*StateMachineFinishedBB); A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB); @@ -3175,16 +3517,38 @@ struct AAKernelInfoFunction : AAKernelInfo { const DebugLoc &DLoc = KernelInitCB->getDebugLoc(); ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); - InitBB->getTerminator()->eraseFromParent(); + + Module &M = *Kernel->getParent(); + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + FunctionCallee BlockHwSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_hardware_num_threads_in_block); + FunctionCallee WarpSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_warp_size); + Instruction *BlockHwSize = + CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + BlockHwSize->setDebugLoc(DLoc); + Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + WarpSize->setDebugLoc(DLoc); + Instruction *BlockSize = + BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); + BlockSize->setDebugLoc(DLoc); + Instruction *IsMainOrWorker = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, + BlockSize, "thread.is_main_or_worker", InitBB); + IsMainOrWorker->setDebugLoc(DLoc); + BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker, + InitBB); + Instruction *IsWorker = ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, ConstantInt::get(KernelInitCB->getType(), -1), - "thread.is_worker", InitBB); + "thread.is_worker", IsWorkerCheckBB); IsWorker->setDebugLoc(DLoc); - BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); - - Module &M = *Kernel->getParent(); + BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, + IsWorkerCheckBB); // Create local storage for the work function pointer. 
const DataLayout &DL = M.getDataLayout(); @@ -3194,7 +3558,6 @@ struct AAKernelInfoFunction : AAKernelInfo { "worker.work_fn.addr", &Kernel->getEntryBlock().front()); WorkFnAI->setDebugLoc(DLoc); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); OMPInfoCache.OMPBuilder.updateToLocation( OpenMPIRBuilder::LocationDescription( IRBuilder<>::InsertPoint(StateMachineBeginBB, @@ -3206,7 +3569,7 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee BarrierFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___kmpc_barrier_simple_spmd); + M, OMPRTL___kmpc_barrier_simple_generic); CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB) ->setDebugLoc(DLoc); @@ -3258,8 +3621,8 @@ struct AAKernelInfoFunction : AAKernelInfo { // Now that we have most of the CFG skeleton it is time for the if-cascade // that checks the function pointer we got from the runtime against the // parallel regions we expect, if there are any. - for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) { - auto *ParallelRegion = ReachedKnownParallelRegions[i]; + for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) { + auto *ParallelRegion = ReachedKnownParallelRegions[I]; BasicBlock *PRExecuteBB = BasicBlock::Create( Ctx, "worker_state_machine.parallel_region.execute", Kernel, StateMachineEndParallelBB); @@ -3275,7 +3638,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // Check if we need to compare the pointer at all or if we can just // call the parallel region function. Value *IsPR; - if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) { + if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) { Instruction *CmpI = ICmpInst::Create( ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion, "worker.check_parallel_region", StateMachineIfCascadeCurrentBB); @@ -3339,8 +3702,21 @@ struct AAKernelInfoFunction : AAKernelInfo { if (llvm::all_of(Objects, [](const Value *Obj) { return isa<AllocaInst>(Obj); })) return true; + // Check for AAHeapToStack moved objects which must not be guarded. + auto &HS = A.getAAFor<AAHeapToStack>( + *this, IRPosition::function(*I.getFunction()), + DepClassTy::OPTIONAL); + if (llvm::all_of(Objects, [&HS](const Value *Obj) { + auto *CB = dyn_cast<CallBase>(Obj); + if (!CB) + return false; + return HS.isAssumedHeapToStack(*CB); + })) { + return true; + } } - // For now we give up on everything but stores. + + // Insert instruction that needs guarding. SPMDCompatibilityTracker.insert(&I); return true; }; @@ -3354,9 +3730,13 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!IsKernelEntry) { updateReachingKernelEntries(A); updateParallelLevels(A); + + if (!ParallelLevels.isValidState()) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); } // Callback to check a call instruction. 
+ bool AllParallelRegionStatesWereFixed = true; bool AllSPMDStatesWereFixed = true; auto CheckCallInst = [&](Instruction &I) { auto &CB = cast<CallBase>(I); @@ -3364,13 +3744,37 @@ struct AAKernelInfoFunction : AAKernelInfo { *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); getState() ^= CBAA.getState(); AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint(); + AllParallelRegionStatesWereFixed &= + CBAA.ReachedKnownParallelRegions.isAtFixpoint(); + AllParallelRegionStatesWereFixed &= + CBAA.ReachedUnknownParallelRegions.isAtFixpoint(); return true; }; bool UsedAssumedInformationInCheckCallInst = false; if (!A.checkForAllCallLikeInstructions( - CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) + CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) { + LLVM_DEBUG(dbgs() << TAG + << "Failed to visit all call-like instructions!\n";); return indicatePessimisticFixpoint(); + } + + // If we haven't used any assumed information for the reached parallel + // region states we can fix it. + if (!UsedAssumedInformationInCheckCallInst && + AllParallelRegionStatesWereFixed) { + ReachedKnownParallelRegions.indicateOptimisticFixpoint(); + ReachedUnknownParallelRegions.indicateOptimisticFixpoint(); + } + + // If we are sure there are no parallel regions in the kernel we do not + // want SPMD mode. + if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() && + ReachedKnownParallelRegions.isAtFixpoint() && + ReachedUnknownParallelRegions.isValidState() && + ReachedKnownParallelRegions.isValidState() && + !mayContainParallelRegion()) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); // If we haven't used any assumed information for the SPMD state we can fix // it. @@ -3469,14 +3873,14 @@ struct AAKernelInfoCallSite : AAKernelInfo { CallBase &CB = cast<CallBase>(getAssociatedValue()); Function *Callee = getAssociatedFunction(); - // Helper to lookup an assumption string. - auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) { - return Fn && hasAssumption(*Fn, AssumptionStr); - }; + auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>( + *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); // Check for SPMD-mode assumptions. - if (HasAssumption(Callee, "ompx_spmd_amenable")) + if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) { SPMDCompatibilityTracker.indicateOptimisticFixpoint(); + indicateOptimisticFixpoint(); + } // First weed out calls we do not care about, that is readonly/readnone // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a @@ -3498,14 +3902,16 @@ struct AAKernelInfoCallSite : AAKernelInfo { // Unknown callees might contain parallel regions, except if they have // an appropriate assumption attached. - if (!(HasAssumption(Callee, "omp_no_openmp") || - HasAssumption(Callee, "omp_no_parallelism"))) + if (!(AssumptionAA.hasAssumption("omp_no_openmp") || + AssumptionAA.hasAssumption("omp_no_parallelism"))) ReachedUnknownParallelRegions.insert(&CB); // If SPMDCompatibilityTracker is not fixed, we need to give up on the // idea we can run something unknown in SPMD-mode. - if (!SPMDCompatibilityTracker.isAtFixpoint()) + if (!SPMDCompatibilityTracker.isAtFixpoint()) { + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); + } // We have updated the state for this unknown call properly, there won't // be any change so we indicate a fixpoint. 
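The update logic above only freezes the parallel-region and SPMD states once every visited call site is itself at a fixpoint and no assumed information was consumed. A small standalone sketch of that propagation pattern, in plain C++ rather than the Attributor API:

  #include <vector>

  struct SimpleState {
    bool Valid = true;       // false == pessimistic fixpoint
    bool AtFixpoint = false; // true == state can no longer change
  };

  // Merge callee states into a caller state; only indicate an optimistic
  // fixpoint when all inputs were fixed and nothing was merely assumed.
  SimpleState mergeCallSites(const std::vector<SimpleState> &Callees,
                             bool UsedAssumedInformation) {
    SimpleState S;
    bool AllFixed = true;
    for (const SimpleState &C : Callees) {
      S.Valid &= C.Valid;       // conjunction, loosely like getState() ^= ...
      AllFixed &= C.AtFixpoint;
    }
    if (AllFixed && !UsedAssumedInformation)
      S.AtFixpoint = true;
    return S;
  }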
@@ -3521,6 +3927,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { switch (RF) { // All the functions we know are compatible with SPMD mode. case OMPRTL___kmpc_is_spmd_exec_mode: + case OMPRTL___kmpc_distribute_static_fini: case OMPRTL___kmpc_for_static_fini: case OMPRTL___kmpc_global_thread_num: case OMPRTL___kmpc_get_hardware_num_threads_in_block: @@ -3531,6 +3938,10 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_end_master: case OMPRTL___kmpc_barrier: break; + case OMPRTL___kmpc_distribute_static_init_4: + case OMPRTL___kmpc_distribute_static_init_4u: + case OMPRTL___kmpc_distribute_static_init_8: + case OMPRTL___kmpc_distribute_static_init_8u: case OMPRTL___kmpc_for_static_init_4: case OMPRTL___kmpc_for_static_init_4u: case OMPRTL___kmpc_for_static_init_8: @@ -3548,6 +3959,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPScheduleType::DistributeChunked: break; default: + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); break; }; @@ -3580,7 +3992,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { return; default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, - // generally. + // generally. However, they do not hide parallel regions. SPMDCompatibilityTracker.insert(&CB); break; } @@ -3700,6 +4112,9 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { } void initialize(Attributor &A) override { + if (DisableOpenMPOptFolding) + indicatePessimisticFixpoint(); + Function *Callee = getAssociatedFunction(); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); @@ -3756,11 +4171,24 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { ChangeStatus Changed = ChangeStatus::UNCHANGED; if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) { - Instruction &CB = *getCtxI(); - A.changeValueAfterManifest(CB, **SimplifiedValue); - A.deleteAfterManifest(CB); + Instruction &I = *getCtxI(); + A.changeValueAfterManifest(I, **SimplifiedValue); + A.deleteAfterManifest(I); - LLVM_DEBUG(dbgs() << TAG << "Folding runtime call: " << CB << " with " + CallBase *CB = dyn_cast<CallBase>(&I); + auto Remark = [&](OptimizationRemark OR) { + if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue)) + return OR << "Replacing OpenMP runtime call " + << CB->getCalledFunction()->getName() << " with " + << ore::NV("FoldedValue", C->getZExtValue()) << "."; + return OR << "Replacing OpenMP runtime call " + << CB->getCalledFunction()->getName() << "."; + }; + + if (CB && EnableVerboseRemarks) + A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark); + + LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with " << **SimplifiedValue << "\n"); Changed = ChangeStatus::CHANGED; @@ -3994,7 +4422,6 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { DepClassTy::NONE, /* ForceUpdate */ false, /* UpdateAfterInit */ false); - registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id); registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level); @@ -4027,7 +4454,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F)); return false; }; - GlobalizationRFI.foreachUse(SCC, CreateAA); + if (!DisableOpenMPOptDeglobalization) + GlobalizationRFI.foreachUse(SCC, CreateAA); // Create an ExecutionDomain AA for every function and a HeapToStack AA for // every function if there is a device kernel. 
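The Disable* options introduced by this patch all follow the same opt-out pattern: a hidden boolean cl::opt checked before the corresponding abstract attributes are created or a rewrite is attempted. A minimal sketch of that pattern; the flag name below is hypothetical, the real flags are e.g. -openmp-opt-disable-deglobalization.

  #include "llvm/Support/CommandLine.h"
  using namespace llvm;

  // Hypothetical flag; cl::Hidden keeps it out of plain -help output.
  static cl::opt<bool> DisableMyRewrite(
      "mypass-disable-rewrite", cl::Hidden, cl::init(false),
      cl::desc("Disable the rewrite for debugging or triage."));

  static bool runRewrite() {
    if (DisableMyRewrite)
      return false; // opt-out, like the deglobalization/SPMDization gates above
    // ... perform the rewrite ...
    return true;
  }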
@@ -4039,7 +4467,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { continue; A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F)); - A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F)); + if (!DisableOpenMPOptDeglobalization) + A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F)); for (auto &I : instructions(*F)) { if (auto *LI = dyn_cast<LoadInst>(&I)) { @@ -4234,12 +4663,24 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { SetVector<Function *> Functions(SCC.begin(), SCC.end()); OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels); - unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; + unsigned MaxFixpointIterations = + (isOpenMPDevice(M)) ? SetFixpointIterations : 32; Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false, MaxFixpointIterations, OREGetter, DEBUG_TYPE); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(true); + + // Optionally inline device functions for potentially better performance. + if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M)) + for (Function &F : M) + if (!F.isDeclaration() && !Kernels.contains(&F) && + !F.hasFnAttribute(Attribute::NoInline)) + F.addFnAttr(Attribute::AlwaysInline); + + if (PrintModuleAfterOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M); + if (Changed) return PreservedAnalyses::none(); @@ -4286,12 +4727,17 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, /*CGSCC*/ Functions, Kernels); - unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; + unsigned MaxFixpointIterations = + (isOpenMPDevice(M)) ? SetFixpointIterations : 32; Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, MaxFixpointIterations, OREGetter, DEBUG_TYPE); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(false); + + if (PrintModuleAfterOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M); + if (Changed) return PreservedAnalyses::none(); @@ -4352,12 +4798,18 @@ struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass { Allocator, /*CGSCC*/ Functions, Kernels); - unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; + unsigned MaxFixpointIterations = + (isOpenMPDevice(M)) ? SetFixpointIterations : 32; Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, MaxFixpointIterations, OREGetter, DEBUG_TYPE); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); - return OMPOpt.run(false); + bool Result = OMPOpt.run(false); + + if (PrintModuleAfterOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M); + + return Result; } bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index d517de38ace3..7402e399a88a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -441,9 +441,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo( }; auto BBProfileCount = [BFI](BasicBlock *BB) { - return BFI->getBlockProfileCount(BB) - ? 
BFI->getBlockProfileCount(BB).getValue() - : 0; + return BFI->getBlockProfileCount(BB).getValueOr(0); }; // Use the same computeBBInlineCost function to compute the cost savings of @@ -1413,7 +1411,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); uint64_t CalleeEntryCountV = - (CalleeEntryCount ? CalleeEntryCount.getCount() : 0); + (CalleeEntryCount ? CalleeEntryCount->getCount() : 0); bool AnyInline = false; for (User *User : Users) { @@ -1461,8 +1459,8 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { if (AnyInline) { Cloner.IsFunctionInlined = true; if (CalleeEntryCount) - Cloner.OrigFunc->setEntryCount( - CalleeEntryCount.setCount(CalleeEntryCountV)); + Cloner.OrigFunc->setEntryCount(Function::ProfileCount( + CalleeEntryCountV, CalleeEntryCount->getType())); OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); OrigFuncORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index aa916345954d..74f68531b89a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -437,6 +437,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions + // The matrix extension can introduce large vector operations early, which can + // benefit from running vector-combine early on. + if (EnableMatrix) + MPM.add(createVectorCombinePass()); + // Begin the loop pass pipeline. if (EnableSimpleLoopUnswitch) { // The simple loop unswitch pass relies on separate cleanup passes. Schedule @@ -1012,7 +1017,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty())); // Propage constant function arguments by specializing the functions. - if (EnableFunctionSpecialization) + if (EnableFunctionSpecialization && OptLevel > 2) PM.add(createFunctionSpecializationPass()); // Propagate constants at call sites into the functions they call. 
This diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp index 081398a390fa..5779553ee732 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp @@ -135,6 +135,7 @@ PreservedAnalyses FunctionSpecializationPass::run(Module &M, return PA; } +namespace { struct FunctionSpecializationLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid FunctionSpecializationLegacyPass() : ModulePass(ID) {} @@ -175,6 +176,7 @@ struct FunctionSpecializationLegacyPass : public ModulePass { return runFunctionSpecialization(M, DL, GetTLI, GetTTI, GetAC, GetAnalysis); } }; +} // namespace char FunctionSpecializationLegacyPass::ID = 0; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index 55b88ac14da5..bae9a1e27e75 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -32,7 +32,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, if (CalleeName.empty()) return getHottestChildContext(CallSite); - uint32_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = nodeHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) return &It->second; @@ -64,8 +64,8 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { ContextTrieNode &ContextTrieNode::moveToChildContext( const LineLocation &CallSite, ContextTrieNode &&NodeToMove, - StringRef ContextStrToRemove, bool DeleteNode) { - uint32_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite); + uint32_t ContextFramesToRemove, bool DeleteNode) { + uint64_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite); assert(!AllChildContext.count(Hash) && "Node to remove must exist"); LineLocation OldCallSite = NodeToMove.CallSiteLoc; ContextTrieNode &OldParentContext = *NodeToMove.getParentContext(); @@ -86,10 +86,10 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( FunctionSamples *FSamples = Node->getFunctionSamples(); if (FSamples) { - FSamples->getContext().promoteOnPath(ContextStrToRemove); + FSamples->getContext().promoteOnPath(ContextFramesToRemove); FSamples->getContext().setState(SyntheticContext); - LLVM_DEBUG(dbgs() << " Context promoted to: " << FSamples->getContext() - << "\n"); + LLVM_DEBUG(dbgs() << " Context promoted to: " + << FSamples->getContext().toString() << "\n"); } for (auto &It : Node->getAllChildContext()) { @@ -108,12 +108,12 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( void ContextTrieNode::removeChildContext(const LineLocation &CallSite, StringRef CalleeName) { - uint32_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = nodeHash(CalleeName, CallSite); // Note this essentially calls dtor and destroys that child context AllChildContext.erase(Hash); } -std::map<uint32_t, ContextTrieNode> &ContextTrieNode::getAllChildContext() { +std::map<uint64_t, ContextTrieNode> &ContextTrieNode::getAllChildContext() { return AllChildContext; } @@ -127,6 +127,15 @@ void ContextTrieNode::setFunctionSamples(FunctionSamples *FSamples) { FuncSamples = FSamples; } +Optional<uint32_t> ContextTrieNode::getFunctionSize() const { return FuncSize; } + +void ContextTrieNode::addFunctionSize(uint32_t FSize) { + if (!FuncSize.hasValue()) + FuncSize = 0; + + FuncSize = 
FuncSize.getValue() + FSize; +} + LineLocation ContextTrieNode::getCallSiteLoc() const { return CallSiteLoc; } ContextTrieNode *ContextTrieNode::getParentContext() const { @@ -137,9 +146,10 @@ void ContextTrieNode::setParentContext(ContextTrieNode *Parent) { ParentContext = Parent; } -void ContextTrieNode::dump() { +void ContextTrieNode::dumpNode() { dbgs() << "Node: " << FuncName << "\n" << " Callsite: " << CallSiteLoc << "\n" + << " Size: " << FuncSize << "\n" << " Children:\n"; for (auto &It : AllChildContext) { @@ -147,20 +157,38 @@ void ContextTrieNode::dump() { } } -uint32_t ContextTrieNode::nodeHash(StringRef ChildName, +void ContextTrieNode::dumpTree() { + dbgs() << "Context Profile Tree:\n"; + std::queue<ContextTrieNode *> NodeQueue; + NodeQueue.push(this); + + while (!NodeQueue.empty()) { + ContextTrieNode *Node = NodeQueue.front(); + NodeQueue.pop(); + Node->dumpNode(); + + for (auto &It : Node->getAllChildContext()) { + ContextTrieNode *ChildNode = &It.second; + NodeQueue.push(ChildNode); + } + } +} + +uint64_t ContextTrieNode::nodeHash(StringRef ChildName, const LineLocation &Callsite) { // We still use child's name for child hash, this is // because for children of root node, we don't have // different line/discriminator, and we'll rely on name // to differentiate children. - uint32_t NameHash = std::hash<std::string>{}(ChildName.str()); - uint32_t LocId = (Callsite.LineOffset << 16) | Callsite.Discriminator; + uint64_t NameHash = std::hash<std::string>{}(ChildName.str()); + uint64_t LocId = + (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator; return NameHash + (LocId << 5) + LocId; } ContextTrieNode *ContextTrieNode::getOrCreateChildContext( const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) { - uint32_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = nodeHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) { assert(It->second.getFuncName() == CalleeName && @@ -177,13 +205,16 @@ ContextTrieNode *ContextTrieNode::getOrCreateChildContext( // Profiler tracker than manages profiles and its associated context SampleContextTracker::SampleContextTracker( - StringMap<FunctionSamples> &Profiles) { + SampleProfileMap &Profiles, + const DenseMap<uint64_t, StringRef> *GUIDToFuncNameMap) + : GUIDToFuncNameMap(GUIDToFuncNameMap) { for (auto &FuncSample : Profiles) { FunctionSamples *FSamples = &FuncSample.second; - SampleContext Context(FuncSample.first(), RawContext); - LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context << "\n"); + SampleContext Context = FuncSample.first; + LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context.toString() + << "\n"); if (!Context.isBaseContext()) - FuncToCtxtProfiles[Context.getNameWithoutContext()].push_back(FSamples); + FuncToCtxtProfiles[Context.getName()].insert(FSamples); ContextTrieNode *NewNode = getOrCreateContextPath(Context, true); assert(!NewNode->getFunctionSamples() && "New node can't have sample profile"); @@ -200,6 +231,10 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, return nullptr; CalleeName = FunctionSamples::getCanonicalFnName(CalleeName); + // Convert real function names to MD5 names, if the input profile is + // MD5-based. + std::string FGUID; + CalleeName = getRepInFormat(CalleeName, FunctionSamples::UseMD5, FGUID); // For indirect call, CalleeName will be empty, in which case the context // profile for callee with largest total samples will be returned. 
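Note on the hunk above: ContextTrieNode::nodeHash is widened from 32 to 64 bits so the callsite's line offset and discriminator occupy disjoint 32-bit halves instead of sharing one 32-bit word with a 16-bit shift, which removes a class of hash collisions between distinct callsites. A standalone sketch of that combining scheme follows; the struct and function below are simplified stand-ins for illustration, not the exact LLVM declarations.

// Standalone sketch of the widened hash combining shown in the hunk above.
#include <cstdint>
#include <functional>
#include <string>

struct LineLocation {
  uint32_t LineOffset;
  uint32_t Discriminator;
};

uint64_t nodeHash(const std::string &ChildName, const LineLocation &Callsite) {
  // The child name still participates: children of the root node all sit at
  // line 0 / discriminator 0, so only the name can tell them apart.
  uint64_t NameHash = std::hash<std::string>{}(ChildName);
  // Line offset and discriminator now occupy disjoint 32-bit halves, so two
  // different callsites can no longer collide inside LocId.
  uint64_t LocId = (static_cast<uint64_t>(Callsite.LineOffset) << 32) |
                   Callsite.Discriminator;
  return NameHash + (LocId << 5) + LocId;
}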
@@ -207,7 +242,8 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, if (CalleeContext) { FunctionSamples *FSamples = CalleeContext->getFunctionSamples(); LLVM_DEBUG(if (FSamples) { - dbgs() << " Callee context found: " << FSamples->getContext() << "\n"; + dbgs() << " Callee context found: " << FSamples->getContext().toString() + << "\n"; }); return FSamples; } @@ -285,6 +321,11 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func, FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name, bool MergeContext) { LLVM_DEBUG(dbgs() << "Getting base profile for function: " << Name << "\n"); + // Convert real function names to MD5 names, if the input profile is + // MD5-based. + std::string FGUID; + Name = getRepInFormat(Name, FunctionSamples::UseMD5, FGUID); + // Base profile is top-level node (child of root node), so try to retrieve // existing top-level node for given function first. If it exists, it could be // that we've merged base profile before, or there's actually context-less @@ -299,14 +340,14 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name, // into base profile. for (auto *CSamples : FuncToCtxtProfiles[Name]) { SampleContext &Context = CSamples->getContext(); - ContextTrieNode *FromNode = getContextFor(Context); - if (FromNode == Node) - continue; - // Skip inlined context profile and also don't re-merge any context if (Context.hasState(InlinedContext) || Context.hasState(MergedContext)) continue; + ContextTrieNode *FromNode = getContextFor(Context); + if (FromNode == Node) + continue; + ContextTrieNode &ToNode = promoteMergeContextSamplesTree(*FromNode); assert((!Node || Node == &ToNode) && "Expect only one base profile"); Node = &ToNode; @@ -324,7 +365,7 @@ void SampleContextTracker::markContextSamplesInlined( const FunctionSamples *InlinedSamples) { assert(InlinedSamples && "Expect non-null inlined samples"); LLVM_DEBUG(dbgs() << "Marking context profile as inlined: " - << InlinedSamples->getContext() << "\n"); + << InlinedSamples->getContext().toString() << "\n"); InlinedSamples->getContext().setState(InlinedContext); } @@ -376,30 +417,23 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( FunctionSamples *FromSamples = NodeToPromo.getFunctionSamples(); assert(FromSamples && "Shouldn't promote a context without profile"); LLVM_DEBUG(dbgs() << " Found context tree root to promote: " - << FromSamples->getContext() << "\n"); + << FromSamples->getContext().toString() << "\n"); assert(!FromSamples->getContext().hasState(InlinedContext) && "Shouldn't promote inlined context profile"); - StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext(); + uint32_t ContextFramesToRemove = + FromSamples->getContext().getContextFrames().size() - 1; return promoteMergeContextSamplesTree(NodeToPromo, RootContext, - ContextStrToRemove); + ContextFramesToRemove); } -void SampleContextTracker::dump() { - dbgs() << "Context Profile Tree:\n"; - std::queue<ContextTrieNode *> NodeQueue; - NodeQueue.push(&RootContext); - - while (!NodeQueue.empty()) { - ContextTrieNode *Node = NodeQueue.front(); - NodeQueue.pop(); - Node->dump(); +void SampleContextTracker::dump() { RootContext.dumpTree(); } - for (auto &It : Node->getAllChildContext()) { - ContextTrieNode *ChildNode = &It.second; - NodeQueue.push(ChildNode); - } - } +StringRef SampleContextTracker::getFuncNameFor(ContextTrieNode *Node) const { + if (!FunctionSamples::UseMD5) + return Node->getFuncName(); + 
assert(GUIDToFuncNameMap && "GUIDToFuncNameMap needs to be populated first"); + return GUIDToFuncNameMap->lookup(std::stoull(Node->getFuncName().data())); } ContextTrieNode * @@ -444,11 +478,22 @@ ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) { RootName = PrevDIL->getScope()->getSubprogram()->getName(); S.push_back(std::make_pair(LineLocation(0, 0), RootName)); + // Convert real function names to MD5 names, if the input profile is + // MD5-based. + std::list<std::string> MD5Names; + if (FunctionSamples::UseMD5) { + for (auto &Location : S) { + MD5Names.emplace_back(); + getRepInFormat(Location.second, FunctionSamples::UseMD5, MD5Names.back()); + Location.second = MD5Names.back(); + } + } + ContextTrieNode *ContextNode = &RootContext; int I = S.size(); while (--I >= 0 && ContextNode) { LineLocation &CallSite = S[I].first; - StringRef &CalleeName = S[I].second; + StringRef CalleeName = S[I].second; ContextNode = ContextNode->getChildContext(CallSite, CalleeName); } @@ -462,27 +507,18 @@ ContextTrieNode * SampleContextTracker::getOrCreateContextPath(const SampleContext &Context, bool AllowCreate) { ContextTrieNode *ContextNode = &RootContext; - StringRef ContextRemain = Context; - StringRef ChildContext; - StringRef CalleeName; LineLocation CallSiteLoc(0, 0); - while (ContextNode && !ContextRemain.empty()) { - auto ContextSplit = SampleContext::splitContextString(ContextRemain); - ChildContext = ContextSplit.first; - ContextRemain = ContextSplit.second; - LineLocation NextCallSiteLoc(0, 0); - SampleContext::decodeContextString(ChildContext, CalleeName, - NextCallSiteLoc); - + for (auto &Callsite : Context.getContextFrames()) { // Create child node at parent line/disc location if (AllowCreate) { ContextNode = - ContextNode->getOrCreateChildContext(CallSiteLoc, CalleeName); + ContextNode->getOrCreateChildContext(CallSiteLoc, Callsite.FuncName); } else { - ContextNode = ContextNode->getChildContext(CallSiteLoc, CalleeName); + ContextNode = + ContextNode->getChildContext(CallSiteLoc, Callsite.FuncName); } - CallSiteLoc = NextCallSiteLoc; + CallSiteLoc = Callsite.Location; } assert((!AllowCreate || ContextNode) && @@ -502,7 +538,7 @@ ContextTrieNode &SampleContextTracker::addTopLevelContextNode(StringRef FName) { void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode, - StringRef ContextStrToRemove) { + uint32_t ContextFramesToRemove) { FunctionSamples *FromSamples = FromNode.getFunctionSamples(); FunctionSamples *ToSamples = ToNode.getFunctionSamples(); if (FromSamples && ToSamples) { @@ -510,19 +546,21 @@ void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, ToSamples->merge(*FromSamples); ToSamples->getContext().setState(SyntheticContext); FromSamples->getContext().setState(MergedContext); + if (FromSamples->getContext().hasAttribute(ContextShouldBeInlined)) + ToSamples->getContext().setAttribute(ContextShouldBeInlined); } else if (FromSamples) { // Transfer FromSamples from FromNode to ToNode ToNode.setFunctionSamples(FromSamples); FromSamples->getContext().setState(SyntheticContext); - FromSamples->getContext().promoteOnPath(ContextStrToRemove); + FromSamples->getContext().promoteOnPath(ContextFramesToRemove); FromNode.setFunctionSamples(nullptr); } } ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent, - StringRef ContextStrToRemove) { - assert(!ContextStrToRemove.empty() && "Context to remove can't be empty"); + uint32_t 
ContextFramesToRemove) { + assert(ContextFramesToRemove && "Context to remove can't be empty"); // Ignore call site location if destination is top level under root LineLocation NewCallSiteLoc = LineLocation(0, 0); @@ -540,21 +578,21 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( // Do not delete node to move from its parent here because // caller is iterating over children of that parent node. ToNode = &ToNodeParent.moveToChildContext( - NewCallSiteLoc, std::move(FromNode), ContextStrToRemove, false); + NewCallSiteLoc, std::move(FromNode), ContextFramesToRemove, false); } else { // Destination node exists, merge samples for the context tree - mergeContextNode(FromNode, *ToNode, ContextStrToRemove); + mergeContextNode(FromNode, *ToNode, ContextFramesToRemove); LLVM_DEBUG({ if (ToNode->getFunctionSamples()) dbgs() << " Context promoted and merged to: " - << ToNode->getFunctionSamples()->getContext() << "\n"; + << ToNode->getFunctionSamples()->getContext().toString() << "\n"; }); // Recursively promote and merge children for (auto &It : FromNode.getAllChildContext()) { ContextTrieNode &FromChildNode = It.second; promoteMergeContextSamplesTree(FromChildNode, *ToNode, - ContextStrToRemove); + ContextFramesToRemove); } // Remove children once they're all merged diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index 8e9c79fc7bbb..a961c47a7501 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -143,6 +143,12 @@ static cl::opt<bool> ProfileSampleAccurate( "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. ")); +static cl::opt<bool> ProfileSampleBlockAccurate( + "profile-sample-block-accurate", cl::Hidden, cl::init(false), + cl::desc("If the sample profile is accurate, we will mark all un-sampled " + "branches and calls as having 0 samples. Otherwise, treat " + "them conservatively as unknown. ")); + static cl::opt<bool> ProfileAccurateForSymsInList( "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore, cl::init(true), @@ -214,6 +220,16 @@ static cl::opt<bool> CallsitePrioritizedInline( cl::desc("Use call site prioritized inlining for sample profile loader." 
"Currently only CSSPGO is supported.")); +static cl::opt<bool> UsePreInlinerDecision( + "sample-profile-use-preinliner", cl::Hidden, cl::ZeroOrMore, + cl::init(false), + cl::desc("Use the preinliner decisions stored in profile context.")); + +static cl::opt<bool> AllowRecursiveInline( + "sample-profile-recursive-inline", cl::Hidden, cl::ZeroOrMore, + cl::init(false), + cl::desc("Allow sample loader inliner to inline recursive calls.")); + static cl::opt<std::string> ProfileInlineReplayFile( "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc( @@ -221,6 +237,50 @@ static cl::opt<std::string> ProfileInlineReplayFile( "by inlining from sample profile loader."), cl::Hidden); +static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope( + "sample-profile-inline-replay-scope", + cl::init(ReplayInlinerSettings::Scope::Function), + cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", + "Replay on functions that have remarks associated " + "with them (default)"), + clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", + "Replay on the entire module")), + cl::desc("Whether inline replay should be applied to the entire " + "Module or just the Functions (default) that are present as " + "callers in remarks during sample profile inlining."), + cl::Hidden); + +static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback( + "sample-profile-inline-replay-fallback", + cl::init(ReplayInlinerSettings::Fallback::Original), + cl::values( + clEnumValN( + ReplayInlinerSettings::Fallback::Original, "Original", + "All decisions not in replay send to original advisor (default)"), + clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, + "AlwaysInline", "All decisions not in replay are inlined"), + clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", + "All decisions not in replay are not inlined")), + cl::desc("How sample profile inline replay treats sites that don't come " + "from the replay. 
Original: defers to original advisor, " + "AlwaysInline: inline all sites not in replay, NeverInline: " + "inline no sites not in replay"), + cl::Hidden); + +static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat( + "sample-profile-inline-replay-format", + cl::init(CallSiteFormat::Format::LineColumnDiscriminator), + cl::values( + clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), + clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", + "<Line Number>:<Column Number>"), + clEnumValN(CallSiteFormat::Format::LineDiscriminator, + "LineDiscriminator", "<Line Number>.<Discriminator>"), + clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, + "LineColumnDiscriminator", + "<Line Number>:<Column Number>.<Discriminator> (default)")), + cl::desc("How sample profile inline replay file is formatted"), cl::Hidden); + static cl::opt<unsigned> MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::ZeroOrMore, @@ -358,10 +418,10 @@ public: std::function<AssumptionCache &(Function &)> GetAssumptionCache, std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo, std::function<const TargetLibraryInfo &(Function &)> GetTLI) - : SampleProfileLoaderBaseImpl(std::string(Name)), + : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)), GetAC(std::move(GetAssumptionCache)), GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), - RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {} + LTOPhase(LTOPhase) {} bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); bool runOnModule(Module &M, ModuleAnalysisManager *AM, @@ -377,7 +437,7 @@ protected: findFunctionSamples(const Instruction &I) const override; std::vector<const FunctionSamples *> findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; - void findExternalInlineCandidate(const FunctionSamples *Samples, + void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples, DenseSet<GlobalValue::GUID> &InlinedGUIDs, const StringMap<Function *> &SymbolMap, uint64_t Threshold); @@ -385,8 +445,11 @@ protected: bool tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); + bool inlineHotFunctions(Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs); + Optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB); + bool getExternalInlineAdvisorShouldInline(CallBase &CB); InlineCost shouldInlineCandidate(InlineCandidate &Candidate); bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); bool @@ -417,9 +480,6 @@ protected: /// Profile tracker for different context. std::unique_ptr<SampleContextTracker> ContextTracker; - /// Name of the profile remapping file to load. - std::string RemappingFilename; - /// Flag indicating whether input profile is context-sensitive bool ProfileIsCS = false; @@ -464,7 +524,7 @@ protected: bool ProfAccForSymsInList; // External inline advisor used to replay inline decision from remarks. - std::unique_ptr<ReplayInlineAdvisor> ExternalInlineAdvisor; + std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor; // A pseudo probe helper to correlate the imported sample counts. 
std::unique_ptr<PseudoProbeManager> ProbeManager; @@ -953,8 +1013,24 @@ void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( } void SampleProfileLoader::findExternalInlineCandidate( - const FunctionSamples *Samples, DenseSet<GlobalValue::GUID> &InlinedGUIDs, + CallBase *CB, const FunctionSamples *Samples, + DenseSet<GlobalValue::GUID> &InlinedGUIDs, const StringMap<Function *> &SymbolMap, uint64_t Threshold) { + + // If ExternalInlineAdvisor wants to inline an external function + // make sure it's imported + if (CB && getExternalInlineAdvisorShouldInline(*CB)) { + // Samples may not exist for replayed function, if so + // just add the direct GUID and move on + if (!Samples) { + InlinedGUIDs.insert( + FunctionSamples::getGUID(CB->getCalledFunction()->getName())); + return; + } + // Otherwise, drop the threshold to import everything that we can + Threshold = 0; + } + assert(Samples && "expect non-null caller profile"); // For AutoFDO profile, retrieve candidate profiles by walking over @@ -975,14 +1051,21 @@ void SampleProfileLoader::findExternalInlineCandidate( // For CSSPGO profile, retrieve candidate profile by walking over the // trie built for context profile. Note that also take call targets // even if callee doesn't have a corresponding context profile. - if (!CalleeSample || CalleeSample->getEntrySamples() < Threshold) + if (!CalleeSample) + continue; + + // If pre-inliner decision is used, honor that for importing as well. + bool PreInline = + UsePreInlinerDecision && + CalleeSample->getContext().hasAttribute(ContextShouldBeInlined); + if (!PreInline && CalleeSample->getEntrySamples() < Threshold) continue; StringRef Name = CalleeSample->getFuncName(); Function *Func = SymbolMap.lookup(Name); // Add to the import list only when it's defined out of module. if (!Func || Func->isDeclaration()) - InlinedGUIDs.insert(FunctionSamples::getGUID(Name)); + InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeSample->getName())); // Import hot CallTargets, which may not be available in IR because full // profile annotation cannot be done until backend compilation in ThinLTO. @@ -992,7 +1075,7 @@ void SampleProfileLoader::findExternalInlineCandidate( StringRef CalleeName = CalleeSample->getFuncName(TS.getKey()); const Function *Callee = SymbolMap.lookup(CalleeName); if (!Callee || Callee->isDeclaration()) - InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeName)); + InlinedGUIDs.insert(FunctionSamples::getGUID(TS.getKey())); } // Import hot child context profile associted with callees. 
Note that this @@ -1042,16 +1125,20 @@ bool SampleProfileLoader::inlineHotFunctions( for (auto &I : BB.getInstList()) { const FunctionSamples *FS = nullptr; if (auto *CB = dyn_cast<CallBase>(&I)) { - if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) { - assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && - "GUIDToFuncNameMap has to be populated"); - AllCandidates.push_back(CB); - if (FS->getEntrySamples() > 0 || ProfileIsCS) - LocalNotInlinedCallSites.try_emplace(CB, FS); - if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) - Hot = true; - else if (shouldInlineColdCallee(*CB)) - ColdCandidates.push_back(CB); + if (!isa<IntrinsicInst>(I)) { + if ((FS = findCalleeFunctionSamples(*CB))) { + assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && + "GUIDToFuncNameMap has to be populated"); + AllCandidates.push_back(CB); + if (FS->getEntrySamples() > 0 || ProfileIsCS) + LocalNotInlinedCallSites.try_emplace(CB, FS); + if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) + Hot = true; + else if (shouldInlineColdCallee(*CB)) + ColdCandidates.push_back(CB); + } else if (getExternalInlineAdvisorShouldInline(*CB)) { + AllCandidates.push_back(CB); + } } } } @@ -1078,7 +1165,7 @@ bool SampleProfileLoader::inlineHotFunctions( for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { uint64_t SumOrigin = Sum; if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap, + findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap, PSI->getOrCompHotCountThreshold()); continue; } @@ -1098,8 +1185,8 @@ bool SampleProfileLoader::inlineHotFunctions( LocalChanged = true; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(findCalleeFunctionSamples(*I), InlinedGUIDs, - SymbolMap, + findExternalInlineCandidate(I, findCalleeFunctionSamples(*I), + InlinedGUIDs, SymbolMap, PSI->getOrCompHotCountThreshold()); } } @@ -1184,8 +1271,8 @@ bool SampleProfileLoader::tryInlineCandidate( *CalledFunction); // The call to InlineFunction erases I, so we can't pass it here. - emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, - true, CSINLINE_DEBUG); + emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, + *BB->getParent(), Cost, true, CSINLINE_DEBUG); // Now populate the list of newly exposed call sites. if (InlinedCallSites) { @@ -1228,7 +1315,9 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, // Find the callee's profile. For indirect call, find hottest target profile. const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB); - if (!CalleeSamples) + // If ExternalInlineAdvisor wants to inline this site, do so even + // if Samples are not present. 
+ if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB)) return false; float Factor = 1.0; @@ -1247,19 +1336,34 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, return true; } -InlineCost -SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { +Optional<InlineCost> +SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { std::unique_ptr<InlineAdvice> Advice = nullptr; if (ExternalInlineAdvisor) { - Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr); - if (!Advice->isInliningRecommended()) { - Advice->recordUnattemptedInlining(); - return InlineCost::getNever("not previously inlined"); + Advice = ExternalInlineAdvisor->getAdvice(CB); + if (Advice) { + if (!Advice->isInliningRecommended()) { + Advice->recordUnattemptedInlining(); + return InlineCost::getNever("not previously inlined"); + } + Advice->recordInlining(); + return InlineCost::getAlways("previously inlined"); } - Advice->recordInlining(); - return InlineCost::getAlways("previously inlined"); } + return {}; +} + +bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { + Optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); + return Cost ? !!Cost.getValue() : false; +} + +InlineCost +SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { + if (Optional<InlineCost> ReplayCost = + getExternalInlineAdvisorCost(*Candidate.CallInstr)) + return ReplayCost.getValue(); // Adjust threshold based on call site hotness, only do this for callsite // prioritized inliner because otherwise cost-benefit check is done earlier. int SampleThreshold = SampleColdCallSiteThreshold; @@ -1274,7 +1378,9 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { assert(Callee && "Expect a definition for inline candidate of direct call"); InlineParams Params = getInlineParams(); + // We will ignore the threshold from inline cost, so always get full cost. Params.ComputeFullInlineCost = true; + Params.AllowRecursiveCall = AllowRecursiveInline; // Checks if there is anything in the reachable portion of the callee at // this callsite that makes this inlining potentially illegal. Need to // set ComputeFullInlineCost, otherwise getInlineCost may return early @@ -1288,6 +1394,25 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { if (Cost.isNever() || Cost.isAlways()) return Cost; + // With CSSPGO, the preinliner in llvm-profgen can estimate global inline + // decisions based on hotness as well as accurate function byte sizes for + // given context using function/inlinee sizes from previous build. It + // stores the decision in profile, and also adjust/merge context profile + // aiming at better context-sensitive post-inline profile quality, assuming + // all inline decision estimates are going to be honored by compiler. Here + // we replay that inline decision under `sample-profile-use-preinliner`. + // Note that we don't need to handle negative decision from preinliner as + // context profile for not inlined calls are merged by preinliner already. + if (UsePreInlinerDecision && Candidate.CalleeSamples) { + // Once two node are merged due to promotion, we're losing some context + // so the original context-sensitive preinliner decision should be ignored + // for SyntheticContext. 
+ SampleContext &Context = Candidate.CalleeSamples->getContext(); + if (!Context.hasState(SyntheticContext) && + Context.hasAttribute(ContextShouldBeInlined)) + return InlineCost::getAlways("preinliner"); + } + // For old FDO inliner, we inline the call site as long as cost is not // "Never". The cost-benefit check is done earlier. if (!CallsitePrioritizedInline) { @@ -1357,7 +1482,7 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( for (const auto *FS : CalleeSamples) { // TODO: Consider disable pre-lTO ICP for MonoLTO as well if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap, + findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap, PSI->getOrCompHotCountThreshold()); continue; } @@ -1405,8 +1530,9 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( Changed = true; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findExternalInlineCandidate(Candidate.CalleeSamples, InlinedGUIDs, - SymbolMap, PSI->getOrCompHotCountThreshold()); + findExternalInlineCandidate(I, findCalleeFunctionSamples(*I), + InlinedGUIDs, SymbolMap, + PSI->getOrCompHotCountThreshold()); } } @@ -1494,7 +1620,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { {static_cast<uint32_t>(BlockWeights[BB])})); } } - } else if (OverwriteExistingWeights) { + } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { // Set profile metadata (possibly annotated by LTO prelink) to zero or // clear it for cold code. for (auto &I : BB->getInstList()) { @@ -1792,11 +1918,13 @@ bool SampleProfileLoader::doInitialization(Module &M, } if (FAM && !ProfileInlineReplayFile.empty()) { - ExternalInlineAdvisor = std::make_unique<ReplayInlineAdvisor>( - M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, ProfileInlineReplayFile, + ExternalInlineAdvisor = getReplayInlineAdvisor( + M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, + ReplayInlinerSettings{ProfileInlineReplayFile, + ProfileInlineReplayScope, + ProfileInlineReplayFallback, + {ProfileInlineReplayFormat}}, /*EmitRemarks=*/false); - if (!ExternalInlineAdvisor->areReplayRemarksLoaded()) - ExternalInlineAdvisor.reset(); } // Apply tweaks if context-sensitive profile is available. @@ -1810,13 +1938,21 @@ bool SampleProfileLoader::doInitialization(Module &M, if (!CallsitePrioritizedInline.getNumOccurrences()) CallsitePrioritizedInline = true; + // For CSSPGO, use preinliner decision by default when available. + if (!UsePreInlinerDecision.getNumOccurrences()) + UsePreInlinerDecision = true; + + // For CSSPGO, we also allow recursive inline to best use context profile. + if (!AllowRecursiveInline.getNumOccurrences()) + AllowRecursiveInline = true; + // Enable iterative-BFI by default for CSSPGO. if (!UseIterativeBFIInference.getNumOccurrences()) UseIterativeBFIInference = true; // Tracker for profiles under different context - ContextTracker = - std::make_unique<SampleContextTracker>(Reader->getProfiles()); + ContextTracker = std::make_unique<SampleContextTracker>( + Reader->getProfiles(), &GUIDToFuncNameMap); } // Load pseudo probe descriptors for probe-based function samples. 
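Note on the doInitialization hunk above: for CSSPGO input the pass flips several option defaults (UsePreInlinerDecision, AllowRecursiveInline, UseIterativeBFIInference) only when cl::opt::getNumOccurrences() shows the user never passed the flag, so explicit command-line values still win. A minimal sketch of that pattern follows, assuming LLVM headers are available; the flag name and helper function are hypothetical, used only to illustrate the idiom.

// Minimal sketch of "flip a default only when the user did not set the flag".
// ExampleFlag and applyContextSensitiveDefaults are illustrative names.
#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> ExampleFlag(
    "example-flag", llvm::cl::init(false),
    llvm::cl::desc("Hypothetical flag illustrating the default-override pattern."));

void applyContextSensitiveDefaults(bool ProfileIsCS) {
  if (!ProfileIsCS)
    return;
  // getNumOccurrences() is zero only when the flag was never spelled on the
  // command line, so an explicit -example-flag=0 still takes precedence.
  if (!ExampleFlag.getNumOccurrences())
    ExampleFlag = true;
}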
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index 08d316337ef5..21395460bccb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -415,9 +415,7 @@ void PseudoProbeUpdatePass::runOnFunction(Function &F, FunctionAnalysisManager &FAM) { BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); auto BBProfileCount = [&BFI](BasicBlock *BB) { - return BFI.getBlockProfileCount(BB) - ? BFI.getBlockProfileCount(BB).getValue() - : 0; + return BFI.getBlockProfileCount(BB).getValueOr(0); }; // Collect the sum of execution weight for each probe. diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 655a7a404951..0f2412dce1c9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -30,23 +30,20 @@ static bool stripDeadPrototypes(Module &M) { bool MadeChange = false; // Erase dead function prototypes. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - Function *F = &*I++; + for (Function &F : llvm::make_early_inc_range(M)) { // Function must be a prototype and unused. - if (F->isDeclaration() && F->use_empty()) { - F->eraseFromParent(); + if (F.isDeclaration() && F.use_empty()) { + F.eraseFromParent(); ++NumDeadPrototypes; MadeChange = true; } } // Erase dead global var prototypes. - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ) { - GlobalVariable *GV = &*I++; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { // Global must be a prototype and unused. - if (GV->isDeclaration() && GV->use_empty()) - GV->eraseFromParent(); + if (GV.isDeclaration() && GV.use_empty()) + GV.eraseFromParent(); } // Return an indication of whether we changed anything or not. 
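Note on the StripDeadPrototypes.cpp change above: the manual iterator-bumping loops are replaced with llvm::make_early_inc_range, which advances the iterator before the loop body runs so the current element can be erased without invalidating the traversal. A small self-contained sketch of the same idiom follows, assuming LLVM headers are available; the helper name is illustrative, not an LLVM API.

// Self-contained sketch of erase-while-iterating with make_early_inc_range.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Module.h"

static bool eraseUnusedDeclarations(llvm::Module &M) {
  bool Changed = false;
  // make_early_inc_range advances past F before the body runs, so erasing F
  // here does not invalidate the loop.
  for (llvm::Function &F : llvm::make_early_inc_range(M)) {
    if (F.isDeclaration() && F.use_empty()) {
      F.eraseFromParent();
      Changed = true;
    }
  }
  return Changed;
}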
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp index 168740a1158e..9d4e9464f361 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -214,13 +214,13 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) { findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues); for (GlobalVariable &GV : M.globals()) { - if (GV.hasLocalLinkage() && llvmUsedValues.count(&GV) == 0) + if (GV.hasLocalLinkage() && !llvmUsedValues.contains(&GV)) if (!PreserveDbgInfo || !GV.getName().startswith("llvm.dbg")) GV.setName(""); // Internal symbols can't participate in linkage } for (Function &I : M) { - if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0) + if (I.hasLocalLinkage() && !llvmUsedValues.contains(&I)) if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg")) I.setName(""); // Internal symbols can't participate in linkage if (auto *Symtab = I.getValueSymbolTable()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index eea848d3eb2f..0cc1b37844f6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -164,8 +164,7 @@ void simplifyExternals(Module &M) { FunctionType *EmptyFT = FunctionType::get(Type::getVoidTy(M.getContext()), false); - for (auto I = M.begin(), E = M.end(); I != E;) { - Function &F = *I++; + for (Function &F : llvm::make_early_inc_range(M)) { if (F.isDeclaration() && F.use_empty()) { F.eraseFromParent(); continue; @@ -181,16 +180,15 @@ void simplifyExternals(Module &M) { F.getAddressSpace(), "", &M); NewF->copyAttributesFrom(&F); // Only copy function attribtues. - NewF->setAttributes( - AttributeList::get(M.getContext(), AttributeList::FunctionIndex, - F.getAttributes().getFnAttributes())); + NewF->setAttributes(AttributeList::get(M.getContext(), + AttributeList::FunctionIndex, + F.getAttributes().getFnAttrs())); NewF->takeName(&F); F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType())); F.eraseFromParent(); } - for (auto I = M.global_begin(), E = M.global_end(); I != E;) { - GlobalVariable &GV = *I++; + for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { if (GV.isDeclaration() && GV.use_empty()) { GV.eraseFromParent(); continue; @@ -325,7 +323,8 @@ void splitAndWriteThinLTOBitcode( return true; if (auto *F = dyn_cast<Function>(GV)) return EligibleVirtualFns.count(F); - if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + if (auto *GVar = + dyn_cast_or_null<GlobalVariable>(GV->getAliaseeObject())) return HasTypeMetadata(GVar); return false; })); @@ -354,7 +353,7 @@ void splitAndWriteThinLTOBitcode( // Remove all globals with type metadata, globals with comdats that live in // MergedM, and aliases pointing to such globals from the thin LTO module. 
filterModule(&M, [&](const GlobalValue *GV) { - if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getAliaseeObject())) if (HasTypeMetadata(GVar)) return false; if (const auto *C = GV->getComdat()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 7a8946110785..61054e7ae46f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1288,7 +1288,7 @@ void DevirtModule::tryICallBranchFunnel( M.getDataLayout().getProgramAddressSpace(), "branch_funnel", &M); } - JT->addAttribute(1, Attribute::Nest); + JT->addParamAttr(0, Attribute::Nest); std::vector<Value *> JTArgs; JTArgs.push_back(JT->arg_begin()); @@ -1361,10 +1361,10 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, M.getContext(), ArrayRef<Attribute>{Attribute::get( M.getContext(), Attribute::Nest)})); for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I) - NewArgAttrs.push_back(Attrs.getParamAttributes(I)); + NewArgAttrs.push_back(Attrs.getParamAttrs(I)); NewCS->setAttributes( - AttributeList::get(M.getContext(), Attrs.getFnAttributes(), - Attrs.getRetAttributes(), NewArgAttrs)); + AttributeList::get(M.getContext(), Attrs.getFnAttrs(), + Attrs.getRetAttrs(), NewArgAttrs)); CB.replaceAllUsesWith(NewCS); CB.eraseFromParent(); @@ -1786,10 +1786,8 @@ void DevirtModule::scanTypeTestUsers( // points to a member of the type identifier %md. Group calls by (type ID, // offset) pair (effectively the identity of the virtual function) and store // to CallSlots. - for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end(); - I != E;) { - auto CI = dyn_cast<CallInst>(I->getUser()); - ++I; + for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) { + auto *CI = dyn_cast<CallInst>(U.getUser()); if (!CI) continue; @@ -1858,11 +1856,8 @@ void DevirtModule::scanTypeTestUsers( void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test); - for (auto I = TypeCheckedLoadFunc->use_begin(), - E = TypeCheckedLoadFunc->use_end(); - I != E;) { - auto CI = dyn_cast<CallInst>(I->getUser()); - ++I; + for (Use &U : llvm::make_early_inc_range(TypeCheckedLoadFunc->uses())) { + auto *CI = dyn_cast<CallInst>(U.getUser()); if (!CI) continue; |