author     Dimitry Andric <dim@FreeBSD.org>  2017-12-20 14:16:56 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2017-12-20 14:16:56 +0000
commit     2cab237b5dbfe1b3e9c7aa7a3c02d2b98fcf7462 (patch)
tree       524fe828571f81358bba62fdb6d04c6e5e96a2a4 /contrib/llvm/lib/Analysis/InlineCost.cpp
parent     6c7828a2807ea5e50c79ca42dbedf2b589ce63b2 (diff)
parent     044eb2f6afba375a914ac9d8024f8f5142bb912e (diff)
Merge llvm trunk r321017 to contrib/llvm.
Notes:
svn path=/projects/clang600-import/; revision=327023
Diffstat (limited to 'contrib/llvm/lib/Analysis/InlineCost.cpp')
-rw-r--r--  contrib/llvm/lib/Analysis/InlineCost.cpp  671
1 file changed, 548 insertions, 123 deletions
diff --git a/contrib/llvm/lib/Analysis/InlineCost.cpp b/contrib/llvm/lib/Analysis/InlineCost.cpp
index 35693666aa03..fba96c8976a6 100644
--- a/contrib/llvm/lib/Analysis/InlineCost.cpp
+++ b/contrib/llvm/lib/Analysis/InlineCost.cpp
@@ -21,9 +21,11 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
@@ -66,12 +68,27 @@ static cl::opt<int>
     cl::ZeroOrMore,
     cl::desc("Threshold for hot callsites "));

+static cl::opt<int> LocallyHotCallSiteThreshold(
+    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
+    cl::desc("Threshold for locally hot callsites "));
+
 static cl::opt<int> ColdCallSiteRelFreq(
     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
     cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
              "entry frequency, for a callsite to be cold in the absence of "
              "profile information."));

+static cl::opt<int> HotCallSiteRelFreq(
+    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
+    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
+             "entry frequency, for a callsite to be hot in the absence of "
+             "profile information."));
+
+static cl::opt<bool> OptComputeFullInlineCost(
+    "inline-cost-full", cl::Hidden, cl::init(false),
+    cl::desc("Compute the full inline cost of a call site even when the cost "
+             "exceeds the threshold."));
+
 namespace {

 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -96,6 +113,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   // Cache the DataLayout since we use it a lot.
   const DataLayout &DL;

+  /// The OptimizationRemarkEmitter available for this compilation.
+  OptimizationRemarkEmitter *ORE;
+
   /// The candidate callsite being analyzed. Please do not use this to do
   /// analysis in the caller function; we want the inline cost query to be
   /// easily cacheable. Instead, use the cover function paramHasAttr.
@@ -106,6 +126,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   int Threshold;
   int Cost;
+  bool ComputeFullInlineCost;

   bool IsCallerRecursive;
   bool IsRecursiveCall;
@@ -119,8 +140,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
   unsigned NumInstructions, NumVectorInstructions;
-  int FiftyPercentVectorBonus, TenPercentVectorBonus;
-  int VectorBonus;
+  int VectorBonus, TenPercentVectorBonus;
+  // Bonus to be applied when the callee has only one reachable basic block.
+  int SingleBBBonus;

   /// While we walk the potentially-inlined instructions, we build up and
   /// maintain a mapping of simplified values specific to this callsite. The
@@ -143,15 +165,32 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

+  /// Keep track of dead blocks due to the constant arguments.
+  SetVector<BasicBlock *> DeadBlocks;
+
+  /// The mapping of the blocks to their known unique successors due to the
+  /// constant arguments.
+  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
+
+  /// Model the elimination of repeated loads that is expected to happen
+  /// whenever we simplify away the stores that would otherwise cause them to
+  /// be loads.
+  bool EnableLoadElimination;
+  SmallPtrSet<Value *, 16> LoadAddrSet;
+  int LoadEliminationCost;
+
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                             DenseMap<Value *, int>::iterator &CostIt);
   void disableSROA(DenseMap<Value *, int>::iterator CostIt);
   void disableSROA(Value *V);
+  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
+  void disableLoadElimination();
   bool isGEPFree(GetElementPtrInst &GEP);
+  bool canFoldInboundsGEP(GetElementPtrInst &I);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
   template <typename Callable>
@@ -181,6 +220,10 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// Return true if \p CS is a cold callsite.
   bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

+  /// Return a higher threshold if \p CS is a hot callsite.
+  Optional<int> getHotCallSiteThreshold(CallSite CS,
+                                        BlockFrequencyInfo *CallerBFI);
+
   // Custom analysis routines.
   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
@@ -206,6 +249,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitCastInst(CastInst &I);
   bool visitUnaryInstruction(UnaryInstruction &I);
   bool visitCmpInst(CmpInst &I);
+  bool visitAnd(BinaryOperator &I);
+  bool visitOr(BinaryOperator &I);
   bool visitSub(BinaryOperator &I);
   bool visitBinaryOperator(BinaryOperator &I);
   bool visitLoad(LoadInst &I);
@@ -215,6 +260,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitCallSite(CallSite CS);
   bool visitReturnInst(ReturnInst &RI);
   bool visitBranchInst(BranchInst &BI);
+  bool visitSelectInst(SelectInst &SI);
   bool visitSwitchInst(SwitchInst &SI);
   bool visitIndirectBrInst(IndirectBrInst &IBI);
   bool visitResumeInst(ResumeInst &RI);
@@ -226,17 +272,19 @@ public:
   CallAnalyzer(const TargetTransformInfo &TTI,
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
-               ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
-               const InlineParams &Params)
+               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
+               Function &Callee, CallSite CSArg, const InlineParams &Params)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
-        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()),
+        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
-        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
+        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
+                                       Params.ComputeFullInlineCost || ORE),
+        IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
         HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
+        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
+        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
         NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
         NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
         SROACostSavings(0), SROACostSavingsLost(0) {}
@@ -294,6 +342,7 @@ void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
   SROACostSavings -= CostIt->second;
   SROACostSavingsLost += CostIt->second;
   SROAArgCosts.erase(CostIt);
+  disableLoadElimination();
 }

 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
@@ -311,6 +360,13 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
   SROACostSavings += InstructionCost;
 }

+void CallAnalyzer::disableLoadElimination() {
+  if (EnableLoadElimination) {
+    Cost += LoadEliminationCost;
+    EnableLoadElimination = false;
+  }
+}
+
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
 ///
 /// Returns false if unable to compute the offset for any reason. Respects any
@@ -348,15 +404,14 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
 ///
 /// Respects any simplified values known during the analysis of this callsite.
 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
-  SmallVector<Value *, 4> Indices;
+  SmallVector<Value *, 4> Operands;
+  Operands.push_back(GEP.getOperand(0));
   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
-       Indices.push_back(SimpleOp);
+       Operands.push_back(SimpleOp);
     else
-       Indices.push_back(*I);
-  return TargetTransformInfo::TCC_Free ==
-         TTI.getGEPCost(GEP.getSourceElementType(), GEP.getPointerOperand(),
-                        Indices);
+       Operands.push_back(*I);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
 }

 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
@@ -391,52 +446,125 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
 }

 bool CallAnalyzer::visitPHI(PHINode &I) {
-  // FIXME: We should potentially be tracking values through phi nodes,
-  // especially when they collapse to a single value due to deleted CFG edges
-  // during inlining.
-
   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
   // though we don't want to propagate it's bonuses. The idea is to disable
   // SROA if it *might* be used in an inappropriate manner.

   // Phi nodes are always zero-cost.
-  return true;
-}
+  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits());
+  bool CheckSROA = I.getType()->isPointerTy();

-bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
-  Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
-  bool SROACandidate =
-      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
+  // Track the constant or pointer with constant offset we've seen so far.
+  Constant *FirstC = nullptr;
+  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
+  Value *FirstV = nullptr;

-  // Try to fold GEPs of constant-offset call site argument pointers. This
-  // requires target data and inbounds GEPs.
-  if (I.isInBounds()) {
-    // Check if we have a base + offset for the pointer.
-    Value *Ptr = I.getPointerOperand();
-    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
-    if (BaseAndOffset.first) {
-      // Check if the offset of this GEP is constant, and if so accumulate it
-      // into Offset.
-      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
-        // Non-constant GEPs aren't folded, and disable SROA.
-        if (SROACandidate)
-          disableSROA(CostIt);
-        return isGEPFree(I);
-      }
+  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = I.getIncomingBlock(i);
+    // If the incoming block is dead, skip the incoming block.
+    if (DeadBlocks.count(Pred))
+      continue;
+    // If the parent block of phi is not the known successor of the incoming
+    // block, skip the incoming block.
+    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
+    if (KnownSuccessor && KnownSuccessor != I.getParent())
+      continue;
+
+    Value *V = I.getIncomingValue(i);
+    // If the incoming value is this phi itself, skip the incoming value.
+    if (&I == V)
+      continue;
+
+    Constant *C = dyn_cast<Constant>(V);
+    if (!C)
+      C = SimplifiedValues.lookup(V);

-      // Add the result as a new mapping to Base + Offset.
-      ConstantOffsetPtrs[&I] = BaseAndOffset;
+    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
+    if (!C && CheckSROA)
+      BaseAndOffset = ConstantOffsetPtrs.lookup(V);
+
+    if (!C && !BaseAndOffset.first)
+      // The incoming value is neither a constant nor a pointer with constant
+      // offset, exit early.
+      return true;
+
+    if (FirstC) {
+      if (FirstC == C)
+        // If we've seen a constant incoming value before and it is the same
+        // constant we see this time, continue checking the next incoming
+        // value.
+        continue;
+      // Otherwise early exit because we either see a different constant or
+      // saw a constant before but we have a pointer with constant offset
+      // this time.
+      return true;
+    }
+
+    if (FirstV) {
+      // The same logic as above, but check pointer with constant offset here.
+      if (FirstBaseAndOffset == BaseAndOffset)
+        continue;
+      return true;
+    }

-      // Also handle SROA candidates here, we already know that the GEP is
-      // all-constant indexed.
-      if (SROACandidate)
-        SROAArgValues[&I] = SROAArg;
+    if (C) {
+      // This is the 1st time we've seen a constant, record it.
+      FirstC = C;
+      continue;
+    }

-      return true;
-    }
+    // The remaining case is that this is the 1st time we've seen a pointer
+    // with constant offset, record it.
+    FirstV = V;
+    FirstBaseAndOffset = BaseAndOffset;
   }

+  // Check if we can map phi to a constant.
+  if (FirstC) {
+    SimplifiedValues[&I] = FirstC;
+    return true;
+  }
+
+  // Check if we can map phi to a pointer with constant offset.
+  if (FirstBaseAndOffset.first) {
+    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
+
+    Value *SROAArg;
+    DenseMap<Value *, int>::iterator CostIt;
+    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
+      SROAArgValues[&I] = SROAArg;
+  }
+
+  return true;
+}
+
+/// \brief Check we can fold GEPs of constant-offset call site argument
+/// pointers. This requires target data and inbounds GEPs.
+///
+/// \return true if the specified GEP can be folded.
+bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
+  // Check if we have a base + offset for the pointer.
+  std::pair<Value *, APInt> BaseAndOffset =
+      ConstantOffsetPtrs.lookup(I.getPointerOperand());
+  if (!BaseAndOffset.first)
+    return false;
+
+  // Check if the offset of this GEP is constant, and if so accumulate it
+  // into Offset.
+  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
+    return false;
+
+  // Add the result as a new mapping to Base + Offset.
+  ConstantOffsetPtrs[&I] = BaseAndOffset;
+
+  return true;
+}
+
+bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  bool SROACandidate =
+      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
+
+  // Lambda to check whether a GEP's indices are all constant.
   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
@@ -445,7 +573,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
     return true;
   };

-  if (IsGEPOffsetConstant(I)) {
+  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
     if (SROACandidate)
       SROAArgValues[&I] = SROAArg;
@@ -643,15 +771,17 @@ bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
   // If global profile summary is available, then callsite's coldness is
   // determined based on that.
-  if (PSI->hasProfileSummary())
+  if (PSI && PSI->hasProfileSummary())
     return PSI->isColdCallSite(CS, CallerBFI);
+
+  // Otherwise we need BFI to be available.
   if (!CallerBFI)
     return false;

-  // In the absence of global profile summary, determine if the callsite is cold
-  // relative to caller's entry. We could potentially cache the computation of
-  // scaled entry frequency, but the added complexity is not worth it unless
-  // this scaling shows up high in the profiles.
+  // Determine if the callsite is cold relative to caller's entry. We could
+  // potentially cache the computation of scaled entry frequency, but the
+  // added complexity is not worth it unless this scaling shows up high in
+  // the profiles.
   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
   auto CallSiteBB = CS.getInstruction()->getParent();
   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
@@ -660,6 +790,34 @@ bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
   return CallSiteFreq < CallerEntryFreq * ColdProb;
 }

+Optional<int>
+CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
+                                      BlockFrequencyInfo *CallerBFI) {
+
+  // If global profile summary is available, then callsite's hotness is
+  // determined based on that.
+  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
+    return Params.HotCallSiteThreshold;
+
+  // Otherwise we need BFI to be available and to have a locally hot callsite
+  // threshold.
+  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
+    return None;
+
+  // Determine if the callsite is hot relative to caller's entry. We could
+  // potentially cache the computation of scaled entry frequency, but the
+  // added complexity is not worth it unless this scaling shows up high in
+  // the profiles.
+  auto CallSiteBB = CS.getInstruction()->getParent();
+  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
+  auto CallerEntryFreq = CallerBFI->getEntryFreq();
+  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
+    return Params.LocallyHotCallSiteThreshold;
+
+  // Otherwise treat it normally.
+  return None;
+}
+
 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   // If no size growth is allowed for this inlining, set Threshold to 0.
   if (!allowSizeGrowth(CS)) {
@@ -679,11 +837,49 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
     return B ? std::max(A, B.getValue()) : A;
   };

+  // Various bonus percentages. These are multiplied by Threshold to get the
+  // bonus values.
+  // SingleBBBonus: This bonus is applied if the callee has a single reachable
+  // basic block at the given callsite context. This is speculatively applied
+  // and withdrawn if more than one basic block is seen.
+  //
+  // Vector bonuses: We want to more aggressively inline vector-dense kernels
+  // and apply this bonus based on the percentage of vector instructions. A
+  // bonus is applied if the vector instructions exceed 50% and half that
+  // amount is applied if it exceeds 10%. Note that these bonuses are some
+  // what arbitrary and evolved over time by accident as much as because they
+  // are principled bonuses.
+  // FIXME: It would be nice to base the bonus values on something more
+  // scientific.
+  //
+  // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
+  // of the last call to a static function as inlining such functions is
+  // guaranteed to reduce code size.
+  //
+  // These bonus percentages may be set to 0 based on properties of the caller
+  // and the callsite.
+  int SingleBBBonusPercent = 50;
+  int VectorBonusPercent = 150;
+  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
+
+  // Lambda to set all the above bonus and bonus percentages to 0.
+  auto DisallowAllBonuses = [&]() {
+    SingleBBBonusPercent = 0;
+    VectorBonusPercent = 0;
+    LastCallToStaticBonus = 0;
+  };
+
   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
   // and reduce the threshold if the caller has the necessary attribute.
-  if (Caller->optForMinSize())
+  if (Caller->optForMinSize()) {
     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
-  else if (Caller->optForSize())
+    // For minsize, we want to disable the single BB bonus and the vector
+    // bonuses, but not the last-call-to-static bonus. Inlining the last call
+    // to a static function will, at the minimum, eliminate the parameter
+    // setup and call/return instructions.
+    SingleBBBonusPercent = 0;
+    VectorBonusPercent = 0;
+  } else if (Caller->optForSize())
     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

   // Adjust the threshold based on inlinehint attribute and profile based
@@ -691,35 +887,48 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   if (!Caller->optForMinSize()) {
     if (Callee.hasFnAttribute(Attribute::InlineHint))
       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
-    if (PSI) {
-      BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
-      // FIXME: After switching to the new passmanager, simplify the logic
-      // below by checking only the callsite hotness/coldness. The check for
-      // CallerBFI exists only because we do not have BFI available with the
-      // old PM.
-      //
-      // Use callee's hotness information only if we have no way of determining
-      // callsite's hotness information. Callsite hotness can be determined if
-      // sample profile is used (which adds hotness metadata to calls) or if
-      // caller's BlockFrequencyInfo is available.
-      if (CallerBFI || PSI->hasSampleProfile()) {
-        if (PSI->isHotCallSite(CS, CallerBFI)) {
-          DEBUG(dbgs() << "Hot callsite.\n");
-          Threshold = Params.HotCallSiteThreshold.getValue();
-        } else if (isColdCallSite(CS, CallerBFI)) {
-          DEBUG(dbgs() << "Cold callsite.\n");
-          Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
-        }
-      } else {
-        if (PSI->isFunctionEntryHot(&Callee)) {
-          DEBUG(dbgs() << "Hot callee.\n");
-          // If callsite hotness can not be determined, we may still know
-          // that the callee is hot and treat it as a weaker hint for threshold
-          // increase.
-          Threshold = MaxIfValid(Threshold, Params.HintThreshold);
-        } else if (PSI->isFunctionEntryCold(&Callee)) {
-          DEBUG(dbgs() << "Cold callee.\n");
-          Threshold = MinIfValid(Threshold, Params.ColdThreshold);
-        }
+
+    // FIXME: After switching to the new passmanager, simplify the logic below
+    // by checking only the callsite hotness/coldness as we will reliably
+    // have local profile information.
+    //
+    // Callsite hotness and coldness can be determined if sample profile is
+    // used (which adds hotness metadata to calls) or if caller's
+    // BlockFrequencyInfo is available.
+    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
+    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
+    if (!Caller->optForSize() && HotCallSiteThreshold) {
+      DEBUG(dbgs() << "Hot callsite.\n");
+      // FIXME: This should update the threshold only if it exceeds the
+      // current threshold, but AutoFDO + ThinLTO currently relies on this
+      // behavior to prevent inlining of hot callsites during ThinLTO
+      // compile phase.
+      Threshold = HotCallSiteThreshold.getValue();
+    } else if (isColdCallSite(CS, CallerBFI)) {
+      DEBUG(dbgs() << "Cold callsite.\n");
+      // Do not apply bonuses for a cold callsite including the
+      // LastCallToStatic bonus. While this bonus might result in code size
+      // reduction, it can cause the size of a non-cold caller to increase
+      // preventing it from being inlined.
+      DisallowAllBonuses();
+      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
+    } else if (PSI) {
+      // Use callee's global profile information only if we have no way of
+      // determining this via callsite information.
+      if (PSI->isFunctionEntryHot(&Callee)) {
+        DEBUG(dbgs() << "Hot callee.\n");
+        // If callsite hotness can not be determined, we may still know
+        // that the callee is hot and treat it as a weaker hint for threshold
+        // increase.
+        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+      } else if (PSI->isFunctionEntryCold(&Callee)) {
+        DEBUG(dbgs() << "Cold callee.\n");
+        // Do not apply bonuses for a cold callee including the
+        // LastCallToStatic bonus. While this bonus might result in code size
+        // reduction, it can cause the size of a non-cold caller to increase
+        // preventing it from being inlined.
+        DisallowAllBonuses();
+        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
       }
     }
   }
@@ -727,6 +936,17 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   // Finally, take the target-specific inlining threshold multiplier into
   // account.
   Threshold *= TTI.getInliningThresholdMultiplier();
+
+  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
+  VectorBonus = Threshold * VectorBonusPercent / 100;
+
+  bool OnlyOneCallAndLocalLinkage =
+      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
+  // If there is only one call of the function, and it has internal linkage,
+  // the cost of inlining it drops dramatically. It may seem odd to update
+  // Cost in updateThreshold, but the bonus depends on the logic in this
+  // method.
+  if (OnlyOneCallAndLocalLinkage)
+    Cost -= LastCallToStaticBonus;
 }

 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
@@ -784,6 +1004,34 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
   return false;
 }

+bool CallAnalyzer::visitOr(BinaryOperator &I) {
+  // This is necessary because the generic simplify instruction only works if
+  // both operands are constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+            SimplifiedValues.lookup(I.getOperand(i))))
+      if (C->isAllOnesValue()) {
+        SimplifiedValues[&I] = C;
+        return true;
+      }
+  }
+  return Base::visitOr(I);
+}
+
+bool CallAnalyzer::visitAnd(BinaryOperator &I) {
+  // This is necessary because the generic simplify instruction only works if
+  // both operands are constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+            SimplifiedValues.lookup(I.getOperand(i))))
+      if (C->isZero()) {
+        SimplifiedValues[&I] = C;
+        return true;
+      }
+  }
+  return Base::visitAnd(I);
+}
+
 bool CallAnalyzer::visitSub(BinaryOperator &I) {
   // Try to handle a special case: we can fold computing the difference of two
   // constant-related pointers.
@@ -845,6 +1093,15 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
     disableSROA(CostIt);
   }

+  // If the data is already loaded from this address and hasn't been clobbered
+  // by any stores or calls, this load is likely to be redundant and can be
+  // eliminated.
+  if (EnableLoadElimination &&
+      !LoadAddrSet.insert(I.getPointerOperand()).second) {
+    LoadEliminationCost += InlineConstants::InstrCost;
+    return true;
+  }
+
   return false;
 }
@@ -860,6 +1117,15 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
     disableSROA(CostIt);
   }

+  // The store can potentially clobber loads and prevent repeated loads from
+  // being eliminated.
+  // FIXME:
+  // 1. We can probably keep an initial set of eliminatable loads substracted
+  // from the cost even when we finally see a store. We just need to disable
+  // *further* accumulation of elimination savings.
+  // 2. We should probably at some point thread MemorySSA for the callee into
+  // this and then use that to actually compute *really* precise savings.
+  disableLoadElimination();
   return false;
 }
@@ -942,6 +1208,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
       switch (II->getIntrinsicID()) {
       default:
+        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
+          disableLoadElimination();
         return Base::visitCallSite(CS);

       case Intrinsic::load_relative:
@@ -952,6 +1220,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
       case Intrinsic::memset:
       case Intrinsic::memcpy:
       case Intrinsic::memmove:
+        disableLoadElimination();
         // SROA can usually chew through these intrinsics, but they aren't free.
         return false;
       case Intrinsic::localescape:
@@ -960,7 +1229,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
       }
     }

-    if (F == CS.getInstruction()->getParent()->getParent()) {
+    if (F == CS.getInstruction()->getFunction()) {
       // This flag will fully abort the analysis, so don't bother with anything
       // else.
       IsRecursiveCall = true;
@@ -978,6 +1247,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
         Cost += InlineConstants::CallPenalty;
     }

+    if (!CS.onlyReadsMemory())
+      disableLoadElimination();
     return Base::visitCallSite(CS);
   }
@@ -992,8 +1263,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
   // Next, check if this happens to be an indirect function call to a known
   // function in this inline context. If not, we've done all we can.
   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
-  if (!F)
+  if (!F) {
+    if (!CS.onlyReadsMemory())
+      disableLoadElimination();
     return Base::visitCallSite(CS);
+  }

   // If we have a constant that we are calling as a function, we can peer
   // through it and see the function target. This happens not infrequently
@@ -1002,7 +1276,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
   // out. Pretend to inline the function, with a custom threshold.
   auto IndirectCallParams = Params;
   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
-  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
+  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                   IndirectCallParams);
   if (CA.analyzeCall(CS)) {
     // We were able to inline the indirect call! Subtract the cost from the
@@ -1010,6 +1284,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
   }

+  if (!F->onlyReadsMemory())
+    disableLoadElimination();
   return Base::visitCallSite(CS);
 }
@@ -1030,6 +1306,87 @@ bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
          SimplifiedValues.lookup(BI.getCondition()));
 }

+bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
+  bool CheckSROA = SI.getType()->isPointerTy();
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  Constant *TrueC = dyn_cast<Constant>(TrueVal);
+  if (!TrueC)
+    TrueC = SimplifiedValues.lookup(TrueVal);
+  Constant *FalseC = dyn_cast<Constant>(FalseVal);
+  if (!FalseC)
+    FalseC = SimplifiedValues.lookup(FalseVal);
+  Constant *CondC =
+      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
+
+  if (!CondC) {
+    // Select C, X, X => X
+    if (TrueC == FalseC && TrueC) {
+      SimplifiedValues[&SI] = TrueC;
+      return true;
+    }
+
+    if (!CheckSROA)
+      return Base::visitSelectInst(SI);
+
+    std::pair<Value *, APInt> TrueBaseAndOffset =
+        ConstantOffsetPtrs.lookup(TrueVal);
+    std::pair<Value *, APInt> FalseBaseAndOffset =
+        ConstantOffsetPtrs.lookup(FalseVal);
+    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
+      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
+
+      Value *SROAArg;
+      DenseMap<Value *, int>::iterator CostIt;
+      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
+        SROAArgValues[&SI] = SROAArg;
+      return true;
+    }
+
+    return Base::visitSelectInst(SI);
+  }
+
+  // Select condition is a constant.
+  Value *SelectedV = CondC->isAllOnesValue()
+                         ? TrueVal
+                         : (CondC->isNullValue()) ? FalseVal : nullptr;
+  if (!SelectedV) {
+    // Condition is a vector constant that is not all 1s or all 0s. If all
+    // operands are constants, ConstantExpr::getSelect() can handle the cases
+    // such as select vectors.
+    if (TrueC && FalseC) {
+      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
+        SimplifiedValues[&SI] = C;
+        return true;
+      }
+    }
+    return Base::visitSelectInst(SI);
+  }
+
+  // Condition is either all 1s or all 0s. SI can be simplified.
+  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
+    SimplifiedValues[&SI] = SelectedC;
+    return true;
+  }
+
+  if (!CheckSROA)
+    return true;
+
+  std::pair<Value *, APInt> BaseAndOffset =
+      ConstantOffsetPtrs.lookup(SelectedV);
+  if (BaseAndOffset.first) {
+    ConstantOffsetPtrs[&SI] = BaseAndOffset;
+
+    Value *SROAArg;
+    DenseMap<Value *, int>::iterator CostIt;
+    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
+      SROAArgValues[&SI] = SROAArg;
+  }
+
+  return true;
+}
+
 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   // We model unconditional switches as free, see the comments on handling
   // branches.
@@ -1062,7 +1419,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
       std::min((int64_t)CostUpperBound,
                (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

-  if (CostLowerBound > Threshold) {
+  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
     Cost = CostLowerBound;
     return false;
   }
@@ -1211,21 +1568,39 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
     else
       Cost += InlineConstants::InstrCost;

+    using namespace ore;
     // If the visit this instruction detected an uninlinable pattern, abort.
     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
-        HasIndirectBr || HasFrameEscape)
+        HasIndirectBr || HasFrameEscape) {
+      if (ORE)
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
+                                          CandidateCS.getInstruction())
+                 << NV("Callee", &F)
+                 << " has uninlinable pattern and cost is not fully computed";
+        });
       return false;
+    }

     // If the caller is a recursive function then we don't want to inline
     // functions which allocate a lot of stack space because it would increase
     // the caller stack usage dramatically.
     if (IsCallerRecursive &&
-        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
+        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
+      if (ORE)
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
+                                          CandidateCS.getInstruction())
+                 << NV("Callee", &F)
+                 << " is recursive and allocates too much stack space. Cost is "
+                    "not fully computed";
+        });
       return false;
+    }

     // Check if we've past the maximum possible threshold so we don't spin in
     // huge basic blocks that will never inline.
-    if (Cost > Threshold)
+    if (Cost >= Threshold && !ComputeFullInlineCost)
       return false;
   }
@@ -1270,6 +1645,44 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
 }

+/// \brief Find dead blocks due to deleted CFG edges during inlining.
+///
+/// If we know the successor of the current block, \p CurrBB, has to be \p
+/// NextBB, the other successors of \p CurrBB are dead if these successors
+/// have no live incoming CFG edges. If one block is found to be dead, we can
+/// continue growing the dead block list by checking the successors of the
+/// dead blocks to see if all their incoming edges are dead or not.
+void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
+  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
+    // A CFG edge is dead if the predecessor is dead or the predessor has a
+    // known successor which is not the one under exam.
+    return (DeadBlocks.count(Pred) ||
+            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
+  };
+
+  auto IsNewlyDead = [&](BasicBlock *BB) {
+    // If all the edges to a block are dead, the block is also dead.
+    return (!DeadBlocks.count(BB) &&
+            llvm::all_of(predecessors(BB),
+                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
+  };
+
+  for (BasicBlock *Succ : successors(CurrBB)) {
+    if (Succ == NextBB || !IsNewlyDead(Succ))
+      continue;
+    SmallVector<BasicBlock *, 4> NewDead;
+    NewDead.push_back(Succ);
+    while (!NewDead.empty()) {
+      BasicBlock *Dead = NewDead.pop_back_val();
+      if (DeadBlocks.insert(Dead))
+        // Continue growing the dead block lists.
+        for (BasicBlock *S : successors(Dead))
+          if (IsNewlyDead(S))
+            NewDead.push_back(S);
+    }
+  }
+}
+
 /// \brief Analyze a call site for potential inlining.
 ///
 /// Returns true if inlining this call is viable, and false if it is not
@@ -1296,51 +1709,35 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
   // Update the threshold based on callsite properties
   updateThreshold(CS, F);

-  FiftyPercentVectorBonus = 3 * Threshold / 2;
-  TenPercentVectorBonus = 3 * Threshold / 4;
-
-  // Track whether the post-inlining function would have more than one basic
-  // block. A single basic block is often intended for inlining. Balloon the
-  // threshold by 50% until we pass the single-BB phase.
-  bool SingleBB = true;
-  int SingleBBBonus = Threshold / 2;
-
   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
   // this Threshold any time, and cost cannot decrease, we can stop processing
   // the rest of the function body.
-  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
+  Threshold += (SingleBBBonus + VectorBonus);

   // Give out bonuses for the callsite, as the instructions setting them up
   // will be gone after inlining.
   Cost -= getCallsiteCost(CS, DL);

-  // If there is only one call of the function, and it has internal linkage,
-  // the cost of inlining it drops dramatically.
-  bool OnlyOneCallAndLocalLinkage =
-      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
-  if (OnlyOneCallAndLocalLinkage)
-    Cost -= InlineConstants::LastCallToStaticBonus;
-
   // If this function uses the coldcc calling convention, prefer not to inline
   // it.
   if (F.getCallingConv() == CallingConv::Cold)
     Cost += InlineConstants::ColdccPenalty;

   // Check if we're done. This can happen due to bonuses and penalties.
-  if (Cost > Threshold)
+  if (Cost >= Threshold && !ComputeFullInlineCost)
     return false;

   if (F.empty())
     return true;

-  Function *Caller = CS.getInstruction()->getParent()->getParent();
+  Function *Caller = CS.getInstruction()->getFunction();
   // Check if the caller function is recursive itself.
   for (User *U : Caller->users()) {
     CallSite Site(U);
     if (!Site)
       continue;
     Instruction *I = Site.getInstruction();
-    if (I->getParent()->getParent() == Caller) {
+    if (I->getFunction() == Caller) {
       IsCallerRecursive = true;
       break;
     }
@@ -1388,11 +1785,12 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
       BBSetVector;
   BBSetVector BBWorklist;
   BBWorklist.insert(&F.getEntryBlock());
+  bool SingleBB = true;
   // Note that we *must not* cache the size, this loop grows the worklist.
   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
     // Bail out the moment we cross the threshold. This means we'll under-count
     // the cost, but only when undercounting doesn't matter.
-    if (Cost > Threshold)
+    if (Cost >= Threshold && !ComputeFullInlineCost)
       break;

     BasicBlock *BB = BBWorklist[Idx];
@@ -1422,7 +1820,10 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
       Value *Cond = BI->getCondition();
       if (ConstantInt *SimpleCond =
               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
-        BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
+        BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
+        BBWorklist.insert(NextBB);
+        KnownSuccessors[BB] = NextBB;
+        findDeadBlocks(BB, NextBB);
         continue;
       }
     }
@@ -1430,7 +1831,10 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
       Value *Cond = SI->getCondition();
       if (ConstantInt *SimpleCond =
               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
-        BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
+        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
+        BBWorklist.insert(NextBB);
+        KnownSuccessors[BB] = NextBB;
+        findDeadBlocks(BB, NextBB);
         continue;
       }
     }
@@ -1452,6 +1856,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
     }
   }

+  bool OnlyOneCallAndLocalLinkage =
+      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
   // If this is a noduplicate call, we can still inline as long as
   // inlining this would cause the removal of the caller (so the instruction
   // is not actually duplicated, just moved).
@@ -1462,9 +1868,9 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
   // subtract the excess bonus, if any, from the Threshold before
   // comparing against Cost.
   if (NumVectorInstructions <= NumInstructions / 10)
-    Threshold -= FiftyPercentVectorBonus;
+    Threshold -= VectorBonus;
   else if (NumVectorInstructions <= NumInstructions / 2)
-    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
+    Threshold -= VectorBonus/2;

   return Cost < std::max(1, Threshold);
 }
@@ -1482,6 +1888,7 @@ LLVM_DUMP_METHOD void CallAnalyzer::dump() {
   DEBUG_PRINT_STAT(NumInstructions);
   DEBUG_PRINT_STAT(SROACostSavings);
   DEBUG_PRINT_STAT(SROACostSavingsLost);
+  DEBUG_PRINT_STAT(LoadEliminationCost);
   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   DEBUG_PRINT_STAT(Cost);
   DEBUG_PRINT_STAT(Threshold);
@@ -1534,9 +1941,9 @@ InlineCost llvm::getInlineCost(
     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
-    ProfileSummaryInfo *PSI) {
+    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
-                       GetAssumptionCache, GetBFI, PSI);
+                       GetAssumptionCache, GetBFI, PSI, ORE);
 }

 InlineCost llvm::getInlineCost(
@@ -1544,7 +1951,7 @@ InlineCost llvm::getInlineCost(
     TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
-    ProfileSummaryInfo *PSI) {
+    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

   // Cannot inline indirect calls.
   if (!Callee)
@@ -1560,11 +1967,12 @@ InlineCost llvm::getInlineCost(

   // Never inline functions with conflicting attributes (unless callee has
   // always-inline attribute).
-  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
+  Function *Caller = CS.getCaller();
+  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
     return llvm::InlineCost::getNever();

   // Don't inline this call if the caller has the optnone attribute.
-  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
+  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
     return llvm::InlineCost::getNever();

   // Don't inline functions which can be interposed at link-time. Don't inline
@@ -1576,9 +1984,9 @@ InlineCost llvm::getInlineCost(
     return llvm::InlineCost::getNever();

   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
-                     << "...\n");
+                     << "... (caller:" << Caller->getName() << ")\n");

-  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
+  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                   Params);
   bool ShouldInline = CA.analyzeCall(CS);

@@ -1652,6 +2060,16 @@ InlineParams llvm::getInlineParams(int Threshold) {
   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
   Params.HotCallSiteThreshold = HotCallSiteThreshold;

+  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
+  // populate LocallyHotCallSiteThreshold. Later, we populate
+  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
+  // we know that optimization level is O3 (in the getInlineParams variant that
+  // takes the opt and size levels).
+  // FIXME: Remove this check (and make the assignment unconditional) after
+  // addressing size regression issues at O2.
+  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
+    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
+
   // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

@@ -1691,5 +2109,12 @@ static int computeThresholdFromOptLevels(unsigned OptLevel,
 }

 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
-  return getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
+  auto Params =
+      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
+  // At O3, use the value of -locally-hot-callsite-threshold option to populate
+  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
+  // when it is specified explicitly.
+  if (OptLevel > 2)
+    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
+  return Params;
 }
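
Editor's note: the redundant-load modeling that this patch threads through visitLoad, visitStore, and visitCallSite is compact enough to restate on its own. The C++ below is a minimal sketch of that bookkeeping under stated assumptions: LoadCostModel, its members, and the fixed InstrCost value are hypothetical standalone names for illustration, not the actual CallAnalyzer API.

#include <set>

// Sketch of the load-elimination accounting introduced by this patch.
// Savings are accumulated optimistically and handed back to Cost the
// moment a potential clobber (store or unknown call) is seen.
struct LoadCostModel {
  static constexpr int InstrCost = 5;   // hypothetical per-instruction cost
  bool EnableLoadElimination = true;    // true until a clobber is seen
  std::set<const void *> LoadAddrSet;   // addresses already loaded
  int LoadEliminationCost = 0;          // savings claimed so far
  int Cost = 0;                         // running inline cost

  // A repeated load from an already-seen address is likely redundant:
  // claim the savings instead of charging for the instruction.
  bool visitLoad(const void *Addr) {
    if (EnableLoadElimination && !LoadAddrSet.insert(Addr).second) {
      LoadEliminationCost += InstrCost;
      return true;   // modeled as free
    }
    return false;    // charged normally by the caller
  }

  // A store or a call that may write memory can clobber earlier loads,
  // so stop claiming savings and give back what was already claimed.
  void disableLoadElimination() {
    if (EnableLoadElimination) {
      Cost += LoadEliminationCost;
      EnableLoadElimination = false;
    }
  }
};

This mirrors why the patch subtracts nothing up front: LoadEliminationCost is only a provisional credit, realized implicitly if no clobber appears before analysis ends.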
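The dead-block propagation added via findDeadBlocks is, at its core, a worklist algorithm over the CFG. The sketch below restates it over a toy adjacency-list graph; Graph, DeadBlockTracker, and the integer Block type are assumptions for illustration only, not LLVM API, and every block is assumed to have (possibly empty) Preds/Succs entries.

#include <map>
#include <set>
#include <vector>

using Block = int;
struct Graph {
  std::map<Block, std::vector<Block>> Succs, Preds;
};

struct DeadBlockTracker {
  std::set<Block> DeadBlocks;
  std::map<Block, Block> KnownSuccessors;  // blocks with one live successor

  // An edge is dead if its source is dead, or the source is known to
  // branch only to a different successor.
  bool isEdgeDead(Block Pred, Block Succ) {
    auto It = KnownSuccessors.find(Pred);
    return DeadBlocks.count(Pred) ||
           (It != KnownSuccessors.end() && It->second != Succ);
  }

  // A block is newly dead when every incoming edge is dead.
  bool isNewlyDead(const Graph &G, Block BB) {
    if (DeadBlocks.count(BB))
      return false;
    for (Block P : G.Preds.at(BB))
      if (!isEdgeDead(P, BB))
        return false;
    return true;
  }

  // When CurrBB is known to branch only to NextBB (the sketch records this
  // itself; the patch records it at the call site), grow the dead set from
  // CurrBB's other successors with a worklist.
  void findDeadBlocks(const Graph &G, Block CurrBB, Block NextBB) {
    KnownSuccessors[CurrBB] = NextBB;
    for (Block Succ : G.Succs.at(CurrBB)) {
      if (Succ == NextBB || !isNewlyDead(G, Succ))
        continue;
      std::vector<Block> Worklist{Succ};
      while (!Worklist.empty()) {
        Block Dead = Worklist.back();
        Worklist.pop_back();
        if (DeadBlocks.insert(Dead).second)
          for (Block S : G.Succs.at(Dead))
            if (isNewlyDead(G, S))
              Worklist.push_back(S);
      }
    }
  }
};

The payoff in the patch is in visitPHI: incoming values from blocks in DeadBlocks, or from blocks whose known successor is elsewhere, are skipped, which lets more phis collapse to constants during cost analysis.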