diff options
Diffstat (limited to 'llvm/lib/Analysis/InlineCost.cpp')
-rw-r--r-- | llvm/lib/Analysis/InlineCost.cpp | 479 |
1 files changed, 373 insertions, 106 deletions
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index de83a48aad16..33d714406d7f 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -24,9 +24,11 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -38,6 +40,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -46,6 +49,15 @@ using namespace llvm; STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed"); +static cl::opt<int> + DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225), + cl::ZeroOrMore, + cl::desc("Default amount of inlining to perform")); + +static cl::opt<bool> PrintInstructionComments( + "print-instruction-comments", cl::Hidden, cl::init(false), + cl::desc("Prints comments for instruction based on inline cost analysis")); + static cl::opt<int> InlineThreshold( "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore, cl::desc("Control the amount of inlining to perform (default = 225)")); @@ -92,8 +104,52 @@ static cl::opt<bool> OptComputeFullInlineCost( cl::desc("Compute the full inline cost of a call site even when the cost " "exceeds the threshold.")); +static cl::opt<bool> InlineCallerSupersetNoBuiltin( + "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true), + cl::ZeroOrMore, + cl::desc("Allow inlining when caller has a superset of callee's nobuiltin " + "attributes.")); + +static cl::opt<bool> DisableGEPConstOperand( + "disable-gep-const-evaluation", cl::Hidden, cl::init(false), + cl::desc("Disables evaluation of GetElementPtr with constant operands")); + namespace { class InlineCostCallAnalyzer; + +// This struct is used to store information about inline cost of a +// particular instruction +struct InstructionCostDetail { + int CostBefore = 0; + int CostAfter = 0; + int ThresholdBefore = 0; + int ThresholdAfter = 0; + + int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; } + + int getCostDelta() const { return CostAfter - CostBefore; } + + bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; } +}; + +class InlineCostAnnotationWriter : public AssemblyAnnotationWriter { +private: + InlineCostCallAnalyzer *const ICCA; + +public: + InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {} + virtual void emitInstructionAnnot(const Instruction *I, + formatted_raw_ostream &OS) override; +}; + +/// Carry out call site analysis, in order to evaluate inlinability. +/// NOTE: the type is currently used as implementation detail of functions such +/// as llvm::getInlineCost. Note the function_ref constructor parameters - the +/// expectation is that they come from the outer scope, from the wrapper +/// functions. If we want to support constructing CallAnalyzer objects where +/// lambdas are provided inline at construction, or where the object needs to +/// otherwise survive past the scope of the provided functions, we need to +/// revisit the argument types. class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { typedef InstVisitor<CallAnalyzer, bool> Base; friend class InstVisitor<CallAnalyzer, bool>; @@ -104,10 +160,10 @@ protected: const TargetTransformInfo &TTI; /// Getter for the cache of @llvm.assume intrinsics. - std::function<AssumptionCache &(Function &)> &GetAssumptionCache; + function_ref<AssumptionCache &(Function &)> GetAssumptionCache; /// Getter for BlockFrequencyInfo - Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI; + function_ref<BlockFrequencyInfo &(Function &)> GetBFI; /// Profile summary information. ProfileSummaryInfo *PSI; @@ -130,11 +186,16 @@ protected: /// Called after a basic block was analyzed. virtual void onBlockAnalyzed(const BasicBlock *BB) {} + /// Called before an instruction was analyzed + virtual void onInstructionAnalysisStart(const Instruction *I) {} + + /// Called after an instruction was analyzed + virtual void onInstructionAnalysisFinish(const Instruction *I) {} + /// Called at the end of the analysis of the callsite. Return the outcome of /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or /// the reason it can't. - virtual InlineResult finalizeAnalysis() { return true; } - + virtual InlineResult finalizeAnalysis() { return InlineResult::success(); } /// Called when we're about to start processing a basic block, and every time /// we are done processing an instruction. Return true if there is no point in /// continuing the analysis (e.g. we've determined already the call site is @@ -145,8 +206,7 @@ protected: /// contexts propagated). It checks callsite-specific information. Return a /// reason analysis can't continue if that's the case, or 'true' if it may /// continue. - virtual InlineResult onAnalysisStart() { return true; } - + virtual InlineResult onAnalysisStart() { return InlineResult::success(); } /// Called if the analysis engine decides SROA cannot be done for the given /// alloca. virtual void onDisableSROA(AllocaInst *Arg) {} @@ -187,7 +247,7 @@ protected: /// Called to account for any other instruction not specifically accounted /// for. - virtual void onCommonInstructionSimplification() {} + virtual void onMissedSimplification() {} /// Start accounting potential benefits due to SROA for the given alloca. virtual void onInitializeSROAArg(AllocaInst *Arg) {} @@ -236,9 +296,7 @@ protected: DenseMap<Value *, AllocaInst *> SROAArgValues; /// Keep track of Allocas for which we believe we may get SROA optimization. - /// We don't delete entries in SROAArgValue because we still want - /// isAllocaDerivedArg to function correctly. - DenseSet<AllocaInst *> EnabledSROAArgValues; + DenseSet<AllocaInst *> EnabledSROAAllocas; /// Keep track of values which map to a pointer base and constant offset. DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs; @@ -258,8 +316,7 @@ protected: AllocaInst *getSROAArgForValueOrNull(Value *V) const { auto It = SROAArgValues.find(V); - if (It == SROAArgValues.end() || - EnabledSROAArgValues.count(It->second) == 0) + if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0) return nullptr; return It->second; } @@ -337,17 +394,24 @@ protected: bool visitUnreachableInst(UnreachableInst &I); public: - CallAnalyzer(const TargetTransformInfo &TTI, - std::function<AssumptionCache &(Function &)> &GetAssumptionCache, - Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI, - ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, - Function &Callee, CallBase &Call) + CallAnalyzer( + Function &Callee, CallBase &Call, const TargetTransformInfo &TTI, + function_ref<AssumptionCache &(Function &)> GetAssumptionCache, + function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, + ProfileSummaryInfo *PSI = nullptr, + OptimizationRemarkEmitter *ORE = nullptr) : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), CandidateCall(Call), EnableLoadElimination(true) {} InlineResult analyze(); + Optional<Constant*> getSimplifiedValue(Instruction *I) { + if (SimplifiedValues.find(I) != SimplifiedValues.end()) + return SimplifiedValues[I]; + return None; + } + // Keep a bunch of stats about the cost savings found so we can print them // out when debugging. unsigned NumConstantArgs = 0; @@ -375,6 +439,11 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { /// Tunable parameters that control the analysis. const InlineParams &Params; + // This DenseMap stores the delta change in cost and threshold after + // accounting for the given instruction. The map is filled only with the + // flag PrintInstructionComments on. + DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap; + /// Upper bound for the inlining cost. Bonuses are being applied to account /// for speculative "expected profit" of the inlining decision. int Threshold = 0; @@ -382,6 +451,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { /// Attempt to evaluate indirect calls to boost its inline cost. const bool BoostIndirectCalls; + /// Ignore the threshold when finalizing analysis. + const bool IgnoreThreshold; + /// Inlining cost measured in abstract units, accounts for all the /// instructions expected to be executed for a given function invocation. /// Instructions that are statically proven to be dead based on call-site @@ -456,9 +528,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { InlineConstants::IndirectCallThreshold; /// FIXME: if InlineCostCallAnalyzer is derived from, this may need /// to instantiate the derived class. - InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, - Call, IndirectCallParams, false); - if (CA.analyze()) { + InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, + GetAssumptionCache, GetBFI, PSI, ORE, false); + if (CA.analyze().isSuccess()) { // We were able to inline the indirect call! Subtract the cost from the // threshold to get the bonus we want to apply, but don't go below zero. Cost -= std::max(0, CA.getThreshold() - CA.getCost()); @@ -507,7 +579,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { addCost(SwitchCost, (int64_t)CostUpperBound); } - void onCommonInstructionSimplification() override { + void onMissedSimplification() override { addCost(InlineConstants::InstrCost); } @@ -515,7 +587,6 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { assert(Arg != nullptr && "Should not initialize SROA costs for null value."); SROAArgCosts[Arg] = 0; - EnabledSROAArgValues.insert(Arg); } void onAggregateSROAUse(AllocaInst *SROAArg) override { @@ -538,6 +609,25 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { SingleBB = false; } } + + void onInstructionAnalysisStart(const Instruction *I) override { + // This function is called to store the initial cost of inlining before + // the given instruction was assessed. + if (!PrintInstructionComments) + return; + InstructionCostDetailMap[I].CostBefore = Cost; + InstructionCostDetailMap[I].ThresholdBefore = Threshold; + } + + void onInstructionAnalysisFinish(const Instruction *I) override { + // This function is called to find new values of cost and threshold after + // the instruction has been assessed. + if (!PrintInstructionComments) + return; + InstructionCostDetailMap[I].CostAfter = Cost; + InstructionCostDetailMap[I].ThresholdAfter = Threshold; + } + InlineResult finalizeAnalysis() override { // Loops generally act a lot like calls in that they act like barriers to // movement, require a certain amount of setup, etc. So when optimising for @@ -566,12 +656,14 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { else if (NumVectorInstructions <= NumInstructions / 2) Threshold -= VectorBonus / 2; - return Cost < std::max(1, Threshold); + if (IgnoreThreshold || Cost < std::max(1, Threshold)) + return InlineResult::success(); + return InlineResult::failure("Cost over threshold."); } bool shouldStop() override { // Bail out the moment we cross the threshold. This means we'll under-count // the cost, but only when undercounting doesn't matter. - return Cost >= Threshold && !ComputeFullInlineCost; + return !IgnoreThreshold && Cost >= Threshold && !ComputeFullInlineCost; } void onLoadEliminationOpportunity() override { @@ -618,25 +710,42 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { // Check if we're done. This can happen due to bonuses and penalties. if (Cost >= Threshold && !ComputeFullInlineCost) - return "high cost"; + return InlineResult::failure("high cost"); - return true; + return InlineResult::success(); } public: InlineCostCallAnalyzer( + Function &Callee, CallBase &Call, const InlineParams &Params, const TargetTransformInfo &TTI, - std::function<AssumptionCache &(Function &)> &GetAssumptionCache, - Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI, - ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, - CallBase &Call, const InlineParams &Params, bool BoostIndirect = true) - : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call), + function_ref<AssumptionCache &(Function &)> GetAssumptionCache, + function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, + ProfileSummaryInfo *PSI = nullptr, + OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true, + bool IgnoreThreshold = false) + : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE), ComputeFullInlineCost(OptComputeFullInlineCost || Params.ComputeFullInlineCost || ORE), Params(Params), Threshold(Params.DefaultThreshold), - BoostIndirectCalls(BoostIndirect) {} + BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold), + Writer(this) {} + + /// Annotation Writer for instruction details + InlineCostAnnotationWriter Writer; + void dump(); + // Prints the same analysis as dump(), but its definition is not dependent + // on the build. + void print(); + + Optional<InstructionCostDetail> getCostDetails(const Instruction *I) { + if (InstructionCostDetailMap.find(I) != InstructionCostDetailMap.end()) + return InstructionCostDetailMap[I]; + return None; + } + virtual ~InlineCostCallAnalyzer() {} int getThreshold() { return Threshold; } int getCost() { return Cost; } @@ -650,9 +759,35 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) { void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) { onDisableSROA(SROAArg); - EnabledSROAArgValues.erase(SROAArg); + EnabledSROAAllocas.erase(SROAArg); disableLoadElimination(); } + +void InlineCostAnnotationWriter::emitInstructionAnnot(const Instruction *I, + formatted_raw_ostream &OS) { + // The cost of inlining of the given instruction is printed always. + // The threshold delta is printed only when it is non-zero. It happens + // when we decided to give a bonus at a particular instruction. + Optional<InstructionCostDetail> Record = ICCA->getCostDetails(I); + if (!Record) + OS << "; No analysis for the instruction"; + else { + OS << "; cost before = " << Record->CostBefore + << ", cost after = " << Record->CostAfter + << ", threshold before = " << Record->ThresholdBefore + << ", threshold after = " << Record->ThresholdAfter << ", "; + OS << "cost delta = " << Record->getCostDelta(); + if (Record->hasThresholdChanged()) + OS << ", threshold delta = " << Record->getThresholdDelta(); + } + auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I)); + if (C) { + OS << ", simplified to "; + C.getValue()->print(OS, true); + } + OS << "\n"; +} + /// If 'V' maps to a SROA candidate, disable SROA for it. void CallAnalyzer::disableSROA(Value *V) { if (auto *SROAArg = getSROAArgForValueOrNull(V)) { @@ -711,7 +846,9 @@ bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { Operands.push_back(SimpleOp); else Operands.push_back(*I); - return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands); + return TargetTransformInfo::TCC_Free == + TTI.getUserCost(&GEP, Operands, + TargetTransformInfo::TCK_SizeAndLatency); } bool CallAnalyzer::visitAlloca(AllocaInst &I) { @@ -720,10 +857,22 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) { if (I.isArrayAllocation()) { Constant *Size = SimplifiedValues.lookup(I.getArraySize()); if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) { + // Sometimes a dynamic alloca could be converted into a static alloca + // after this constant prop, and become a huge static alloca on an + // unconditional CFG path. Avoid inlining if this is going to happen above + // a threshold. + // FIXME: If the threshold is removed or lowered too much, we could end up + // being too pessimistic and prevent inlining non-problematic code. This + // could result in unintended perf regressions. A better overall strategy + // is needed to track stack usage during inlining. Type *Ty = I.getAllocatedType(); AllocatedSize = SaturatingMultiplyAdd( AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(), AllocatedSize); + if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) { + HasDynamicAlloca = true; + return false; + } return Base::visitAlloca(I); } } @@ -874,6 +1023,16 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { return true; }; + if (!DisableGEPConstOperand) + if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { + SmallVector<Constant *, 2> Indices; + for (unsigned int Index = 1 ; Index < COps.size() ; ++Index) + Indices.push_back(COps[Index]); + return ConstantExpr::getGetElementPtr(I.getSourceElementType(), COps[0], + Indices, I.isInBounds()); + })) + return true; + if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { if (SROAArg) SROAArgValues[&I] = SROAArg; @@ -959,7 +1118,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0))) SROAArgValues[&I] = SROAArg; - return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); + return TargetTransformInfo::TCC_Free == + TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency); } bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { @@ -983,7 +1143,8 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { if (auto *SROAArg = getSROAArgForValueOrNull(Op)) SROAArgValues[&I] = SROAArg; - return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); + return TargetTransformInfo::TCC_Free == + TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency); } bool CallAnalyzer::visitCastInst(CastInst &I) { @@ -993,7 +1154,8 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { })) return true; - // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. + // Disable SROA in the face of arbitrary casts we don't explicitly list + // elsewhere. disableSROA(I.getOperand(0)); // If this is a floating-point cast, and the target says this operation @@ -1013,7 +1175,8 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { break; } - return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); + return TargetTransformInfo::TCC_Free == + TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency); } bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { @@ -1085,7 +1248,7 @@ bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, // If global profile summary is available, then callsite's coldness is // determined based on that. if (PSI && PSI->hasProfileSummary()) - return PSI->isColdCallSite(CallSite(&Call), CallerBFI); + return PSI->isColdCallSite(Call, CallerBFI); // Otherwise we need BFI to be available. if (!CallerBFI) @@ -1109,8 +1272,7 @@ InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, // If global profile summary is available, then callsite's hotness is // determined based on that. - if (PSI && PSI->hasProfileSummary() && - PSI->isHotCallSite(CallSite(&Call), CallerBFI)) + if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI)) return Params.HotCallSiteThreshold; // Otherwise we need BFI to be available and to have a locally hot callsite @@ -1200,7 +1362,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { // Callsite hotness and coldness can be determined if sample profile is // used (which adds hotness metadata to calls) or if caller's // BlockFrequencyInfo is available. - BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr; + BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr; auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI); if (!Caller->hasOptSize() && HotCallSiteThreshold) { LLVM_DEBUG(dbgs() << "Hot callsite.\n"); @@ -1667,7 +1829,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { // does not (yet) fire. unsigned JumpTableSize = 0; - BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr; + BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); @@ -1716,7 +1878,8 @@ bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { bool CallAnalyzer::visitInstruction(Instruction &I) { // Some instructions are free. All of the free intrinsics can also be // handled by SROA, etc. - if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I)) + if (TargetTransformInfo::TCC_Free == + TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency)) return true; // We found something we don't understand or can't handle. Mark any SROA-able @@ -1761,33 +1924,36 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, // all of the per-instruction logic. The visit tree returns true if we // consumed the instruction in any way, and false if the instruction's base // cost should count against inlining. + onInstructionAnalysisStart(&*I); + if (Base::visit(&*I)) ++NumInstructionsSimplified; else - onCommonInstructionSimplification(); + onMissedSimplification(); + onInstructionAnalysisFinish(&*I); using namespace ore; // If the visit this instruction detected an uninlinable pattern, abort. - InlineResult IR; + InlineResult IR = InlineResult::success(); if (IsRecursiveCall) - IR = "recursive"; + IR = InlineResult::failure("recursive"); else if (ExposesReturnsTwice) - IR = "exposes returns twice"; + IR = InlineResult::failure("exposes returns twice"); else if (HasDynamicAlloca) - IR = "dynamic alloca"; + IR = InlineResult::failure("dynamic alloca"); else if (HasIndirectBr) - IR = "indirect branch"; + IR = InlineResult::failure("indirect branch"); else if (HasUninlineableIntrinsic) - IR = "uninlinable intrinsic"; + IR = InlineResult::failure("uninlinable intrinsic"); else if (InitsVargArgs) - IR = "varargs"; - if (!IR) { + IR = InlineResult::failure("varargs"); + if (!IR.isSuccess()) { if (ORE) ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CandidateCall) << NV("Callee", &F) << " has uninlinable pattern (" - << NV("InlineResult", IR.message) + << NV("InlineResult", IR.getFailureReason()) << ") and cost is not fully computed"; }); return IR; @@ -1798,22 +1964,25 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, // the caller stack usage dramatically. if (IsCallerRecursive && AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) { - InlineResult IR = "recursive and allocates too much stack space"; + auto IR = + InlineResult::failure("recursive and allocates too much stack space"); if (ORE) ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CandidateCall) - << NV("Callee", &F) << " is " << NV("InlineResult", IR.message) + << NV("Callee", &F) << " is " + << NV("InlineResult", IR.getFailureReason()) << ". Cost is not fully computed"; }); return IR; } if (shouldStop()) - return false; + return InlineResult::failure( + "Call site analysis is not favorable to inlining."); } - return true; + return InlineResult::success(); } /// Compute the base pointer and cumulative constant offsets for V. @@ -1904,11 +2073,11 @@ InlineResult CallAnalyzer::analyze() { ++NumCallsAnalyzed; auto Result = onAnalysisStart(); - if (!Result) + if (!Result.isSuccess()) return Result; if (F.empty()) - return true; + return InlineResult::success(); Function *Caller = CandidateCall.getFunction(); // Check if the caller function is recursive itself. @@ -1937,6 +2106,7 @@ InlineResult CallAnalyzer::analyze() { if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) { SROAArgValues[&*FAI] = SROAArg; onInitializeSROAArg(SROAArg); + EnabledSROAAllocas.insert(SROAArg); } } } @@ -1983,12 +2153,12 @@ InlineResult CallAnalyzer::analyze() { if (BB->hasAddressTaken()) for (User *U : BlockAddress::get(&*BB)->users()) if (!isa<CallBrInst>(*U)) - return "blockaddress used outside of callbr"; + return InlineResult::failure("blockaddress used outside of callbr"); // Analyze the cost of this block. If we blow through the threshold, this // returns false, and we can bail on out. InlineResult IR = analyzeBlock(BB, EphValues); - if (!IR) + if (!IR.isSuccess()) return IR; Instruction *TI = BB->getTerminator(); @@ -2034,15 +2204,15 @@ InlineResult CallAnalyzer::analyze() { // inlining this would cause the removal of the caller (so the instruction // is not actually duplicated, just moved). if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) - return "noduplicate"; + return InlineResult::failure("noduplicate"); return finalizeAnalysis(); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -/// Dump stats about this call's analysis. -LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { +void InlineCostCallAnalyzer::print() { #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" + if (PrintInstructionComments) + F.print(dbgs(), &Writer); DEBUG_PRINT_STAT(NumConstantArgs); DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); DEBUG_PRINT_STAT(NumAllocaArgs); @@ -2058,14 +2228,27 @@ LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { DEBUG_PRINT_STAT(Threshold); #undef DEBUG_PRINT_STAT } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// Dump stats about this call's analysis. +LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { + print(); +} #endif /// Test that there are no attribute conflicts between Caller and Callee /// that prevent inlining. -static bool functionsHaveCompatibleAttributes(Function *Caller, - Function *Callee, - TargetTransformInfo &TTI) { +static bool functionsHaveCompatibleAttributes( + Function *Caller, Function *Callee, TargetTransformInfo &TTI, + function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) { + // Note that CalleeTLI must be a copy not a reference. The legacy pass manager + // caches the most recently created TLI in the TargetLibraryInfoWrapperPass + // object, and always returns the same object (which is overwritten on each + // GetTLI call). Therefore we copy the first result. + auto CalleeTLI = GetTLI(*Callee); return TTI.areInlineCompatible(Caller, Callee) && + GetTLI(*Caller).areInlineCompatible(CalleeTLI, + InlineCallerSupersetNoBuiltin) && AttributeFuncs::areInlineCompatible(*Caller, *Callee); } @@ -2104,23 +2287,46 @@ int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) { InlineCost llvm::getInlineCost( CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, - std::function<AssumptionCache &(Function &)> &GetAssumptionCache, - Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, + function_ref<AssumptionCache &(Function &)> GetAssumptionCache, + function_ref<const TargetLibraryInfo &(Function &)> GetTLI, + function_ref<BlockFrequencyInfo &(Function &)> GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI, - GetAssumptionCache, GetBFI, PSI, ORE); + GetAssumptionCache, GetTLI, GetBFI, PSI, ORE); } -InlineCost llvm::getInlineCost( - CallBase &Call, Function *Callee, const InlineParams &Params, - TargetTransformInfo &CalleeTTI, - std::function<AssumptionCache &(Function &)> &GetAssumptionCache, - Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, +Optional<int> llvm::getInliningCostEstimate( + CallBase &Call, TargetTransformInfo &CalleeTTI, + function_ref<AssumptionCache &(Function &)> GetAssumptionCache, + function_ref<BlockFrequencyInfo &(Function &)> GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { + const InlineParams Params = {/* DefaultThreshold*/ 0, + /*HintThreshold*/ {}, + /*ColdThreshold*/ {}, + /*OptSizeThreshold*/ {}, + /*OptMinSizeThreshold*/ {}, + /*HotCallSiteThreshold*/ {}, + /*LocallyHotCallSiteThreshold*/ {}, + /*ColdCallSiteThreshold*/ {}, + /*ComputeFullInlineCost*/ true, + /*EnableDeferral*/ true}; + + InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, + GetAssumptionCache, GetBFI, PSI, ORE, true, + /*IgnoreThreshold*/ true); + auto R = CA.analyze(); + if (!R.isSuccess()) + return None; + return CA.getCost(); +} + +Optional<InlineResult> llvm::getAttributeBasedInliningDecision( + CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, + function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { // Cannot inline indirect calls. if (!Callee) - return llvm::InlineCost::getNever("indirect call"); + return InlineResult::failure("indirect call"); // Never inline calls with byval arguments that does not have the alloca // address space. Since byval arguments can be replaced with a copy to an @@ -2132,59 +2338,80 @@ InlineCost llvm::getInlineCost( if (Call.isByValArgument(I)) { PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType()); if (PTy->getAddressSpace() != AllocaAS) - return llvm::InlineCost::getNever("byval arguments without alloca" - " address space"); + return InlineResult::failure("byval arguments without alloca" + " address space"); } // Calls to functions with always-inline attributes should be inlined // whenever possible. if (Call.hasFnAttr(Attribute::AlwaysInline)) { auto IsViable = isInlineViable(*Callee); - if (IsViable) - return llvm::InlineCost::getAlways("always inline attribute"); - return llvm::InlineCost::getNever(IsViable.message); + if (IsViable.isSuccess()) + return InlineResult::success(); + return InlineResult::failure(IsViable.getFailureReason()); } // Never inline functions with conflicting attributes (unless callee has // always-inline attribute). Function *Caller = Call.getCaller(); - if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI)) - return llvm::InlineCost::getNever("conflicting attributes"); + if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI)) + return InlineResult::failure("conflicting attributes"); // Don't inline this call if the caller has the optnone attribute. if (Caller->hasOptNone()) - return llvm::InlineCost::getNever("optnone attribute"); + return InlineResult::failure("optnone attribute"); // Don't inline a function that treats null pointer as valid into a caller // that does not have this attribute. if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) - return llvm::InlineCost::getNever("nullptr definitions incompatible"); + return InlineResult::failure("nullptr definitions incompatible"); // Don't inline functions which can be interposed at link-time. if (Callee->isInterposable()) - return llvm::InlineCost::getNever("interposable"); + return InlineResult::failure("interposable"); // Don't inline functions marked noinline. if (Callee->hasFnAttribute(Attribute::NoInline)) - return llvm::InlineCost::getNever("noinline function attribute"); + return InlineResult::failure("noinline function attribute"); // Don't inline call sites marked noinline. if (Call.isNoInline()) - return llvm::InlineCost::getNever("noinline call site attribute"); + return InlineResult::failure("noinline call site attribute"); + + return None; +} + +InlineCost llvm::getInlineCost( + CallBase &Call, Function *Callee, const InlineParams &Params, + TargetTransformInfo &CalleeTTI, + function_ref<AssumptionCache &(Function &)> GetAssumptionCache, + function_ref<const TargetLibraryInfo &(Function &)> GetTLI, + function_ref<BlockFrequencyInfo &(Function &)> GetBFI, + ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { + + auto UserDecision = + llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); + + if (UserDecision.hasValue()) { + if (UserDecision->isSuccess()) + return llvm::InlineCost::getAlways("always inline attribute"); + return llvm::InlineCost::getNever(UserDecision->getFailureReason()); + } LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() - << "... (caller:" << Caller->getName() << ")\n"); + << "... (caller:" << Call.getCaller()->getName() + << ")\n"); - InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, - *Callee, Call, Params); + InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI, + GetAssumptionCache, GetBFI, PSI, ORE); InlineResult ShouldInline = CA.analyze(); LLVM_DEBUG(CA.dump()); // Check if there was a reason to force inlining or no inlining. - if (!ShouldInline && CA.getCost() < CA.getThreshold()) - return InlineCost::getNever(ShouldInline.message); - if (ShouldInline && CA.getCost() >= CA.getThreshold()) + if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold()) + return InlineCost::getNever(ShouldInline.getFailureReason()); + if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold()) return InlineCost::getAlways("empty function"); return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); @@ -2195,14 +2422,14 @@ InlineResult llvm::isInlineViable(Function &F) { for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Disallow inlining of functions which contain indirect branches. if (isa<IndirectBrInst>(BI->getTerminator())) - return "contains indirect branches"; + return InlineResult::failure("contains indirect branches"); // Disallow inlining of blockaddresses which are used by non-callbr // instructions. if (BI->hasAddressTaken()) for (User *U : BlockAddress::get(&*BI)->users()) if (!isa<CallBrInst>(*U)) - return "blockaddress used outside of callbr"; + return InlineResult::failure("blockaddress used outside of callbr"); for (auto &II : *BI) { CallBase *Call = dyn_cast<CallBase>(&II); @@ -2211,13 +2438,13 @@ InlineResult llvm::isInlineViable(Function &F) { // Disallow recursive calls. if (&F == Call->getCalledFunction()) - return "recursive call"; + return InlineResult::failure("recursive call"); // Disallow calls which expose returns-twice to a function not previously // attributed as such. if (!ReturnsTwice && isa<CallInst>(Call) && cast<CallInst>(Call)->canReturnTwice()) - return "exposes returns-twice attribute"; + return InlineResult::failure("exposes returns-twice attribute"); if (Call->getCalledFunction()) switch (Call->getCalledFunction()->getIntrinsicID()) { @@ -2226,20 +2453,23 @@ InlineResult llvm::isInlineViable(Function &F) { case llvm::Intrinsic::icall_branch_funnel: // Disallow inlining of @llvm.icall.branch.funnel because current // backend can't separate call targets from call arguments. - return "disallowed inlining of @llvm.icall.branch.funnel"; + return InlineResult::failure( + "disallowed inlining of @llvm.icall.branch.funnel"); case llvm::Intrinsic::localescape: // Disallow inlining functions that call @llvm.localescape. Doing this // correctly would require major changes to the inliner. - return "disallowed inlining of @llvm.localescape"; + return InlineResult::failure( + "disallowed inlining of @llvm.localescape"); case llvm::Intrinsic::vastart: // Disallow inlining of functions that initialize VarArgs with // va_start. - return "contains VarArgs initialized with va_start"; + return InlineResult::failure( + "contains VarArgs initialized with va_start"); } } } - return true; + return InlineResult::success(); } // APIs to create InlineParams based on command line flags and/or other @@ -2299,7 +2529,7 @@ InlineParams llvm::getInlineParams(int Threshold) { } InlineParams llvm::getInlineParams() { - return getInlineParams(InlineThreshold); + return getInlineParams(DefaultThreshold); } // Compute the default threshold for inlining based on the opt level and the @@ -2312,7 +2542,7 @@ static int computeThresholdFromOptLevels(unsigned OptLevel, return InlineConstants::OptSizeThreshold; if (SizeOptLevel == 2) // -Oz return InlineConstants::OptMinSizeThreshold; - return InlineThreshold; + return DefaultThreshold; } InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { @@ -2325,3 +2555,40 @@ InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; return Params; } + +PreservedAnalyses +InlineCostAnnotationPrinterPass::run(Function &F, + FunctionAnalysisManager &FAM) { + PrintInstructionComments = true; + std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&]( + Function &F) -> AssumptionCache & { + return FAM.getResult<AssumptionAnalysis>(F); + }; + Module *M = F.getParent(); + ProfileSummaryInfo PSI(*M); + DataLayout DL(M); + TargetTransformInfo TTI(DL); + // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. + // In the current implementation, the type of InlineParams doesn't matter as + // the pass serves only for verification of inliner's decisions. + // We can add a flag which determines InlineParams for this run. Right now, + // the default InlineParams are used. + const InlineParams Params = llvm::getInlineParams(); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + Function *CalledFunction = CI->getCalledFunction(); + if (!CalledFunction || CalledFunction->isDeclaration()) + continue; + OptimizationRemarkEmitter ORE(CalledFunction); + InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI, + GetAssumptionCache, nullptr, &PSI, &ORE); + ICCA.analyze(); + OS << " Analyzing call of " << CalledFunction->getName() + << "... (caller:" << CI->getCaller()->getName() << ")\n"; + ICCA.print(); + } + } + } + return PreservedAnalyses::all(); +} |