diff options
Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopInterchange.cpp')
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 298 |
1 files changed, 150 insertions, 148 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 18daa4295224..0a7c62113c7f 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -44,6 +44,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> @@ -86,7 +87,8 @@ static void printDepMatrix(CharMatrix &DepMatrix) { #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, - Loop *L, DependenceInfo *DI) { + Loop *L, DependenceInfo *DI, + ScalarEvolution *SE) { using ValueVector = SmallVector<Value *, 16>; ValueVector MemInstr; @@ -125,6 +127,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, // Track Output, Flow, and Anti dependencies. if (auto D = DI->depends(Src, Dst, true)) { assert(D->isOrdered() && "Expected an output, flow or anti dep."); + // If the direction vector is negative, normalize it to + // make it non-negative. + if (D->normalize(SE)) + LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n"); LLVM_DEBUG(StringRef DepType = D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; dbgs() << "Found " << DepType @@ -133,19 +139,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, unsigned Levels = D->getLevels(); char Direction; for (unsigned II = 1; II <= Levels; ++II) { - const SCEV *Distance = D->getDistance(II); - const SCEVConstant *SCEVConst = - dyn_cast_or_null<SCEVConstant>(Distance); - if (SCEVConst) { - const ConstantInt *CI = SCEVConst->getValue(); - if (CI->isNegative()) - Direction = '<'; - else if (CI->isZero()) - Direction = '='; - else - Direction = '>'; - Dep.push_back(Direction); - } else if (D->isScalar(II)) { + if (D->isScalar(II)) { Direction = 'S'; Dep.push_back(Direction); } else { @@ -188,80 +182,36 @@ static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]); } -// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is -// '>' -static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, - unsigned Column) { - for (unsigned i = 0; i <= Column; ++i) { - if (DepMatrix[Row][i] == '<') - return false; - if (DepMatrix[Row][i] == '>') +// After interchanging, check if the direction vector is valid. +// [Theorem] A permutation of the loops in a perfect nest is legal if and only +// if the direction matrix, after the same permutation is applied to its +// columns, has no ">" direction as the leftmost non-"=" direction in any row. +static bool isLexicographicallyPositive(std::vector<char> &DV) { + for (unsigned Level = 0; Level < DV.size(); ++Level) { + unsigned char Direction = DV[Level]; + if (Direction == '<') return true; - } - // All dependencies were '=','S' or 'I' - return false; -} - -// Checks if no dependence exist in the dependency matrix in Row before Column. -static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, - unsigned Column) { - for (unsigned i = 0; i < Column; ++i) { - if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' && - DepMatrix[Row][i] != 'I') + if (Direction == '>' || Direction == '*') return false; } return true; } -static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, - unsigned OuterLoopId, char InnerDep, - char OuterDep) { - if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) - return false; - - if (InnerDep == OuterDep) - return true; - - // It is legal to interchange if and only if after interchange no row has a - // '>' direction as the leftmost non-'='. - - if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') - return true; - - if (InnerDep == '<') - return true; - - if (InnerDep == '>') { - // If OuterLoopId represents outermost loop then interchanging will make the - // 1st dependency as '>' - if (OuterLoopId == 0) - return false; - - // If all dependencies before OuterloopId are '=','S'or 'I'. Then - // interchanging will result in this row having an outermost non '=' - // dependency of '>' - if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) - return true; - } - - return false; -} - // Checks if it is legal to interchange 2 loops. -// [Theorem] A permutation of the loops in a perfect nest is legal if and only -// if the direction matrix, after the same permutation is applied to its -// columns, has no ">" direction as the leftmost non-"=" direction in any row. static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId) { unsigned NumRows = DepMatrix.size(); + std::vector<char> Cur; // For each row check if it is valid to interchange. for (unsigned Row = 0; Row < NumRows; ++Row) { - char InnerDep = DepMatrix[Row][InnerLoopId]; - char OuterDep = DepMatrix[Row][OuterLoopId]; - if (InnerDep == '*' || OuterDep == '*') + // Create temporary DepVector check its lexicographical order + // before and after swapping OuterLoop vs InnerLoop + Cur = DepMatrix[Row]; + if (!isLexicographicallyPositive(Cur)) return false; - if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)) + std::swap(Cur[InnerLoopId], Cur[OuterLoopId]); + if (!isLexicographicallyPositive(Cur)) return false; } return true; @@ -361,11 +311,18 @@ public: bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, - const DenseMap<const Loop *, unsigned> &CostMap); + const DenseMap<const Loop *, unsigned> &CostMap, + std::unique_ptr<CacheCost> &CC); private: int getInstrOrderCost(); - + std::optional<bool> isProfitablePerLoopCacheAnalysis( + const DenseMap<const Loop *, unsigned> &CostMap, + std::unique_ptr<CacheCost> &CC); + std::optional<bool> isProfitablePerInstrOrderCost(); + std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix); Loop *OuterLoop; Loop *InnerLoop; @@ -486,7 +443,7 @@ struct LoopInterchange { CharMatrix DependencyMatrix; Loop *OuterMostLoop = *(LoopList.begin()); if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, - OuterMostLoop, DI)) { + OuterMostLoop, DI, SE)) { LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; } @@ -562,7 +519,7 @@ struct LoopInterchange { LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, - DependencyMatrix, CostMap)) { + DependencyMatrix, CostMap, CC)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -579,11 +536,7 @@ struct LoopInterchange { LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); LoopsInterchanged++; - assert(InnerLoop->isLCSSAForm(*DT) && - "Inner loop not left in LCSSA form after loop interchange!"); - assert(OuterLoop->isLCSSAForm(*DT) && - "Outer loop not left in LCSSA form after loop interchange!"); - + llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE); return true; } }; @@ -858,18 +811,26 @@ bool LoopInterchangeLegality::currentLimitations() { } Inductions.clear(); - if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) { - LLVM_DEBUG( - dbgs() << "Only inner loops with induction or reduction PHI nodes " - << "are supported currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with induction or reduction PHI nodes can be" - " interchange currently."; - }); - return true; + // For multi-level loop nests, make sure that all phi nodes for inner loops + // at all levels can be recognized as a induction or reduction phi. Bail out + // if a phi node at a certain nesting level cannot be properly recognized. + Loop *CurLevelLoop = OuterLoop; + while (!CurLevelLoop->getSubLoops().empty()) { + // We already made sure that the loop nest is tightly nested. + CurLevelLoop = CurLevelLoop->getSubLoops().front(); + if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) { + LLVM_DEBUG( + dbgs() << "Only inner loops with induction or reduction PHI nodes " + << "are supported currently.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", + CurLevelLoop->getStartLoc(), + CurLevelLoop->getHeader()) + << "Only inner loops with induction or reduction PHI nodes can be" + " interchange currently."; + }); + return true; + } } // TODO: Triangular loops are not handled for now. @@ -1137,31 +1098,10 @@ int LoopInterchangeProfitability::getInstrOrderCost() { return GoodOrder - BadOrder; } -static bool isProfitableForVectorization(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Improve this heuristic to catch more cases. - // If the inner loop is loop independent or doesn't carry any dependency it is - // profitable to move this to outer position. - for (auto &Row : DepMatrix) { - if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') - return false; - // TODO: We need to improve this heuristic. - if (Row[OuterLoopId] != '=') - return false; - } - // If outer loop has dependence and inner loop is loop independent then it is - // profitable to interchange to enable parallelism. - // If there are no dependences, interchanging will not improve anything. - return !DepMatrix.empty(); -} - -bool LoopInterchangeProfitability::isProfitable( - const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, - unsigned OuterLoopId, CharMatrix &DepMatrix, - const DenseMap<const Loop *, unsigned> &CostMap) { - // TODO: Remove the legacy cost model. - +std::optional<bool> +LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis( + const DenseMap<const Loop *, unsigned> &CostMap, + std::unique_ptr<CacheCost> &CC) { // This is the new cost model returned from loop cache analysis. // A smaller index means the loop should be placed an outer loop, and vice // versa. @@ -1173,30 +1113,91 @@ bool LoopInterchangeProfitability::isProfitable( LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex << ", OuterIndex = " << OuterIndex << "\n"); if (InnerIndex < OuterIndex) - return true; - } else { - // Legacy cost model: this is rough cost estimation algorithm. It counts the - // good and bad order of induction variables in the instruction and allows - // reordering if number of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; + return std::optional<bool>(true); + assert(InnerIndex != OuterIndex && "CostMap should assign unique " + "numbers to each loop"); + if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop)) + return std::nullopt; + return std::optional<bool>(false); } + return std::nullopt; +} - // It is not profitable as per current cache profitability model. But check if - // we can move this loop outside to improve parallelism. - if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) - return true; +std::optional<bool> +LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { + // Legacy cost model: this is rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. + int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < 0 && Cost < LoopInterchangeCostThreshold) + return std::optional<bool>(true); + + return std::nullopt; +} - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Interchanging loops is too costly and it does not improve " - "parallelism."; - }); - return false; +std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( + unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { + for (auto &Row : DepMatrix) { + // If the inner loop is loop independent or doesn't carry any dependency + // it is not profitable to move this to outer position, since we are + // likely able to do inner loop vectorization already. + if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=') + return std::optional<bool>(false); + + // If the outer loop is not loop independent it is not profitable to move + // this to inner position, since doing so would not enable inner loop + // parallelism. + if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=') + return std::optional<bool>(false); + } + // If inner loop has dependence and outer loop is loop independent then it + // is/ profitable to interchange to enable inner loop parallelism. + // If there are no dependences, interchanging will not improve anything. + return std::optional<bool>(!DepMatrix.empty()); +} + +bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap<const Loop *, unsigned> &CostMap, + std::unique_ptr<CacheCost> &CC) { + // isProfitable() is structured to avoid endless loop interchange. + // If loop cache analysis could decide the profitability then, + // profitability check will stop and return the analysis result. + // If cache analysis failed to analyze the loopnest (e.g., + // due to delinearization issues) then only check whether it is + // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to + // analysis the profitability then only, isProfitableForVectorization + // will decide. + std::optional<bool> shouldInterchange = + isProfitablePerLoopCacheAnalysis(CostMap, CC); + if (!shouldInterchange.has_value()) { + shouldInterchange = isProfitablePerInstrOrderCost(); + if (!shouldInterchange.has_value()) + shouldInterchange = + isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); + } + if (!shouldInterchange.has_value()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Insufficient information to calculate the cost of loop for " + "interchange."; + }); + return false; + } else if (!shouldInterchange.value()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Interchanging loops is not considered to improve cache " + "locality nor vectorization."; + }); + return false; + } + return true; } void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, @@ -1286,7 +1287,6 @@ void LoopInterchangeTransform::restructureLoops( // Tell SE that we move the loops around. SE->forgetLoop(NewOuter); - SE->forgetLoop(NewInner); } bool LoopInterchangeTransform::transform() { @@ -1360,9 +1360,11 @@ bool LoopInterchangeTransform::transform() { for (Instruction *InnerIndexVar : InnerIndexVarList) WorkList.insert(cast<Instruction>(InnerIndexVar)); MoveInstructions(); + } - // Splits the inner loops phi nodes out into a separate basic block. - BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + // Ensure the inner loop phi nodes have a separate basic block. + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) { SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); } @@ -1394,11 +1396,10 @@ bool LoopInterchangeTransform::transform() { /// \brief Move all instructions except the terminator from FromBB right before /// InsertBefore static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { - auto &ToList = InsertBefore->getParent()->getInstList(); - auto &FromList = FromBB->getInstList(); + BasicBlock *ToBB = InsertBefore->getParent(); - ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), - FromBB->getTerminator()->getIterator()); + ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(), + FromBB->getTerminator()->getIterator()); } /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact. @@ -1773,5 +1774,6 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, OptimizationRemarkEmitter ORE(&F); if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) return PreservedAnalyses::all(); + U.markLoopNestChanged(true); return getLoopPassPreservedAnalyses(); } |