Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Scalar')
56 files changed, 3161 insertions, 2409 deletions
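The first two hunks below (ADCE and BDCE) stop unlinking dead instructions while the function is still being walked and instead defer dropAllReferences() to a separate pass over the collected worklist, erasing only afterwards. A minimal standalone sketch of that two-phase deletion pattern, using an invented Node type rather than LLVM's Instruction:

#include <algorithm>
#include <memory>
#include <vector>

// Toy IR node: raw pointers to operands, plus a liveness flag.
struct Node {
  std::vector<Node *> Operands;
  bool Dead = false;
};

void removeDead(std::vector<std::unique_ptr<Node>> &Nodes) {
  // Phase 1: collect the dead nodes.
  std::vector<Node *> Worklist;
  for (auto &N : Nodes)
    if (N->Dead)
      Worklist.push_back(N.get());

  // Phase 2: unlink every dead node (the dropAllReferences() analogue) before
  // any deletion, so no surviving pointer ever refers to freed memory, even
  // when dead nodes reference each other.
  for (Node *N : Worklist)
    N->Operands.clear();

  // Phase 3: actually delete.
  Nodes.erase(std::remove_if(Nodes.begin(), Nodes.end(),
                             [](const std::unique_ptr<Node> &N) {
                               return N->Dead;
                             }),
              Nodes.end());
}

Iterating the function in reverse, as the ADCE hunk now does, tends to visit users before their operands, which helps salvageDebugInfo see operands that have not yet been dropped.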
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp index 6f3fdb88eda5..b693acceb3f6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -538,7 +538,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() { // that have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. // NOTE: We reuse the Worklist vector here for memory efficiency. - for (Instruction &I : instructions(F)) { + for (Instruction &I : llvm::reverse(instructions(F))) { // Check if the instruction is alive. if (isLive(&I)) continue; @@ -554,9 +554,11 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() { // Prepare to delete. Worklist.push_back(&I); salvageDebugInfo(I); - I.dropAllReferences(); } + for (Instruction *&I : Worklist) + I->dropAllReferences(); + for (Instruction *&I : Worklist) { ++NumRemoved; I->eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp index c06125788f37..6c2467db79f7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -53,7 +53,7 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // in the def-use chain needs to be changed. auto *J = dyn_cast<Instruction>(JU); if (J && J->getType()->isIntOrIntVectorTy() && - !DB.getDemandedBits(J).isAllOnesValue()) { + !DB.getDemandedBits(J).isAllOnes()) { Visited.insert(J); WorkList.push_back(J); } @@ -84,7 +84,7 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // that in the def-use chain needs to be changed. auto *K = dyn_cast<Instruction>(KU); if (K && Visited.insert(K).second && K->getType()->isIntOrIntVectorTy() && - !DB.getDemandedBits(K).isAllOnesValue()) + !DB.getDemandedBits(K).isAllOnes()) WorkList.push_back(K); } } @@ -103,12 +103,9 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { // Remove instructions that are dead, either because they were not reached // during analysis or have no demanded bits. if (DB.isInstructionDead(&I) || - (I.getType()->isIntOrIntVectorTy() && - DB.getDemandedBits(&I).isNullValue() && + (I.getType()->isIntOrIntVectorTy() && DB.getDemandedBits(&I).isZero() && wouldInstructionBeTriviallyDead(&I))) { - salvageDebugInfo(I); Worklist.push_back(&I); - I.dropAllReferences(); Changed = true; continue; } @@ -155,6 +152,11 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { } } + for (Instruction *&I : llvm::reverse(Worklist)) { + salvageDebugInfo(*I); + I->dropAllReferences(); + } + for (Instruction *&I : Worklist) { ++NumRemoved; I->eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index 2eb94b721d96..95de59fa8262 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -467,7 +467,7 @@ static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallBase &CB, BasicBlock *StopAt = CSDTNode ? 
CSDTNode->getIDom()->getBlock() : nullptr; SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS; - for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) { + for (auto *Pred : llvm::reverse(Preds)) { ConditionsTy Conditions; // Record condition on edge BB(CS) <- Pred recordCondition(CB, Pred, CB.getParent(), Conditions); @@ -505,8 +505,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI, DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; - for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) { - BasicBlock &BB = *BI++; + for (BasicBlock &BB : llvm::make_early_inc_range(F)) { auto II = BB.getFirstNonPHIOrDbg()->getIterator(); auto IE = BB.getTerminator()->getIterator(); // Iterate until we reach the terminator instruction. tryToSplitCallSite diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 535f50d4f904..27f54f8026e1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -762,7 +762,7 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base, PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx, cast<PointerType>(Ty)->getAddressSpace()); Base = new BitCastInst(Base, Int8PtrTy, "base_bitcast", InsertionPt); - Mat = GetElementPtrInst::Create(Int8PtrTy->getElementType(), Base, + Mat = GetElementPtrInst::Create(Type::getInt8Ty(*Ctx), Base, Offset, "mat_gep", InsertionPt); Mat = new BitCastInst(Mat, Ty, "mat_bitcast", InsertionPt); } else @@ -819,10 +819,9 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base, // Aside from constant GEPs, only constant cast expressions are collected. assert(ConstExpr->isCast() && "ConstExpr should be a cast"); - Instruction *ConstExprInst = ConstExpr->getAsInstruction(); + Instruction *ConstExprInst = ConstExpr->getAsInstruction( + findMatInsertPt(ConstUser.Inst, ConstUser.OpndIdx)); ConstExprInst->setOperand(0, Mat); - ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst, - ConstUser.OpndIdx)); // Use the same debug location as the instruction we are about to update. ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index efd1c025d0cd..7f2d5d7d9987 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstraintSystem.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -268,6 +269,31 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { continue; WorkList.emplace_back(DT.getNode(&BB)); + // True as long as long as the current instruction is guaranteed to execute. + bool GuaranteedToExecute = true; + // Scan BB for assume calls. + // TODO: also use this scan to queue conditions to simplify, so we can + // interleave facts from assumes and conditions to simplify in a single + // basic block. And to skip another traversal of each basic block when + // simplifying. 
+ for (Instruction &I : BB) { + Value *Cond; + // For now, just handle assumes with a single compare as condition. + if (match(&I, m_Intrinsic<Intrinsic::assume>(m_Value(Cond))) && + isa<CmpInst>(Cond)) { + if (GuaranteedToExecute) { + // The assume is guaranteed to execute when BB is entered, hence Cond + // holds on entry to BB. + WorkList.emplace_back(DT.getNode(&BB), cast<CmpInst>(Cond), false); + } else { + // Otherwise the condition only holds in the successors. + for (BasicBlock *Succ : successors(&BB)) + WorkList.emplace_back(DT.getNode(Succ), cast<CmpInst>(Cond), false); + } + } + GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I); + } + auto *Br = dyn_cast<BranchInst>(BB.getTerminator()); if (!Br || !Br->isConditional()) continue; @@ -395,8 +421,13 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { for (auto &E : reverse(DFSInStack)) dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n"; }); - Cmp->replaceAllUsesWith( - ConstantInt::getTrue(F.getParent()->getContext())); + Cmp->replaceUsesWithIf( + ConstantInt::getTrue(F.getParent()->getContext()), [](Use &U) { + // Conditions in an assume trivially simplify to true. Skip uses + // in assume calls to not destroy the available information. + auto *II = dyn_cast<IntrinsicInst>(U.getUser()); + return !II || II->getIntrinsicID() != Intrinsic::assume; + }); NumCondsRemoved++; Changed = true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 36cbd42a5fdd..ca9567dc7ac8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -67,6 +67,7 @@ STATISTIC(NumUDivURemsNarrowed, STATISTIC(NumAShrs, "Number of ashr converted to lshr"); STATISTIC(NumSRems, "Number of srem converted to urem"); STATISTIC(NumSExt, "Number of sext converted to zext"); +STATISTIC(NumSICmps, "Number of signed icmp preds simplified to unsigned"); STATISTIC(NumAnd, "Number of ands removed"); STATISTIC(NumNW, "Number of no-wrap deductions"); STATISTIC(NumNSW, "Number of no-signed-wrap deductions"); @@ -295,11 +296,34 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) { return true; } +static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { + // Only for signed relational comparisons of scalar integers. + if (Cmp->getType()->isVectorTy() || + !Cmp->getOperand(0)->getType()->isIntegerTy()) + return false; + + if (!Cmp->isSigned()) + return false; + + ICmpInst::Predicate UnsignedPred = + ConstantRange::getEquivalentPredWithFlippedSignedness( + Cmp->getPredicate(), LVI->getConstantRange(Cmp->getOperand(0), Cmp), + LVI->getConstantRange(Cmp->getOperand(1), Cmp)); + + if (UnsignedPred == ICmpInst::Predicate::BAD_ICMP_PREDICATE) + return false; + + ++NumSICmps; + Cmp->setPredicate(UnsignedPred); + + return true; +} + /// See if LazyValueInfo's ability to exploit edge conditions or range /// information is sufficient to prove this comparison. Even for local /// conditions, this can sometimes prove conditions instcombine can't by /// exploiting range information. 
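The processICmp addition above leans on ConstantRange::getEquivalentPredWithFlippedSignedness to turn a signed comparison into an unsigned one. The basic idea, sketched here without LLVM's ConstantRange (the Range struct and function names are invented): when both operands are known to lie entirely on the same side of zero, signed and unsigned order agree, so the predicate can be flipped.

#include <cstdint>
#include <optional>

// Inclusive signed range [Lo, Hi] assumed known for each operand.
struct Range { int64_t Lo, Hi; };

enum class Pred { SLT, SLE, SGT, SGE, ULT, ULE, UGT, UGE };

// Returns the equivalent unsigned predicate if one exists, std::nullopt
// otherwise. Two values sharing a sign compare identically as signed and as
// unsigned integers in two's complement.
std::optional<Pred> flipToUnsigned(Pred P, Range A, Range B) {
  bool BothNonNeg = A.Lo >= 0 && B.Lo >= 0;
  bool BothNeg = A.Hi < 0 && B.Hi < 0;
  if (!BothNonNeg && !BothNeg)
    return std::nullopt;
  switch (P) {
  case Pred::SLT: return Pred::ULT;
  case Pred::SLE: return Pred::ULE;
  case Pred::SGT: return Pred::UGT;
  case Pred::SGE: return Pred::UGE;
  default:        return std::nullopt; // already unsigned, nothing to do
  }
}

The real helper is more general than this same-sign check, but the sketch captures why dropping the sign from the predicate is safe when the ranges permit it.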
-static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) { +static bool constantFoldCmp(CmpInst *Cmp, LazyValueInfo *LVI) { Value *Op0 = Cmp->getOperand(0); auto *C = dyn_cast<Constant>(Cmp->getOperand(1)); if (!C) @@ -318,6 +342,17 @@ static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) { return true; } +static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) { + if (constantFoldCmp(Cmp, LVI)) + return true; + + if (auto *ICmp = dyn_cast<ICmpInst>(Cmp)) + if (processICmp(ICmp, LVI)) + return true; + + return false; +} + /// Simplify a switch instruction by removing cases which can never fire. If the /// uselessness of a case could be determined locally then constant propagation /// would already have figured it out. Instead, walk the predecessors and @@ -341,7 +376,13 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, // ConstantFoldTerminator() as the underlying SwitchInst can be changed. SwitchInstProfUpdateWrapper SI(*I); - for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { + APInt Low = + APInt::getSignedMaxValue(Cond->getType()->getScalarSizeInBits()); + APInt High = + APInt::getSignedMinValue(Cond->getType()->getScalarSizeInBits()); + + SwitchInst::CaseIt CI = SI->case_begin(); + for (auto CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); LazyValueInfo::Tristate State = LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, @@ -374,9 +415,28 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, break; } + // Get Lower/Upper bound from switch cases. + Low = APIntOps::smin(Case->getValue(), Low); + High = APIntOps::smax(Case->getValue(), High); + // Increment the case iterator since we didn't delete it. ++CI; } + + // Try to simplify default case as unreachable + if (CI == SI->case_end() && SI->getNumCases() != 0 && + !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg())) { + const ConstantRange SIRange = + LVI->getConstantRange(SI->getCondition(), SI); + + // If the numbered switch cases cover the entire range of the condition, + // then the default case is not reachable. + if (SIRange.getSignedMin() == Low && SIRange.getSignedMax() == High && + SI->getNumCases() == High - Low + 1) { + createUnreachableSwitchDefault(SI, &DTU); + Changed = true; + } + } } if (Changed) @@ -690,7 +750,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) { // sdiv/srem is UB if divisor is -1 and divident is INT_MIN, so unless we can // prove that such a combination is impossible, we need to bump the bitwidth. - if (CRs[1]->contains(APInt::getAllOnesValue(OrigWidth)) && + if (CRs[1]->contains(APInt::getAllOnes(OrigWidth)) && CRs[0]->contains( APInt::getSignedMinValue(MinSignedBits).sextOrSelf(OrigWidth))) ++MinSignedBits; @@ -1023,49 +1083,48 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT, // blocks. 
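The processSwitch change above records the smallest and largest case value while walking the cases and then, if the condition's known range is exactly the contiguous run [Low, High], marks the default destination unreachable. The coverage test reduces to plain integer arithmetic; a sketch with invented names (createUnreachableSwitchDefault and LVI remain the pass's machinery):

#include <algorithm>
#include <cstdint>
#include <vector>

// Given the distinct signed case values of a switch and the known signed
// range [RangeMin, RangeMax] of its condition, decide whether the default
// can never be taken. Overflow at the extremes of int64_t is ignored here
// for brevity.
bool defaultIsDead(const std::vector<int64_t> &Cases, int64_t RangeMin,
                   int64_t RangeMax) {
  if (Cases.empty())
    return false;
  int64_t Low = Cases.front(), High = Cases.front();
  for (int64_t C : Cases) {
    Low = std::min(Low, C);
    High = std::max(High, C);
  }
  // Case values are distinct, so covering the contiguous range [Low, High]
  // requires exactly High - Low + 1 of them; if that run also equals the
  // condition's full range, no value can reach the default.
  return RangeMin == Low && RangeMax == High &&
         (int64_t)Cases.size() == High - Low + 1;
}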
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { bool BBChanged = false; - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *II = &*BI++; - switch (II->getOpcode()) { + for (Instruction &II : llvm::make_early_inc_range(*BB)) { + switch (II.getOpcode()) { case Instruction::Select: - BBChanged |= processSelect(cast<SelectInst>(II), LVI); + BBChanged |= processSelect(cast<SelectInst>(&II), LVI); break; case Instruction::PHI: - BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ); + BBChanged |= processPHI(cast<PHINode>(&II), LVI, DT, SQ); break; case Instruction::ICmp: case Instruction::FCmp: - BBChanged |= processCmp(cast<CmpInst>(II), LVI); + BBChanged |= processCmp(cast<CmpInst>(&II), LVI); break; case Instruction::Load: case Instruction::Store: - BBChanged |= processMemAccess(II, LVI); + BBChanged |= processMemAccess(&II, LVI); break; case Instruction::Call: case Instruction::Invoke: - BBChanged |= processCallSite(cast<CallBase>(*II), LVI); + BBChanged |= processCallSite(cast<CallBase>(II), LVI); break; case Instruction::SRem: case Instruction::SDiv: - BBChanged |= processSDivOrSRem(cast<BinaryOperator>(II), LVI); + BBChanged |= processSDivOrSRem(cast<BinaryOperator>(&II), LVI); break; case Instruction::UDiv: case Instruction::URem: - BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI); + BBChanged |= processUDivOrURem(cast<BinaryOperator>(&II), LVI); break; case Instruction::AShr: - BBChanged |= processAShr(cast<BinaryOperator>(II), LVI); + BBChanged |= processAShr(cast<BinaryOperator>(&II), LVI); break; case Instruction::SExt: - BBChanged |= processSExt(cast<SExtInst>(II), LVI); + BBChanged |= processSExt(cast<SExtInst>(&II), LVI); break; case Instruction::Add: case Instruction::Sub: case Instruction::Mul: case Instruction::Shl: - BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI); + BBChanged |= processBinOp(cast<BinaryOperator>(&II), LVI); break; case Instruction::And: - BBChanged |= processAnd(cast<BinaryOperator>(II), LVI); + BBChanged |= processAnd(cast<BinaryOperator>(&II), LVI); break; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 90679bcac4b7..8c4523206070 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -1,9 +1,8 @@ //===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -84,8 +83,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <deque> -#include <unordered_map> -#include <unordered_set> using namespace llvm; @@ -147,8 +144,7 @@ private: Stack.push_back(SIToUnfold); while (!Stack.empty()) { - SelectInstToUnfold SIToUnfold = Stack.back(); - Stack.pop_back(); + SelectInstToUnfold SIToUnfold = Stack.pop_back_val(); std::vector<SelectInstToUnfold> NewSIsToUnfold; std::vector<BasicBlock *> NewBBs; @@ -174,6 +170,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } @@ -350,7 +347,7 @@ struct ClonedBlock { typedef std::deque<BasicBlock *> PathType; typedef std::vector<PathType> PathsType; -typedef std::set<const BasicBlock *> VisitedBlocks; +typedef SmallPtrSet<const BasicBlock *, 8> VisitedBlocks; typedef std::vector<ClonedBlock> CloneList; // This data structure keeps track of all blocks that have been cloned. If two @@ -493,7 +490,7 @@ private: } bool isPredictableValue(Value *InpVal, SmallSet<Value *, 16> &SeenValues) { - if (SeenValues.find(InpVal) != SeenValues.end()) + if (SeenValues.contains(InpVal)) return true; if (isa<ConstantInt>(InpVal)) @@ -508,7 +505,7 @@ private: void addInstToQueue(Value *Val, std::deque<Instruction *> &Q, SmallSet<Value *, 16> &SeenValues) { - if (SeenValues.find(Val) != SeenValues.end()) + if (SeenValues.contains(Val)) return; if (Instruction *I = dyn_cast<Instruction>(Val)) Q.push_back(I); @@ -533,7 +530,7 @@ private: return false; if (isa<PHINode>(SIUse) && - SIBB->getSingleSuccessor() != dyn_cast<Instruction>(SIUse)->getParent()) + SIBB->getSingleSuccessor() != cast<Instruction>(SIUse)->getParent()) return false; // If select will not be sunk during unfolding, and it is in the same basic @@ -621,13 +618,9 @@ private: // Some blocks have multiple edges to the same successor, and this set // is used to prevent a duplicate path from being generated SmallSet<BasicBlock *, 4> Successors; - - for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) { - BasicBlock *Succ = *SI; - - if (Successors.find(Succ) != Successors.end()) + for (BasicBlock *Succ : successors(BB)) { + if (!Successors.insert(Succ).second) continue; - Successors.insert(Succ); // Found a cycle through the SwitchBlock if (Succ == SwitchBlock) { @@ -636,7 +629,7 @@ private: } // We have encountered a cycle, do not get caught in it - if (Visited.find(Succ) != Visited.end()) + if (Visited.contains(Succ)) continue; PathsType SuccPaths = paths(Succ, Visited, PathDepth + 1); @@ -668,15 +661,14 @@ private: SmallSet<Value *, 16> SeenValues; while (!Stack.empty()) { - PHINode *CurPhi = Stack.back(); - Stack.pop_back(); + PHINode *CurPhi = Stack.pop_back_val(); Res[CurPhi->getParent()] = CurPhi; SeenValues.insert(CurPhi); for (Value *Incoming : CurPhi->incoming_values()) { if (Incoming == FirstDef || isa<ConstantInt>(Incoming) || - SeenValues.find(Incoming) != SeenValues.end()) { + SeenValues.contains(Incoming)) { continue; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index d22b3f409585..a8ec8bb97970 100644 
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -13,10 +13,10 @@ // in between both MemoryDefs. A bit more concretely: // // For all MemoryDefs StartDef: -// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking +// 1. Get the next dominating clobbering MemoryDef (MaybeDeadAccess) by walking // upwards. -// 2. Check that there are no reads between EarlierAccess and the StartDef by -// checking all uses starting at EarlierAccess and walking until we see +// 2. Check that there are no reads between MaybeDeadAccess and the StartDef by +// checking all uses starting at MaybeDeadAccess and walking until we see // StartDef. // 3. For each found CurrentDef, check that: // 1. There are no barrier instructions between CurrentDef and StartDef (like @@ -56,6 +56,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" @@ -78,6 +79,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> @@ -122,7 +124,7 @@ EnablePartialStoreMerging("enable-dse-partial-store-merging", static cl::opt<unsigned> MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden, cl::desc("The number of memory instructions to scan for " - "dead store elimination (default = 100)")); + "dead store elimination (default = 150)")); static cl::opt<unsigned> MemorySSAUpwardsStepLimit( "dse-memoryssa-walklimit", cl::init(90), cl::Hidden, cl::desc("The maximum number of steps while walking upwards to find " @@ -203,39 +205,6 @@ static bool hasAnalyzableMemoryWrite(Instruction *I, return false; } -/// Return a Location stored to by the specified instruction. If isRemovable -/// returns true, this function and getLocForRead completely describe the memory -/// operations for this instruction. -static MemoryLocation getLocForWrite(Instruction *Inst, - const TargetLibraryInfo &TLI) { - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - return MemoryLocation::get(SI); - - // memcpy/memmove/memset. - if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst)) - return MemoryLocation::getForDest(MI); - - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - switch (II->getIntrinsicID()) { - default: - return MemoryLocation(); // Unhandled intrinsic. - case Intrinsic::init_trampoline: - return MemoryLocation::getAfter(II->getArgOperand(0)); - case Intrinsic::masked_store: - return MemoryLocation::getForArgument(II, 1, TLI); - case Intrinsic::lifetime_end: { - uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(); - return MemoryLocation(II->getArgOperand(1), Len); - } - } - } - if (auto *CB = dyn_cast<CallBase>(Inst)) - // All the supported TLI functions so far happen to have dest as their - // first argument. - return MemoryLocation::getAfter(CB->getArgOperand(0)); - return MemoryLocation(); -} - /// If the value of this instruction and the memory it writes to is unused, may /// we delete this instruction? static bool isRemovable(Instruction *I) { @@ -333,147 +302,146 @@ enum OverwriteResult { } // end anonymous namespace /// Check if two instruction are masked stores that completely -/// overwrite one another. 
More specifically, \p Later has to -/// overwrite \p Earlier. -static OverwriteResult isMaskedStoreOverwrite(const Instruction *Later, - const Instruction *Earlier, +/// overwrite one another. More specifically, \p KillingI has to +/// overwrite \p DeadI. +static OverwriteResult isMaskedStoreOverwrite(const Instruction *KillingI, + const Instruction *DeadI, BatchAAResults &AA) { - const auto *IIL = dyn_cast<IntrinsicInst>(Later); - const auto *IIE = dyn_cast<IntrinsicInst>(Earlier); - if (IIL == nullptr || IIE == nullptr) + const auto *KillingII = dyn_cast<IntrinsicInst>(KillingI); + const auto *DeadII = dyn_cast<IntrinsicInst>(DeadI); + if (KillingII == nullptr || DeadII == nullptr) return OW_Unknown; - if (IIL->getIntrinsicID() != Intrinsic::masked_store || - IIE->getIntrinsicID() != Intrinsic::masked_store) + if (KillingII->getIntrinsicID() != Intrinsic::masked_store || + DeadII->getIntrinsicID() != Intrinsic::masked_store) return OW_Unknown; // Pointers. - Value *LP = IIL->getArgOperand(1)->stripPointerCasts(); - Value *EP = IIE->getArgOperand(1)->stripPointerCasts(); - if (LP != EP && !AA.isMustAlias(LP, EP)) + Value *KillingPtr = KillingII->getArgOperand(1)->stripPointerCasts(); + Value *DeadPtr = DeadII->getArgOperand(1)->stripPointerCasts(); + if (KillingPtr != DeadPtr && !AA.isMustAlias(KillingPtr, DeadPtr)) return OW_Unknown; // Masks. - // TODO: check that Later's mask is a superset of the Earlier's mask. - if (IIL->getArgOperand(3) != IIE->getArgOperand(3)) + // TODO: check that KillingII's mask is a superset of the DeadII's mask. + if (KillingII->getArgOperand(3) != DeadII->getArgOperand(3)) return OW_Unknown; return OW_Complete; } -/// Return 'OW_Complete' if a store to the 'Later' location completely -/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the -/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the -/// beginning of the 'Earlier' location is overwritten by 'Later'. -/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was -/// overwritten by a latter (smaller) store which doesn't write outside the big +/// Return 'OW_Complete' if a store to the 'KillingLoc' location completely +/// overwrites a store to the 'DeadLoc' location, 'OW_End' if the end of the +/// 'DeadLoc' location is completely overwritten by 'KillingLoc', 'OW_Begin' +/// if the beginning of the 'DeadLoc' location is overwritten by 'KillingLoc'. +/// 'OW_PartialEarlierWithFullLater' means that a dead (big) store was +/// overwritten by a killing (smaller) store which doesn't write outside the big /// store's memory locations. Returns 'OW_Unknown' if nothing can be determined. -/// NOTE: This function must only be called if both \p Later and \p Earlier -/// write to the same underlying object with valid \p EarlierOff and \p -/// LaterOff. -static OverwriteResult isPartialOverwrite(const MemoryLocation &Later, - const MemoryLocation &Earlier, - int64_t EarlierOff, int64_t LaterOff, - Instruction *DepWrite, +/// NOTE: This function must only be called if both \p KillingLoc and \p +/// DeadLoc belong to the same underlying object with valid \p KillingOff and +/// \p DeadOff. 
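The renamed isPartialOverwrite below keeps, for each dead store, a map of killing intervals keyed by end offset with the start offset as the value, merging overlapping or adjacent intervals as new killing stores are recorded. That bookkeeping can be sketched independently of LLVM with a std::map standing in for OverlapIntervalsTy (the function name is invented):

#include <algorithm>
#include <cstdint>
#include <map>

// One entry per maximal interval, stored as End -> Start (half-open [Start, End)).
using Intervals = std::map<int64_t, int64_t>;

// Record that [Start, End) of the dead store has been overwritten; return true
// once the recorded intervals cover all of [DeadStart, DeadEnd).
bool addKillingInterval(Intervals &IM, int64_t Start, int64_t End,
                        int64_t DeadStart, int64_t DeadEnd) {
  // Fold every existing interval that touches [Start, End) into it; the first
  // candidate is the first interval whose end is not before Start.
  auto It = IM.lower_bound(Start);
  while (It != IM.end() && It->second <= End) {
    Start = std::min(Start, It->second);
    End = std::max(End, It->first);
    It = IM.erase(It);
  }
  IM[End] = Start;

  // Full coverage iff the lowest remaining interval spans the whole dead store.
  auto First = IM.begin();
  return First->second <= DeadStart && First->first >= DeadEnd;
}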
+static OverwriteResult isPartialOverwrite(const MemoryLocation &KillingLoc, + const MemoryLocation &DeadLoc, + int64_t KillingOff, int64_t DeadOff, + Instruction *DeadI, InstOverlapIntervalsTy &IOL) { - const uint64_t LaterSize = Later.Size.getValue(); - const uint64_t EarlierSize = Earlier.Size.getValue(); + const uint64_t KillingSize = KillingLoc.Size.getValue(); + const uint64_t DeadSize = DeadLoc.Size.getValue(); // We may now overlap, although the overlap is not complete. There might also // be other incomplete overlaps, and together, they might cover the complete - // earlier write. + // dead store. // Note: The correctness of this logic depends on the fact that this function // is not even called providing DepWrite when there are any intervening reads. if (EnablePartialOverwriteTracking && - LaterOff < int64_t(EarlierOff + EarlierSize) && - int64_t(LaterOff + LaterSize) >= EarlierOff) { + KillingOff < int64_t(DeadOff + DeadSize) && + int64_t(KillingOff + KillingSize) >= DeadOff) { // Insert our part of the overlap into the map. - auto &IM = IOL[DepWrite]; - LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff - << ", " << int64_t(EarlierOff + EarlierSize) - << ") Later [" << LaterOff << ", " - << int64_t(LaterOff + LaterSize) << ")\n"); + auto &IM = IOL[DeadI]; + LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: DeadLoc [" << DeadOff << ", " + << int64_t(DeadOff + DeadSize) << ") KillingLoc [" + << KillingOff << ", " << int64_t(KillingOff + KillingSize) + << ")\n"); // Make sure that we only insert non-overlapping intervals and combine // adjacent intervals. The intervals are stored in the map with the ending // offset as the key (in the half-open sense) and the starting offset as // the value. - int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize; + int64_t KillingIntStart = KillingOff; + int64_t KillingIntEnd = KillingOff + KillingSize; - // Find any intervals ending at, or after, LaterIntStart which start - // before LaterIntEnd. - auto ILI = IM.lower_bound(LaterIntStart); - if (ILI != IM.end() && ILI->second <= LaterIntEnd) { + // Find any intervals ending at, or after, KillingIntStart which start + // before KillingIntEnd. + auto ILI = IM.lower_bound(KillingIntStart); + if (ILI != IM.end() && ILI->second <= KillingIntEnd) { // This existing interval is overlapped with the current store somewhere - // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing + // in [KillingIntStart, KillingIntEnd]. Merge them by erasing the existing // intervals and adjusting our start and end. - LaterIntStart = std::min(LaterIntStart, ILI->second); - LaterIntEnd = std::max(LaterIntEnd, ILI->first); + KillingIntStart = std::min(KillingIntStart, ILI->second); + KillingIntEnd = std::max(KillingIntEnd, ILI->first); ILI = IM.erase(ILI); // Continue erasing and adjusting our end in case other previous // intervals are also overlapped with the current store. 
// - // |--- ealier 1 ---| |--- ealier 2 ---| - // |------- later---------| + // |--- dead 1 ---| |--- dead 2 ---| + // |------- killing---------| // - while (ILI != IM.end() && ILI->second <= LaterIntEnd) { - assert(ILI->second > LaterIntStart && "Unexpected interval"); - LaterIntEnd = std::max(LaterIntEnd, ILI->first); + while (ILI != IM.end() && ILI->second <= KillingIntEnd) { + assert(ILI->second > KillingIntStart && "Unexpected interval"); + KillingIntEnd = std::max(KillingIntEnd, ILI->first); ILI = IM.erase(ILI); } } - IM[LaterIntEnd] = LaterIntStart; + IM[KillingIntEnd] = KillingIntStart; ILI = IM.begin(); - if (ILI->second <= EarlierOff && - ILI->first >= int64_t(EarlierOff + EarlierSize)) { - LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" - << EarlierOff << ", " - << int64_t(EarlierOff + EarlierSize) - << ") Composite Later [" << ILI->second << ", " + if (ILI->second <= DeadOff && ILI->first >= int64_t(DeadOff + DeadSize)) { + LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: DeadLoc [" + << DeadOff << ", " << int64_t(DeadOff + DeadSize) + << ") Composite KillingLoc [" << ILI->second << ", " << ILI->first << ")\n"); ++NumCompletePartials; return OW_Complete; } } - // Check for an earlier store which writes to all the memory locations that - // the later store writes to. - if (EnablePartialStoreMerging && LaterOff >= EarlierOff && - int64_t(EarlierOff + EarlierSize) > LaterOff && - uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) { - LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" - << EarlierOff << ", " - << int64_t(EarlierOff + EarlierSize) - << ") by a later store [" << LaterOff << ", " - << int64_t(LaterOff + LaterSize) << ")\n"); + // Check for a dead store which writes to all the memory locations that + // the killing store writes to. + if (EnablePartialStoreMerging && KillingOff >= DeadOff && + int64_t(DeadOff + DeadSize) > KillingOff && + uint64_t(KillingOff - DeadOff) + KillingSize <= DeadSize) { + LLVM_DEBUG(dbgs() << "DSE: Partial overwrite a dead load [" << DeadOff + << ", " << int64_t(DeadOff + DeadSize) + << ") by a killing store [" << KillingOff << ", " + << int64_t(KillingOff + KillingSize) << ")\n"); // TODO: Maybe come up with a better name? return OW_PartialEarlierWithFullLater; } - // Another interesting case is if the later store overwrites the end of the - // earlier store. + // Another interesting case is if the killing store overwrites the end of the + // dead store. // - // |--earlier--| - // |-- later --| + // |--dead--| + // |-- killing --| // - // In this case we may want to trim the size of earlier to avoid generating - // writes to addresses which will definitely be overwritten later + // In this case we may want to trim the size of dead store to avoid + // generating stores to addresses which will definitely be overwritten killing + // store. if (!EnablePartialOverwriteTracking && - (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) && - int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize))) + (KillingOff > DeadOff && KillingOff < int64_t(DeadOff + DeadSize) && + int64_t(KillingOff + KillingSize) >= int64_t(DeadOff + DeadSize))) return OW_End; - // Finally, we also need to check if the later store overwrites the beginning - // of the earlier store. + // Finally, we also need to check if the killing store overwrites the + // beginning of the dead store. 
// - // |--earlier--| - // |-- later --| + // |--dead--| + // |-- killing --| // // In this case we may want to move the destination address and trim the size - // of earlier to avoid generating writes to addresses which will definitely - // be overwritten later. + // of dead store to avoid generating stores to addresses which will definitely + // be overwritten killing store. if (!EnablePartialOverwriteTracking && - (LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) { - assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) && + (KillingOff <= DeadOff && int64_t(KillingOff + KillingSize) > DeadOff)) { + assert(int64_t(KillingOff + KillingSize) < int64_t(DeadOff + DeadSize) && "Expect to be handled as OW_Complete"); return OW_Begin; } @@ -505,7 +473,12 @@ memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI, BasicBlock::iterator SecondBBI(SecondI); BasicBlock *FirstBB = FirstI->getParent(); BasicBlock *SecondBB = SecondI->getParent(); - MemoryLocation MemLoc = MemoryLocation::get(SecondI); + MemoryLocation MemLoc; + if (auto *MemSet = dyn_cast<MemSetInst>(SecondI)) + MemLoc = MemoryLocation::getForDest(MemSet); + else + MemLoc = MemoryLocation::get(SecondI); + auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr); // Start checking the SecondBB. @@ -568,11 +541,11 @@ memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI, return true; } -static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart, - uint64_t &EarlierSize, int64_t LaterStart, - uint64_t LaterSize, bool IsOverwriteEnd) { - auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite); - Align PrefAlign = EarlierIntrinsic->getDestAlign().valueOrOne(); +static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart, + uint64_t &DeadSize, int64_t KillingStart, + uint64_t KillingSize, bool IsOverwriteEnd) { + auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI); + Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne(); // We assume that memet/memcpy operates in chunks of the "largest" native // type size and aligned on the same value. That means optimal start and size @@ -593,19 +566,19 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart, // Compute start and size of the region to remove. Make sure 'PrefAlign' is // maintained on the remaining store. if (IsOverwriteEnd) { - // Calculate required adjustment for 'LaterStart'in order to keep remaining - // store size aligned on 'PerfAlign'. + // Calculate required adjustment for 'KillingStart' in order to keep + // remaining store size aligned on 'PerfAlign'. 
uint64_t Off = - offsetToAlignment(uint64_t(LaterStart - EarlierStart), PrefAlign); - ToRemoveStart = LaterStart + Off; - if (EarlierSize <= uint64_t(ToRemoveStart - EarlierStart)) + offsetToAlignment(uint64_t(KillingStart - DeadStart), PrefAlign); + ToRemoveStart = KillingStart + Off; + if (DeadSize <= uint64_t(ToRemoveStart - DeadStart)) return false; - ToRemoveSize = EarlierSize - uint64_t(ToRemoveStart - EarlierStart); + ToRemoveSize = DeadSize - uint64_t(ToRemoveStart - DeadStart); } else { - ToRemoveStart = EarlierStart; - assert(LaterSize >= uint64_t(EarlierStart - LaterStart) && + ToRemoveStart = DeadStart; + assert(KillingSize >= uint64_t(DeadStart - KillingStart) && "Not overlapping accesses?"); - ToRemoveSize = LaterSize - uint64_t(EarlierStart - LaterStart); + ToRemoveSize = KillingSize - uint64_t(DeadStart - KillingStart); // Calculate required adjustment for 'ToRemoveSize'in order to keep // start of the remaining store aligned on 'PerfAlign'. uint64_t Off = offsetToAlignment(ToRemoveSize, PrefAlign); @@ -619,10 +592,10 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart, } assert(ToRemoveSize > 0 && "Shouldn't reach here if nothing to remove"); - assert(EarlierSize > ToRemoveSize && "Can't remove more than original size"); + assert(DeadSize > ToRemoveSize && "Can't remove more than original size"); - uint64_t NewSize = EarlierSize - ToRemoveSize; - if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) { + uint64_t NewSize = DeadSize - ToRemoveSize; + if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) { // When shortening an atomic memory intrinsic, the newly shortened // length must remain an integer multiple of the element size. const uint32_t ElementSize = AMI->getElementSizeInBytes(); @@ -631,65 +604,62 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart, } LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW " - << (IsOverwriteEnd ? "END" : "BEGIN") << ": " - << *EarlierWrite << "\n KILLER [" << ToRemoveStart << ", " + << (IsOverwriteEnd ? 
"END" : "BEGIN") << ": " << *DeadI + << "\n KILLER [" << ToRemoveStart << ", " << int64_t(ToRemoveStart + ToRemoveSize) << ")\n"); - Value *EarlierWriteLength = EarlierIntrinsic->getLength(); - Value *TrimmedLength = - ConstantInt::get(EarlierWriteLength->getType(), NewSize); - EarlierIntrinsic->setLength(TrimmedLength); - EarlierIntrinsic->setDestAlignment(PrefAlign); + Value *DeadWriteLength = DeadIntrinsic->getLength(); + Value *TrimmedLength = ConstantInt::get(DeadWriteLength->getType(), NewSize); + DeadIntrinsic->setLength(TrimmedLength); + DeadIntrinsic->setDestAlignment(PrefAlign); if (!IsOverwriteEnd) { - Value *OrigDest = EarlierIntrinsic->getRawDest(); + Value *OrigDest = DeadIntrinsic->getRawDest(); Type *Int8PtrTy = - Type::getInt8PtrTy(EarlierIntrinsic->getContext(), + Type::getInt8PtrTy(DeadIntrinsic->getContext(), OrigDest->getType()->getPointerAddressSpace()); Value *Dest = OrigDest; if (OrigDest->getType() != Int8PtrTy) - Dest = CastInst::CreatePointerCast(OrigDest, Int8PtrTy, "", EarlierWrite); + Dest = CastInst::CreatePointerCast(OrigDest, Int8PtrTy, "", DeadI); Value *Indices[1] = { - ConstantInt::get(EarlierWriteLength->getType(), ToRemoveSize)}; + ConstantInt::get(DeadWriteLength->getType(), ToRemoveSize)}; Instruction *NewDestGEP = GetElementPtrInst::CreateInBounds( - Type::getInt8Ty(EarlierIntrinsic->getContext()), - Dest, Indices, "", EarlierWrite); - NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc()); + Type::getInt8Ty(DeadIntrinsic->getContext()), Dest, Indices, "", DeadI); + NewDestGEP->setDebugLoc(DeadIntrinsic->getDebugLoc()); if (NewDestGEP->getType() != OrigDest->getType()) NewDestGEP = CastInst::CreatePointerCast(NewDestGEP, OrigDest->getType(), - "", EarlierWrite); - EarlierIntrinsic->setDest(NewDestGEP); + "", DeadI); + DeadIntrinsic->setDest(NewDestGEP); } - // Finally update start and size of earlier access. + // Finally update start and size of dead access. if (!IsOverwriteEnd) - EarlierStart += ToRemoveSize; - EarlierSize = NewSize; + DeadStart += ToRemoveSize; + DeadSize = NewSize; return true; } -static bool tryToShortenEnd(Instruction *EarlierWrite, - OverlapIntervalsTy &IntervalMap, - int64_t &EarlierStart, uint64_t &EarlierSize) { - if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite)) +static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap, + int64_t &DeadStart, uint64_t &DeadSize) { + if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI)) return false; OverlapIntervalsTy::iterator OII = --IntervalMap.end(); - int64_t LaterStart = OII->second; - uint64_t LaterSize = OII->first - LaterStart; + int64_t KillingStart = OII->second; + uint64_t KillingSize = OII->first - KillingStart; - assert(OII->first - LaterStart >= 0 && "Size expected to be positive"); + assert(OII->first - KillingStart >= 0 && "Size expected to be positive"); - if (LaterStart > EarlierStart && - // Note: "LaterStart - EarlierStart" is known to be positive due to + if (KillingStart > DeadStart && + // Note: "KillingStart - KillingStart" is known to be positive due to // preceding check. - (uint64_t)(LaterStart - EarlierStart) < EarlierSize && - // Note: "EarlierSize - (uint64_t)(LaterStart - EarlierStart)" is known to + (uint64_t)(KillingStart - DeadStart) < DeadSize && + // Note: "DeadSize - (uint64_t)(KillingStart - DeadStart)" is known to // be non negative due to preceding checks. 
- LaterSize >= EarlierSize - (uint64_t)(LaterStart - EarlierStart)) { - if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart, - LaterSize, true)) { + KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) { + if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize, + true)) { IntervalMap.erase(OII); return true; } @@ -697,28 +667,28 @@ static bool tryToShortenEnd(Instruction *EarlierWrite, return false; } -static bool tryToShortenBegin(Instruction *EarlierWrite, +static bool tryToShortenBegin(Instruction *DeadI, OverlapIntervalsTy &IntervalMap, - int64_t &EarlierStart, uint64_t &EarlierSize) { - if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite)) + int64_t &DeadStart, uint64_t &DeadSize) { + if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI)) return false; OverlapIntervalsTy::iterator OII = IntervalMap.begin(); - int64_t LaterStart = OII->second; - uint64_t LaterSize = OII->first - LaterStart; + int64_t KillingStart = OII->second; + uint64_t KillingSize = OII->first - KillingStart; - assert(OII->first - LaterStart >= 0 && "Size expected to be positive"); + assert(OII->first - KillingStart >= 0 && "Size expected to be positive"); - if (LaterStart <= EarlierStart && - // Note: "EarlierStart - LaterStart" is known to be non negative due to + if (KillingStart <= DeadStart && + // Note: "DeadStart - KillingStart" is known to be non negative due to // preceding check. - LaterSize > (uint64_t)(EarlierStart - LaterStart)) { - // Note: "LaterSize - (uint64_t)(EarlierStart - LaterStart)" is known to be - // positive due to preceding checks. - assert(LaterSize - (uint64_t)(EarlierStart - LaterStart) < EarlierSize && + KillingSize > (uint64_t)(DeadStart - KillingStart)) { + // Note: "KillingSize - (uint64_t)(DeadStart - DeadStart)" is known to + // be positive due to preceding checks. 
+ assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize && "Should have been handled as OW_Complete"); - if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart, - LaterSize, false)) { + if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize, + false)) { IntervalMap.erase(OII); return true; } @@ -726,71 +696,48 @@ static bool tryToShortenBegin(Instruction *EarlierWrite, return false; } -static bool removePartiallyOverlappedStores(const DataLayout &DL, - InstOverlapIntervalsTy &IOL, - const TargetLibraryInfo &TLI) { - bool Changed = false; - for (auto OI : IOL) { - Instruction *EarlierWrite = OI.first; - MemoryLocation Loc = getLocForWrite(EarlierWrite, TLI); - assert(isRemovable(EarlierWrite) && "Expect only removable instruction"); - - const Value *Ptr = Loc.Ptr->stripPointerCasts(); - int64_t EarlierStart = 0; - uint64_t EarlierSize = Loc.Size.getValue(); - GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL); - OverlapIntervalsTy &IntervalMap = OI.second; - Changed |= - tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize); - if (IntervalMap.empty()) - continue; - Changed |= - tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize); - } - return Changed; -} - -static Constant *tryToMergePartialOverlappingStores( - StoreInst *Earlier, StoreInst *Later, int64_t InstWriteOffset, - int64_t DepWriteOffset, const DataLayout &DL, BatchAAResults &AA, - DominatorTree *DT) { - - if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) && - DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) && - Later && isa<ConstantInt>(Later->getValueOperand()) && - DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) && - memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) { +static Constant * +tryToMergePartialOverlappingStores(StoreInst *KillingI, StoreInst *DeadI, + int64_t KillingOffset, int64_t DeadOffset, + const DataLayout &DL, BatchAAResults &AA, + DominatorTree *DT) { + + if (DeadI && isa<ConstantInt>(DeadI->getValueOperand()) && + DL.typeSizeEqualsStoreSize(DeadI->getValueOperand()->getType()) && + KillingI && isa<ConstantInt>(KillingI->getValueOperand()) && + DL.typeSizeEqualsStoreSize(KillingI->getValueOperand()->getType()) && + memoryIsNotModifiedBetween(DeadI, KillingI, AA, DL, DT)) { // If the store we find is: // a) partially overwritten by the store to 'Loc' - // b) the later store is fully contained in the earlier one and + // b) the killing store is fully contained in the dead one and // c) they both have a constant value // d) none of the two stores need padding - // Merge the two stores, replacing the earlier store's value with a + // Merge the two stores, replacing the dead store's value with a // merge of both values. 
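tryToMergePartialOverlappingStores above rewrites the dead store with a constant obtained by clearing the killed bits and OR-ing in the shifted killing value. The same arithmetic with 64-bit integers in place of APInt (names and the example values are illustrative only):

#include <cstdint>

// DeadVal is a DeadBits-wide constant stored at byte offset DeadOff; KillVal
// is a KillBits-wide constant stored at byte offset KillOff inside it.
// Returns the merged constant the dead store can be rewritten to use.
uint64_t mergeConstantStores(uint64_t DeadVal, unsigned DeadBits,
                             uint64_t KillVal, unsigned KillBits,
                             int64_t KillOff, int64_t DeadOff,
                             bool BigEndian) {
  unsigned BitOffset = unsigned(KillOff - DeadOff) * 8;
  unsigned Shift = BigEndian ? DeadBits - BitOffset - KillBits : BitOffset;
  uint64_t KillMask = KillBits == 64 ? ~0ULL : (1ULL << KillBits) - 1;
  uint64_t Mask = KillMask << Shift;
  // Clear the bits being replaced, then OR in the smaller store's value.
  return (DeadVal & ~Mask) | ((KillVal & KillMask) << Shift);
}

// Example (little-endian): a 32-bit store of 0x11223344 partially overwritten
// by a 16-bit store of 0xBEEF at byte offset 2 merges to 0xBEEF3344.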
// TODO: Deal with other constant types (vectors, etc), and probably // some mem intrinsics (if needed) - APInt EarlierValue = - cast<ConstantInt>(Earlier->getValueOperand())->getValue(); - APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue(); - unsigned LaterBits = LaterValue.getBitWidth(); - assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth()); - LaterValue = LaterValue.zext(EarlierValue.getBitWidth()); + APInt DeadValue = cast<ConstantInt>(DeadI->getValueOperand())->getValue(); + APInt KillingValue = + cast<ConstantInt>(KillingI->getValueOperand())->getValue(); + unsigned KillingBits = KillingValue.getBitWidth(); + assert(DeadValue.getBitWidth() > KillingValue.getBitWidth()); + KillingValue = KillingValue.zext(DeadValue.getBitWidth()); // Offset of the smaller store inside the larger store - unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8; - unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() - - BitOffsetDiff - LaterBits - : BitOffsetDiff; - APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount, - LShiftAmount + LaterBits); + unsigned BitOffsetDiff = (KillingOffset - DeadOffset) * 8; + unsigned LShiftAmount = + DL.isBigEndian() ? DeadValue.getBitWidth() - BitOffsetDiff - KillingBits + : BitOffsetDiff; + APInt Mask = APInt::getBitsSet(DeadValue.getBitWidth(), LShiftAmount, + LShiftAmount + KillingBits); // Clear the bits we'll be replacing, then OR with the smaller // store, shifted appropriately. - APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount); - LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier - << "\n Later: " << *Later + APInt Merged = (DeadValue & ~Mask) | (KillingValue << LShiftAmount); + LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Dead: " << *DeadI + << "\n Killing: " << *KillingI << "\n Merged Value: " << Merged << '\n'); - return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged); + return ConstantInt::get(DeadI->getValueOperand()->getType(), Merged); } return nullptr; } @@ -819,14 +766,17 @@ bool isNoopIntrinsic(Instruction *I) { } // Check if we can ignore \p D for DSE. -bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { +bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller, + const TargetLibraryInfo &TLI) { Instruction *DI = D->getMemoryInst(); // Calls that only access inaccessible memory cannot read or write any memory // locations we consider for elimination. if (auto *CB = dyn_cast<CallBase>(DI)) - if (CB->onlyAccessesInaccessibleMemory()) + if (CB->onlyAccessesInaccessibleMemory()) { + if (isAllocLikeFn(DI, &TLI)) + return false; return true; - + } // We can eliminate stores to locations not visible to the caller across // throwing instructions. if (DI->mayThrow() && !DefVisibleToCaller) @@ -841,7 +791,7 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { return true; // Skip intrinsics that do not really read or modify memory. - if (isNoopIntrinsic(D->getMemoryInst())) + if (isNoopIntrinsic(DI)) return true; return false; @@ -850,6 +800,7 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { struct DSEState { Function &F; AliasAnalysis &AA; + EarliestEscapeInfo EI; /// The single BatchAA instance that is used to cache AA queries. It will /// not be invalidated over the whole run. This is safe, because: @@ -892,30 +843,29 @@ struct DSEState { /// basic block. DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs; + // Class contains self-reference, make sure it's not copied/moved. 
+ DSEState(const DSEState &) = delete; + DSEState &operator=(const DSEState &) = delete; + DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, const TargetLibraryInfo &TLI, const LoopInfo &LI) - : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI), - DL(F.getParent()->getDataLayout()), LI(LI) {} - - static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, - DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI, const LoopInfo &LI) { - DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); + : F(F), AA(AA), EI(DT, LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT), + PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) { // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; for (BasicBlock *BB : post_order(&F)) { - State.PostOrderNumbers[BB] = PO++; + PostOrderNumbers[BB] = PO++; for (Instruction &I : *BB) { MemoryAccess *MA = MSSA.getMemoryAccess(&I); if (I.mayThrow() && !MA) - State.ThrowingBlocks.insert(I.getParent()); + ThrowingBlocks.insert(I.getParent()); auto *MD = dyn_cast_or_null<MemoryDef>(MA); - if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && - (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I))) - State.MemDefs.push_back(MD); + if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit && + (getLocForWriteEx(&I) || isMemTerminatorInst(&I))) + MemDefs.push_back(MD); } } @@ -925,131 +875,134 @@ struct DSEState { if (AI.hasPassPointeeByValueCopyAttr()) { // For byval, the caller doesn't know the address of the allocation. if (AI.hasByValAttr()) - State.InvisibleToCallerBeforeRet.insert({&AI, true}); - State.InvisibleToCallerAfterRet.insert({&AI, true}); + InvisibleToCallerBeforeRet.insert({&AI, true}); + InvisibleToCallerAfterRet.insert({&AI, true}); } // Collect whether there is any irreducible control flow in the function. - State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); - - return State; + ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); } - /// Return 'OW_Complete' if a store to the 'Later' location (by \p LaterI - /// instruction) completely overwrites a store to the 'Earlier' location. - /// (by \p EarlierI instruction). - /// Return OW_MaybePartial if \p Later does not completely overwrite - /// \p Earlier, but they both write to the same underlying object. In that - /// case, use isPartialOverwrite to check if \p Later partially overwrites - /// \p Earlier. Returns 'OW_Unknown' if nothing can be determined. - OverwriteResult - isOverwrite(const Instruction *LaterI, const Instruction *EarlierI, - const MemoryLocation &Later, const MemoryLocation &Earlier, - int64_t &EarlierOff, int64_t &LaterOff) { + /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p + /// KillingI instruction) completely overwrites a store to the 'DeadLoc' + /// location (by \p DeadI instruction). + /// Return OW_MaybePartial if \p KillingI does not completely overwrite + /// \p DeadI, but they both write to the same underlying object. In that + /// case, use isPartialOverwrite to check if \p KillingI partially overwrites + /// \p DeadI. Returns 'OW_Unknown' if nothing can be determined. + OverwriteResult isOverwrite(const Instruction *KillingI, + const Instruction *DeadI, + const MemoryLocation &KillingLoc, + const MemoryLocation &DeadLoc, + int64_t &KillingOff, int64_t &DeadOff) { // AliasAnalysis does not always account for loops. 
Limit overwrite checks - // to dependencies for which we can guarantee they are independant of any + // to dependencies for which we can guarantee they are independent of any // loops they are in. - if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier)) + if (!isGuaranteedLoopIndependent(DeadI, KillingI, DeadLoc)) return OW_Unknown; // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll // get imprecise values here, though (except for unknown sizes). - if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) { + if (!KillingLoc.Size.isPrecise() || !DeadLoc.Size.isPrecise()) { // In case no constant size is known, try to an IR values for the number // of bytes written and check if they match. - const auto *LaterMemI = dyn_cast<MemIntrinsic>(LaterI); - const auto *EarlierMemI = dyn_cast<MemIntrinsic>(EarlierI); - if (LaterMemI && EarlierMemI) { - const Value *LaterV = LaterMemI->getLength(); - const Value *EarlierV = EarlierMemI->getLength(); - if (LaterV == EarlierV && BatchAA.isMustAlias(Earlier, Later)) + const auto *KillingMemI = dyn_cast<MemIntrinsic>(KillingI); + const auto *DeadMemI = dyn_cast<MemIntrinsic>(DeadI); + if (KillingMemI && DeadMemI) { + const Value *KillingV = KillingMemI->getLength(); + const Value *DeadV = DeadMemI->getLength(); + if (KillingV == DeadV && BatchAA.isMustAlias(DeadLoc, KillingLoc)) return OW_Complete; } // Masked stores have imprecise locations, but we can reason about them // to some extent. - return isMaskedStoreOverwrite(LaterI, EarlierI, BatchAA); + return isMaskedStoreOverwrite(KillingI, DeadI, BatchAA); } - const uint64_t LaterSize = Later.Size.getValue(); - const uint64_t EarlierSize = Earlier.Size.getValue(); + const uint64_t KillingSize = KillingLoc.Size.getValue(); + const uint64_t DeadSize = DeadLoc.Size.getValue(); // Query the alias information - AliasResult AAR = BatchAA.alias(Later, Earlier); + AliasResult AAR = BatchAA.alias(KillingLoc, DeadLoc); // If the start pointers are the same, we just have to compare sizes to see if - // the later store was larger than the earlier store. + // the killing store was larger than the dead store. if (AAR == AliasResult::MustAlias) { - // Make sure that the Later size is >= the Earlier size. - if (LaterSize >= EarlierSize) + // Make sure that the KillingSize size is >= the DeadSize size. + if (KillingSize >= DeadSize) return OW_Complete; } // If we hit a partial alias we may have a full overwrite if (AAR == AliasResult::PartialAlias && AAR.hasOffset()) { int32_t Off = AAR.getOffset(); - if (Off >= 0 && (uint64_t)Off + EarlierSize <= LaterSize) + if (Off >= 0 && (uint64_t)Off + DeadSize <= KillingSize) return OW_Complete; } - // Check to see if the later store is to the entire object (either a global, - // an alloca, or a byval/inalloca argument). If so, then it clearly + // Check to see if the killing store is to the entire object (either a + // global, an alloca, or a byval/inalloca argument). If so, then it clearly // overwrites any other store to the same object. 
- const Value *P1 = Earlier.Ptr->stripPointerCasts(); - const Value *P2 = Later.Ptr->stripPointerCasts(); - const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2); + const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts(); + const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts(); + const Value *DeadUndObj = getUnderlyingObject(DeadPtr); + const Value *KillingUndObj = getUnderlyingObject(KillingPtr); // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. - if (UO1 != UO2) + if (DeadUndObj != KillingUndObj) return OW_Unknown; - // If the "Later" store is to a recognizable object, get its size. - uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, &F); - if (ObjectSize != MemoryLocation::UnknownSize) - if (ObjectSize == LaterSize && ObjectSize >= EarlierSize) + // If the KillingI store is to a recognizable object, get its size. + uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); + if (KillingUndObjSize != MemoryLocation::UnknownSize) + if (KillingUndObjSize == KillingSize && KillingUndObjSize >= DeadSize) return OW_Complete; // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. - EarlierOff = 0; - LaterOff = 0; - const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL); - const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL); - - // If the base pointers still differ, we have two completely different stores. - if (BP1 != BP2) + DeadOff = 0; + KillingOff = 0; + const Value *DeadBasePtr = + GetPointerBaseWithConstantOffset(DeadPtr, DeadOff, DL); + const Value *KillingBasePtr = + GetPointerBaseWithConstantOffset(KillingPtr, KillingOff, DL); + + // If the base pointers still differ, we have two completely different + // stores. + if (DeadBasePtr != KillingBasePtr) return OW_Unknown; - // The later access completely overlaps the earlier store if and only if - // both start and end of the earlier one is "inside" the later one: - // |<->|--earlier--|<->| - // |-------later-------| + // The killing access completely overlaps the dead store if and only if + // both start and end of the dead one is "inside" the killing one: + // |<->|--dead--|<->| + // |-----killing------| // Accesses may overlap if and only if start of one of them is "inside" // another one: - // |<->|--earlier--|<----->| - // |-------later-------| + // |<->|--dead--|<-------->| + // |-------killing--------| // OR - // |----- earlier -----| - // |<->|---later---|<----->| + // |-------dead-------| + // |<->|---killing---|<----->| // // We have to be careful here as *Off is signed while *.Size is unsigned. - // Check if the earlier access starts "not before" the later one. - if (EarlierOff >= LaterOff) { - // If the earlier access ends "not after" the later access then the earlier - // one is completely overwritten by the later one. - if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize) + // Check if the dead access starts "not before" the killing one. + if (DeadOff >= KillingOff) { + // If the dead access ends "not after" the killing access then the + // dead one is completely overwritten by the killing one. + if (uint64_t(DeadOff - KillingOff) + DeadSize <= KillingSize) return OW_Complete; - // If start of the earlier access is "before" end of the later access then - // accesses overlap. 
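Once isOverwrite above has reduced both locations to the same base pointer plus constant offsets, the remaining classification is pure interval arithmetic over offsets and sizes. A minimal version (OW_* names reused from the pass for readability):

#include <cstdint>

enum OverwriteResult { OW_Complete, OW_MaybePartial, OW_Unknown };

// Classify how a killing write [KillingOff, KillingOff + KillingSize) relates
// to a dead write [DeadOff, DeadOff + DeadSize) off the same base pointer.
OverwriteResult classifyOverwrite(int64_t KillingOff, uint64_t KillingSize,
                                  int64_t DeadOff, uint64_t DeadSize) {
  if (DeadOff >= KillingOff) {
    // The dead write starts inside the killing one; it is completely
    // overwritten if it also ends inside it, otherwise they merely overlap.
    if (uint64_t(DeadOff - KillingOff) + DeadSize <= KillingSize)
      return OW_Complete;
    if (uint64_t(DeadOff - KillingOff) < KillingSize)
      return OW_MaybePartial;
  } else if (uint64_t(KillingOff - DeadOff) < DeadSize) {
    // The killing write starts inside the dead one.
    return OW_MaybePartial;
  }
  return OW_Unknown;
}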
- else if ((uint64_t)(EarlierOff - LaterOff) < LaterSize) + // If start of the dead access is "before" end of the killing access + // then accesses overlap. + else if ((uint64_t)(DeadOff - KillingOff) < KillingSize) return OW_MaybePartial; } - // If start of the later access is "before" end of the earlier access then + // If start of the killing access is "before" end of the dead access then // accesses overlap. - else if ((uint64_t)(LaterOff - EarlierOff) < EarlierSize) { + else if ((uint64_t)(KillingOff - DeadOff) < DeadSize) { return OW_MaybePartial; } @@ -1106,8 +1059,13 @@ struct DSEState { LibFunc LF; if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) { switch (LF) { - case LibFunc_strcpy: case LibFunc_strncpy: + if (const auto *Len = dyn_cast<ConstantInt>(CB->getArgOperand(2))) + return MemoryLocation(CB->getArgOperand(0), + LocationSize::precise(Len->getZExtValue()), + CB->getAAMetadata()); + LLVM_FALLTHROUGH; + case LibFunc_strcpy: case LibFunc_strcat: case LibFunc_strncat: return {MemoryLocation::getAfter(CB->getArgOperand(0))}; @@ -1145,8 +1103,8 @@ struct DSEState { int64_t InstWriteOffset, DepWriteOffset; if (auto CC = getLocForWriteEx(UseInst)) - return isOverwrite(UseInst, DefInst, *CC, DefLoc, DepWriteOffset, - InstWriteOffset) == OW_Complete; + return isOverwrite(UseInst, DefInst, *CC, DefLoc, InstWriteOffset, + DepWriteOffset) == OW_Complete; return false; } @@ -1248,9 +1206,10 @@ struct DSEState { const Value *LocUO = getUnderlyingObject(Loc.Ptr); return BatchAA.isMustAlias(TermLoc.Ptr, LocUO); } - int64_t InstWriteOffset, DepWriteOffset; - return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, DepWriteOffset, - InstWriteOffset) == OW_Complete; + int64_t InstWriteOffset = 0; + int64_t DepWriteOffset = 0; + return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, InstWriteOffset, + DepWriteOffset) == OW_Complete; } // Returns true if \p Use may read from \p DefLoc. @@ -1270,10 +1229,6 @@ struct DSEState { if (CB->onlyAccessesInaccessibleMemory()) return false; - // NOTE: For calls, the number of stores removed could be slightly improved - // by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to - // be expensive compared to the benefits in practice. For now, avoid more - // expensive analysis to limit compile-time. return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc)); } @@ -1329,15 +1284,15 @@ struct DSEState { return IsGuaranteedLoopInvariantBase(Ptr); } - // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with - // no read access between them or on any other path to a function exit block - // if \p DefLoc is not accessible after the function returns. If there is no - // such MemoryDef, return None. The returned value may not (completely) - // overwrite \p DefLoc. Currently we bail out when we encounter an aliasing - // MemoryUse (read). + // Find a MemoryDef writing to \p KillingLoc and dominating \p StartAccess, + // with no read access between them or on any other path to a function exit + // block if \p KillingLoc is not accessible after the function returns. If + // there is no such MemoryDef, return None. The returned value may not + // (completely) overwrite \p KillingLoc. Currently we bail out when we + // encounter an aliasing MemoryUse (read). 
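The offset arithmetic in the preceding hunks is easy to misread in diff form. A minimal standalone sketch of the same classification for the constant-offset case (the enum and function names here are illustrative, not the ones used in DeadStoreElimination.cpp): the killing store completely overwrites the dead one iff the dead byte range is contained in the killing range, and the two may partially overlap iff the start of one range falls inside the other.

#include <cstdint>

enum class Overwrite { Complete, MaybePartial, Unknown };

// DeadOff/KillingOff: signed offsets from a common base pointer.
// DeadSize/KillingSize: store sizes in bytes.
static Overwrite classifyOverwrite(int64_t DeadOff, uint64_t DeadSize,
                                   int64_t KillingOff, uint64_t KillingSize) {
  if (DeadOff >= KillingOff) {
    // Dead store starts at or after the start of the killing store.
    if (uint64_t(DeadOff - KillingOff) + DeadSize <= KillingSize)
      return Overwrite::Complete;     // dead range contained in killing range
    if (uint64_t(DeadOff - KillingOff) < KillingSize)
      return Overwrite::MaybePartial; // dead starts before the killing end
    return Overwrite::Unknown;
  }
  // Killing store starts after the start of the dead store.
  if (uint64_t(KillingOff - DeadOff) < DeadSize)
    return Overwrite::MaybePartial;   // killing starts before the dead end
  return Overwrite::Unknown;
}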
Optional<MemoryAccess *> getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess, - const MemoryLocation &DefLoc, const Value *DefUO, + const MemoryLocation &KillingLoc, const Value *KillingUndObj, unsigned &ScanLimit, unsigned &WalkerStepLimit, bool IsMemTerm, unsigned &PartialLimit) { if (ScanLimit == 0 || WalkerStepLimit == 0) { @@ -1389,19 +1344,20 @@ struct DSEState { MemoryDef *CurrentDef = cast<MemoryDef>(Current); Instruction *CurrentI = CurrentDef->getMemoryInst(); - if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) + if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj), + TLI)) continue; // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing - if (mayThrowBetween(KillingI, CurrentI, DefUO)) { + if (mayThrowBetween(KillingI, CurrentI, KillingUndObj)) { LLVM_DEBUG(dbgs() << " ... skip, may throw!\n"); return None; } // Check for anything that looks like it will be a barrier to further // removal - if (isDSEBarrier(DefUO, CurrentI)) { + if (isDSEBarrier(KillingUndObj, CurrentI)) { LLVM_DEBUG(dbgs() << " ... skip, barrier\n"); return None; } @@ -1410,14 +1366,14 @@ struct DSEState { // clobber, bail out, as the path is not profitable. We skip this check // for intrinsic calls, because the code knows how to handle memcpy // intrinsics. - if (!isa<IntrinsicInst>(CurrentI) && isReadClobber(DefLoc, CurrentI)) + if (!isa<IntrinsicInst>(CurrentI) && isReadClobber(KillingLoc, CurrentI)) return None; // Quick check if there are direct uses that are read-clobbers. - if (any_of(Current->uses(), [this, &DefLoc, StartAccess](Use &U) { + if (any_of(Current->uses(), [this, &KillingLoc, StartAccess](Use &U) { if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(U.getUser())) return !MSSA.dominates(StartAccess, UseOrDef) && - isReadClobber(DefLoc, UseOrDef->getMemoryInst()); + isReadClobber(KillingLoc, UseOrDef->getMemoryInst()); return false; })) { LLVM_DEBUG(dbgs() << " ... found a read clobber\n"); @@ -1450,9 +1406,10 @@ struct DSEState { if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) continue; } else { - int64_t InstWriteOffset, DepWriteOffset; - auto OR = isOverwrite(KillingI, CurrentI, DefLoc, *CurrentLoc, - DepWriteOffset, InstWriteOffset); + int64_t KillingOffset = 0; + int64_t DeadOffset = 0; + auto OR = isOverwrite(KillingI, CurrentI, KillingLoc, *CurrentLoc, + KillingOffset, DeadOffset); // If Current does not write to the same object as KillingDef, check // the next candidate. if (OR == OW_Unknown) @@ -1473,30 +1430,25 @@ struct DSEState { }; // Accesses to objects accessible after the function returns can only be - // eliminated if the access is killed along all paths to the exit. Collect + // eliminated if the access is dead along all paths to the exit. Collect // the blocks with killing (=completely overwriting MemoryDefs) and check if - // they cover all paths from EarlierAccess to any function exit. + // they cover all paths from MaybeDeadAccess to any function exit. 
SmallPtrSet<Instruction *, 16> KillingDefs; KillingDefs.insert(KillingDef->getMemoryInst()); - MemoryAccess *EarlierAccess = Current; - Instruction *EarlierMemInst = - cast<MemoryDef>(EarlierAccess)->getMemoryInst(); - LLVM_DEBUG(dbgs() << " Checking for reads of " << *EarlierAccess << " (" - << *EarlierMemInst << ")\n"); + MemoryAccess *MaybeDeadAccess = Current; + MemoryLocation MaybeDeadLoc = *CurrentLoc; + Instruction *MaybeDeadI = cast<MemoryDef>(MaybeDeadAccess)->getMemoryInst(); + LLVM_DEBUG(dbgs() << " Checking for reads of " << *MaybeDeadAccess << " (" + << *MaybeDeadI << ")\n"); SmallSetVector<MemoryAccess *, 32> WorkList; auto PushMemUses = [&WorkList](MemoryAccess *Acc) { for (Use &U : Acc->uses()) WorkList.insert(cast<MemoryAccess>(U.getUser())); }; - PushMemUses(EarlierAccess); - - // Optimistically collect all accesses for reads. If we do not find any - // read clobbers, add them to the cache. - SmallPtrSet<MemoryAccess *, 16> KnownNoReads; - if (!EarlierMemInst->mayReadFromMemory()) - KnownNoReads.insert(EarlierAccess); - // Check if EarlierDef may be read. + PushMemUses(MaybeDeadAccess); + + // Check if DeadDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { MemoryAccess *UseAccess = WorkList[I]; @@ -1508,7 +1460,6 @@ struct DSEState { } --ScanLimit; NumDomMemDefChecks++; - KnownNoReads.insert(UseAccess); if (isa<MemoryPhi>(UseAccess)) { if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) { @@ -1535,7 +1486,7 @@ struct DSEState { // A memory terminator kills all preceeding MemoryDefs and all succeeding // MemoryAccesses. We do not have to check it's users. - if (isMemTerminator(*CurrentLoc, EarlierMemInst, UseInst)) { + if (isMemTerminator(MaybeDeadLoc, MaybeDeadI, UseInst)) { LLVM_DEBUG( dbgs() << " ... skipping, memterminator invalidates following accesses\n"); @@ -1548,14 +1499,14 @@ struct DSEState { continue; } - if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(DefUO)) { + if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) { LLVM_DEBUG(dbgs() << " ... found throwing instruction\n"); return None; } // Uses which may read the original MemoryDef mean we cannot eliminate the // original MD. Stop walk. - if (isReadClobber(*CurrentLoc, UseInst)) { + if (isReadClobber(MaybeDeadLoc, UseInst)) { LLVM_DEBUG(dbgs() << " ... found read clobber\n"); return None; } @@ -1563,16 +1514,16 @@ struct DSEState { // If this worklist walks back to the original memory access (and the // pointer is not guarenteed loop invariant) then we cannot assume that a // store kills itself. - if (EarlierAccess == UseAccess && - !isGuaranteedLoopInvariant(CurrentLoc->Ptr)) { + if (MaybeDeadAccess == UseAccess && + !isGuaranteedLoopInvariant(MaybeDeadLoc.Ptr)) { LLVM_DEBUG(dbgs() << " ... found not loop invariant self access\n"); return None; } - // Otherwise, for the KillingDef and EarlierAccess we only have to check + // Otherwise, for the KillingDef and MaybeDeadAccess we only have to check // if it reads the memory location. // TODO: It would probably be better to check for self-reads before // calling the function. - if (KillingDef == UseAccess || EarlierAccess == UseAccess) { + if (KillingDef == UseAccess || MaybeDeadAccess == UseAccess) { LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n"); continue; } @@ -1581,18 +1532,18 @@ struct DSEState { // the original location. Otherwise we have to check uses of *all* // MemoryDefs we discover, including non-aliasing ones. 
Otherwise we might // miss cases like the following - // 1 = Def(LoE) ; <----- EarlierDef stores [0,1] + // 1 = Def(LoE) ; <----- DeadDef stores [0,1] // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3] // Use(2) ; MayAlias 2 *and* 1, loads [0, 3]. // (The Use points to the *first* Def it may alias) // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias, // stores [0,1] if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) { - if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) { + if (isCompleteOverwrite(MaybeDeadLoc, MaybeDeadI, UseInst)) { BasicBlock *MaybeKillingBlock = UseInst->getParent(); if (PostOrderNumbers.find(MaybeKillingBlock)->second < - PostOrderNumbers.find(EarlierAccess->getBlock())->second) { - if (!isInvisibleToCallerAfterRet(DefUO)) { + PostOrderNumbers.find(MaybeDeadAccess->getBlock())->second) { + if (!isInvisibleToCallerAfterRet(KillingUndObj)) { LLVM_DEBUG(dbgs() << " ... found killing def " << *UseInst << "\n"); KillingDefs.insert(UseInst); @@ -1608,9 +1559,9 @@ struct DSEState { } // For accesses to locations visible after the function returns, make sure - // that the location is killed (=overwritten) along all paths from - // EarlierAccess to the exit. - if (!isInvisibleToCallerAfterRet(DefUO)) { + // that the location is dead (=overwritten) along all paths from + // MaybeDeadAccess to the exit. + if (!isInvisibleToCallerAfterRet(KillingUndObj)) { SmallPtrSet<BasicBlock *, 16> KillingBlocks; for (Instruction *KD : KillingDefs) KillingBlocks.insert(KD->getParent()); @@ -1619,25 +1570,24 @@ struct DSEState { // Find the common post-dominator of all killing blocks. BasicBlock *CommonPred = *KillingBlocks.begin(); - for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end(); - I != E; I++) { + for (BasicBlock *BB : llvm::drop_begin(KillingBlocks)) { if (!CommonPred) break; - CommonPred = PDT.findNearestCommonDominator(CommonPred, *I); + CommonPred = PDT.findNearestCommonDominator(CommonPred, BB); } // If CommonPred is in the set of killing blocks, just check if it - // post-dominates EarlierAccess. + // post-dominates MaybeDeadAccess. if (KillingBlocks.count(CommonPred)) { - if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) - return {EarlierAccess}; + if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) + return {MaybeDeadAccess}; return None; } - // If the common post-dominator does not post-dominate EarlierAccess, - // there is a path from EarlierAccess to an exit not going through a + // If the common post-dominator does not post-dominate MaybeDeadAccess, + // there is a path from MaybeDeadAccess to an exit not going through a // killing block. - if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) { + if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) { SetVector<BasicBlock *> WorkList; // If CommonPred is null, there are multiple exits from the function. @@ -1650,16 +1600,16 @@ struct DSEState { NumCFGTries++; // Check if all paths starting from an exit node go through one of the - // killing blocks before reaching EarlierAccess. + // killing blocks before reaching MaybeDeadAccess. for (unsigned I = 0; I < WorkList.size(); I++) { NumCFGChecks++; BasicBlock *Current = WorkList[I]; if (KillingBlocks.count(Current)) continue; - if (Current == EarlierAccess->getBlock()) + if (Current == MaybeDeadAccess->getBlock()) return None; - // EarlierAccess is reachable from the entry, so we don't have to + // MaybeDeadAccess is reachable from the entry, so we don't have to // explore unreachable blocks further. 
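For locations still visible after the function returns, the walk above only treats MaybeDeadAccess as dead if every path from its block to a function exit goes through a block with a killing (completely overwriting) def. A simplified model of that check over a toy CFG, walking backwards from the exits and stopping at killing blocks (block ids and containers are invented for illustration; the real code walks LLVM's CFG and uses the post-dominator tree as a fast path):

#include <cstddef>
#include <map>
#include <set>
#include <vector>

// Toy CFG: block id -> list of predecessor ids.
using CFG = std::map<int, std::vector<int>>;

// True iff walking backwards from the exits, stopping at killing blocks,
// never reaches the block containing the candidate dead store.
static bool deadAlongAllExitPaths(const CFG &Preds,
                                  const std::vector<int> &Exits,
                                  const std::set<int> &KillingBlocks,
                                  int DeadBlock) {
  std::vector<int> Worklist(Exits);
  std::set<int> Seen(Exits.begin(), Exits.end());
  for (std::size_t I = 0; I < Worklist.size(); ++I) {
    int BB = Worklist[I];
    if (KillingBlocks.count(BB))
      continue;               // this path is covered by a killing store
    if (BB == DeadBlock)
      return false;           // an exit is reachable without passing a killer
    auto It = Preds.find(BB);
    if (It == Preds.end())
      continue;
    for (int Pred : It->second)
      if (Seen.insert(Pred).second)
        Worklist.push_back(Pred);
  }
  return true;
}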
if (!DT.isReachableFromEntry(Current)) continue; @@ -1671,14 +1621,14 @@ struct DSEState { return None; } NumCFGSuccess++; - return {EarlierAccess}; + return {MaybeDeadAccess}; } return None; } - // No aliasing MemoryUses of EarlierAccess found, EarlierAccess is + // No aliasing MemoryUses of MaybeDeadAccess found, MaybeDeadAccess is // potentially dead. - return {EarlierAccess}; + return {MaybeDeadAccess}; } // Delete dead memory defs @@ -1701,6 +1651,7 @@ struct DSEState { if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) { SkipStores.insert(MD); } + Updater.removeMemoryAccess(MA); } @@ -1715,47 +1666,49 @@ struct DSEState { NowDeadInsts.push_back(OpI); } + EI.removeInstruction(DeadInst); DeadInst->eraseFromParent(); } } - // Check for any extra throws between SI and NI that block DSE. This only - // checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may - // throw are handled during the walk from one def to the next. - bool mayThrowBetween(Instruction *SI, Instruction *NI, - const Value *SILocUnd) { - // First see if we can ignore it by using the fact that SI is an + // Check for any extra throws between \p KillingI and \p DeadI that block + // DSE. This only checks extra maythrows (those that aren't MemoryDef's). + // MemoryDef that may throw are handled during the walk from one def to the + // next. + bool mayThrowBetween(Instruction *KillingI, Instruction *DeadI, + const Value *KillingUndObj) { + // First see if we can ignore it by using the fact that KillingI is an // alloca/alloca like object that is not visible to the caller during // execution of the function. - if (SILocUnd && isInvisibleToCallerBeforeRet(SILocUnd)) + if (KillingUndObj && isInvisibleToCallerBeforeRet(KillingUndObj)) return false; - if (SI->getParent() == NI->getParent()) - return ThrowingBlocks.count(SI->getParent()); + if (KillingI->getParent() == DeadI->getParent()) + return ThrowingBlocks.count(KillingI->getParent()); return !ThrowingBlocks.empty(); } - // Check if \p NI acts as a DSE barrier for \p SI. The following instructions - // act as barriers: - // * A memory instruction that may throw and \p SI accesses a non-stack + // Check if \p DeadI acts as a DSE barrier for \p KillingI. The following + // instructions act as barriers: + // * A memory instruction that may throw and \p KillingI accesses a non-stack // object. // * Atomic stores stronger that monotonic. - bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) { - // If NI may throw it acts as a barrier, unless we are to an alloca/alloca - // like object that does not escape. - if (NI->mayThrow() && !isInvisibleToCallerBeforeRet(SILocUnd)) + bool isDSEBarrier(const Value *KillingUndObj, Instruction *DeadI) { + // If DeadI may throw it acts as a barrier, unless we are to an + // alloca/alloca like object that does not escape. + if (DeadI->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) return true; - // If NI is an atomic load/store stronger than monotonic, do not try to + // If DeadI is an atomic load/store stronger than monotonic, do not try to // eliminate/reorder it. 
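A source-level illustration of the barrier rule described just above (the corresponding ordering checks continue directly below): an atomic access stronger than monotonic between the dead and the killing store blocks the elimination. The example is illustrative only and not taken from the patch; whether the first store is actually removed also depends on the other checks in the walk.

#include <atomic>

void barrierExample(int *p, std::atomic<int> &flag) {
  *p = 1;                                    // candidate dead store
  flag.store(1, std::memory_order_release);  // stronger than monotonic: DSE barrier
  *p = 2;                                    // killing store
}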
- if (NI->isAtomic()) { - if (auto *LI = dyn_cast<LoadInst>(NI)) + if (DeadI->isAtomic()) { + if (auto *LI = dyn_cast<LoadInst>(DeadI)) return isStrongerThanMonotonic(LI->getOrdering()); - if (auto *SI = dyn_cast<StoreInst>(NI)) + if (auto *SI = dyn_cast<StoreInst>(DeadI)) return isStrongerThanMonotonic(SI->getOrdering()); - if (auto *ARMW = dyn_cast<AtomicRMWInst>(NI)) + if (auto *ARMW = dyn_cast<AtomicRMWInst>(DeadI)) return isStrongerThanMonotonic(ARMW->getOrdering()); - if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(NI)) + if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(DeadI)) return isStrongerThanMonotonic(CmpXchg->getSuccessOrdering()) || isStrongerThanMonotonic(CmpXchg->getFailureOrdering()); llvm_unreachable("other instructions should be skipped in MemorySSA"); @@ -1776,7 +1729,6 @@ struct DSEState { continue; Instruction *DefI = Def->getMemoryInst(); - SmallVector<const Value *, 4> Pointers; auto DefLoc = getLocForWriteEx(DefI); if (!DefLoc) continue; @@ -1787,7 +1739,7 @@ struct DSEState { // uncommon. If it turns out to be important, we can use // getUnderlyingObjects here instead. const Value *UO = getUnderlyingObject(DefLoc->Ptr); - if (!UO || !isInvisibleToCallerAfterRet(UO)) + if (!isInvisibleToCallerAfterRet(UO)) continue; if (isWriteAtEndOfFunction(Def)) { @@ -1804,8 +1756,7 @@ struct DSEState { /// \returns true if \p Def is a no-op store, either because it /// directly stores back a loaded value or stores zero to a calloced object. - bool storeIsNoop(MemoryDef *Def, const MemoryLocation &DefLoc, - const Value *DefUO) { + bool storeIsNoop(MemoryDef *Def, const Value *DefUO) { StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst()); MemSetInst *MemSet = dyn_cast<MemSetInst>(Def->getMemoryInst()); Constant *StoredConstant = nullptr; @@ -1816,13 +1767,78 @@ struct DSEState { if (StoredConstant && StoredConstant->isNullValue()) { auto *DefUOInst = dyn_cast<Instruction>(DefUO); - if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) { - auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst)); - // If UnderlyingDef is the clobbering access of Def, no instructions - // between them can modify the memory location. - auto *ClobberDef = - MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def); - return UnderlyingDef == ClobberDef; + if (DefUOInst) { + if (isCallocLikeFn(DefUOInst, &TLI)) { + auto *UnderlyingDef = + cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst)); + // If UnderlyingDef is the clobbering access of Def, no instructions + // between them can modify the memory location. 
+ auto *ClobberDef = + MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def); + return UnderlyingDef == ClobberDef; + } + + if (MemSet) { + if (F.hasFnAttribute(Attribute::SanitizeMemory) || + F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeHWAddress) || + F.getName() == "calloc") + return false; + auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUOInst)); + if (!Malloc) + return false; + auto *InnerCallee = Malloc->getCalledFunction(); + if (!InnerCallee) + return false; + LibFunc Func; + if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || + Func != LibFunc_malloc) + return false; + + auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) { + // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end + // of malloc block + auto *MallocBB = Malloc->getParent(), + *MemsetBB = Memset->getParent(); + if (MallocBB == MemsetBB) + return true; + auto *Ptr = Memset->getArgOperand(0); + auto *TI = MallocBB->getTerminator(); + ICmpInst::Predicate Pred; + BasicBlock *TrueBB, *FalseBB; + if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB, + FalseBB))) + return false; + if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB) + return false; + return true; + }; + + if (Malloc->getOperand(0) == MemSet->getLength()) { + if (shouldCreateCalloc(Malloc, MemSet) && + DT.dominates(Malloc, MemSet) && + memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) { + IRBuilder<> IRB(Malloc); + const auto &DL = Malloc->getModule()->getDataLayout(); + if (auto *Calloc = + emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1), + Malloc->getArgOperand(0), IRB, TLI)) { + MemorySSAUpdater Updater(&MSSA); + auto *LastDef = cast<MemoryDef>( + Updater.getMemorySSA()->getMemoryAccess(Malloc)); + auto *NewAccess = Updater.createMemoryAccessAfter( + cast<Instruction>(Calloc), LastDef, LastDef); + auto *NewAccessMD = cast<MemoryDef>(NewAccess); + Updater.insertDef(NewAccessMD, /*RenameUses=*/true); + Updater.removeMemoryAccess(Malloc); + Malloc->replaceAllUsesWith(Calloc); + Malloc->eraseFromParent(); + return true; + } + return false; + } + } + } } } @@ -1875,6 +1891,76 @@ struct DSEState { return false; } + + bool removePartiallyOverlappedStores(InstOverlapIntervalsTy &IOL) { + bool Changed = false; + for (auto OI : IOL) { + Instruction *DeadI = OI.first; + MemoryLocation Loc = *getLocForWriteEx(DeadI); + assert(isRemovable(DeadI) && "Expect only removable instruction"); + + const Value *Ptr = Loc.Ptr->stripPointerCasts(); + int64_t DeadStart = 0; + uint64_t DeadSize = Loc.Size.getValue(); + GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL); + OverlapIntervalsTy &IntervalMap = OI.second; + Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize); + if (IntervalMap.empty()) + continue; + Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize); + } + return Changed; + } + + /// Eliminates writes to locations where the value that is being written + /// is already stored at the same location. 
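The new branch above folds a malloc whose result is immediately zeroed by this memset into a calloc, provided the memset length equals the allocation size, the malloc dominates the memset, memory is not modified in between, and no sanitizer is enabled. A rough source-level picture of the rewrite (illustrative only; the actual transformation runs on IR via emitCalloc and MemorySSA updates):

#include <cstdlib>
#include <cstring>

// Before: allocation followed by a full-size zeroing memset on the
// non-null branch, the shape shouldCreateCalloc looks for.
void *allocZeroedBefore(std::size_t n) {
  void *p = std::malloc(n);
  if (p)
    std::memset(p, 0, n);
  return p;
}

// After: the zero-initialization is folded into the allocation.
void *allocZeroedAfter(std::size_t n) {
  return std::calloc(1, n);
}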
+ bool eliminateRedundantStoresOfExistingValues() { + bool MadeChange = false; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs that write the " + "already existing value\n"); + for (auto *Def : MemDefs) { + if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) || + !isRemovable(Def->getMemoryInst())) + continue; + auto *UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess()); + if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef)) + continue; + + Instruction *DefInst = Def->getMemoryInst(); + Instruction *UpperInst = UpperDef->getMemoryInst(); + auto IsRedundantStore = [this, DefInst, + UpperInst](MemoryLocation UpperLoc) { + if (DefInst->isIdenticalTo(UpperInst)) + return true; + if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) { + if (auto *SI = dyn_cast<StoreInst>(DefInst)) { + auto MaybeDefLoc = getLocForWriteEx(DefInst); + if (!MaybeDefLoc) + return false; + int64_t InstWriteOffset = 0; + int64_t DepWriteOffset = 0; + auto OR = isOverwrite(UpperInst, DefInst, UpperLoc, *MaybeDefLoc, + InstWriteOffset, DepWriteOffset); + Value *StoredByte = isBytewiseValue(SI->getValueOperand(), DL); + return StoredByte && StoredByte == MemSetI->getOperand(1) && + OR == OW_Complete; + } + } + return false; + }; + + auto MaybeUpperLoc = getLocForWriteEx(UpperInst); + if (!MaybeUpperLoc || !IsRedundantStore(*MaybeUpperLoc) || + isReadClobber(*MaybeUpperLoc, DefInst)) + continue; + LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *DefInst + << '\n'); + deleteDeadInstruction(DefInst); + NumRedundantStores++; + MadeChange = true; + } + return MadeChange; + } }; static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, @@ -1883,68 +1969,64 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, const LoopInfo &LI) { bool MadeChange = false; - DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI); + DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; if (State.SkipStores.count(KillingDef)) continue; - Instruction *SI = KillingDef->getMemoryInst(); + Instruction *KillingI = KillingDef->getMemoryInst(); - Optional<MemoryLocation> MaybeSILoc; - if (State.isMemTerminatorInst(SI)) - MaybeSILoc = State.getLocForTerminator(SI).map( + Optional<MemoryLocation> MaybeKillingLoc; + if (State.isMemTerminatorInst(KillingI)) + MaybeKillingLoc = State.getLocForTerminator(KillingI).map( [](const std::pair<MemoryLocation, bool> &P) { return P.first; }); else - MaybeSILoc = State.getLocForWriteEx(SI); + MaybeKillingLoc = State.getLocForWriteEx(KillingI); - if (!MaybeSILoc) { + if (!MaybeKillingLoc) { LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " - << *SI << "\n"); + << *KillingI << "\n"); continue; } - MemoryLocation SILoc = *MaybeSILoc; - assert(SILoc.Ptr && "SILoc should not be null"); - const Value *SILocUnd = getUnderlyingObject(SILoc.Ptr); - - MemoryAccess *Current = KillingDef; + MemoryLocation KillingLoc = *MaybeKillingLoc; + assert(KillingLoc.Ptr && "KillingLoc should not be null"); + const Value *KillingUndObj = getUnderlyingObject(KillingLoc.Ptr); LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " - << *Current << " (" << *SI << ")\n"); + << *KillingDef << " (" << *KillingI << ")\n"); unsigned ScanLimit = MemorySSAScanLimit; unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit; unsigned PartialLimit = MemorySSAPartialStoreLimit; // Worklist of MemoryAccesses that may be killed by KillingDef. 
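eliminateRedundantStoresOfExistingValues, added above, removes a store whose defining MemoryDef already wrote the same value to the same location: either an identical instruction, or a store of a byte-splat value into memory fully covered by a memset of that byte. Two source-level shapes it targets (illustrative only; the matching is done on IR and MemorySSA, subject to the read-clobber check above):

#include <cstring>

void identicalStore(int *p) {
  *p = 42;
  *p = 42;       // identical to the defining store above: removable
}

void storeIntoMemset(char *buf) {
  std::memset(buf, 0, 64);
  buf[10] = 0;   // writes a byte the memset already wrote: removable
}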
SetVector<MemoryAccess *> ToCheck; - - if (SILocUnd) - ToCheck.insert(KillingDef->getDefiningAccess()); + ToCheck.insert(KillingDef->getDefiningAccess()); bool Shortend = false; - bool IsMemTerm = State.isMemTerminatorInst(SI); + bool IsMemTerm = State.isMemTerminatorInst(KillingI); // Check if MemoryAccesses in the worklist are killed by KillingDef. for (unsigned I = 0; I < ToCheck.size(); I++) { - Current = ToCheck[I]; + MemoryAccess *Current = ToCheck[I]; if (State.SkipStores.count(Current)) continue; - Optional<MemoryAccess *> Next = State.getDomMemoryDef( - KillingDef, Current, SILoc, SILocUnd, ScanLimit, WalkerStepLimit, - IsMemTerm, PartialLimit); + Optional<MemoryAccess *> MaybeDeadAccess = State.getDomMemoryDef( + KillingDef, Current, KillingLoc, KillingUndObj, ScanLimit, + WalkerStepLimit, IsMemTerm, PartialLimit); - if (!Next) { + if (!MaybeDeadAccess) { LLVM_DEBUG(dbgs() << " finished walk\n"); continue; } - MemoryAccess *EarlierAccess = *Next; - LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess); - if (isa<MemoryPhi>(EarlierAccess)) { + MemoryAccess *DeadAccess = *MaybeDeadAccess; + LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DeadAccess); + if (isa<MemoryPhi>(DeadAccess)) { LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n"); - for (Value *V : cast<MemoryPhi>(EarlierAccess)->incoming_values()) { + for (Value *V : cast<MemoryPhi>(DeadAccess)->incoming_values()) { MemoryAccess *IncomingAccess = cast<MemoryAccess>(V); BasicBlock *IncomingBlock = IncomingAccess->getBlock(); - BasicBlock *PhiBlock = EarlierAccess->getBlock(); + BasicBlock *PhiBlock = DeadAccess->getBlock(); // We only consider incoming MemoryAccesses that come before the // MemoryPhi. Otherwise we could discover candidates that do not @@ -1955,72 +2037,73 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, } continue; } - auto *NextDef = cast<MemoryDef>(EarlierAccess); - Instruction *NI = NextDef->getMemoryInst(); - LLVM_DEBUG(dbgs() << " (" << *NI << ")\n"); - ToCheck.insert(NextDef->getDefiningAccess()); + auto *DeadDefAccess = cast<MemoryDef>(DeadAccess); + Instruction *DeadI = DeadDefAccess->getMemoryInst(); + LLVM_DEBUG(dbgs() << " (" << *DeadI << ")\n"); + ToCheck.insert(DeadDefAccess->getDefiningAccess()); NumGetDomMemoryDefPassed++; if (!DebugCounter::shouldExecute(MemorySSACounter)) continue; - MemoryLocation NILoc = *State.getLocForWriteEx(NI); + MemoryLocation DeadLoc = *State.getLocForWriteEx(DeadI); if (IsMemTerm) { - const Value *NIUnd = getUnderlyingObject(NILoc.Ptr); - if (SILocUnd != NIUnd) + const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr); + if (KillingUndObj != DeadUndObj) continue; - LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI - << "\n KILLER: " << *SI << '\n'); - State.deleteDeadInstruction(NI); + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI + << "\n KILLER: " << *KillingI << '\n'); + State.deleteDeadInstruction(DeadI); ++NumFastStores; MadeChange = true; } else { - // Check if NI overwrites SI. - int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = State.isOverwrite(SI, NI, SILoc, NILoc, - DepWriteOffset, InstWriteOffset); + // Check if DeadI overwrites KillingI. 
+ int64_t KillingOffset = 0; + int64_t DeadOffset = 0; + OverwriteResult OR = State.isOverwrite( + KillingI, DeadI, KillingLoc, DeadLoc, KillingOffset, DeadOffset); if (OR == OW_MaybePartial) { auto Iter = State.IOLs.insert( std::make_pair<BasicBlock *, InstOverlapIntervalsTy>( - NI->getParent(), InstOverlapIntervalsTy())); + DeadI->getParent(), InstOverlapIntervalsTy())); auto &IOL = Iter.first->second; - OR = isPartialOverwrite(SILoc, NILoc, DepWriteOffset, InstWriteOffset, - NI, IOL); + OR = isPartialOverwrite(KillingLoc, DeadLoc, KillingOffset, + DeadOffset, DeadI, IOL); } if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) { - auto *Earlier = dyn_cast<StoreInst>(NI); - auto *Later = dyn_cast<StoreInst>(SI); + auto *DeadSI = dyn_cast<StoreInst>(DeadI); + auto *KillingSI = dyn_cast<StoreInst>(KillingI); // We are re-using tryToMergePartialOverlappingStores, which requires - // Earlier to domiante Later. + // DeadSI to dominate KillingSI. // TODO: implement tryToMergeParialOverlappingStores using MemorySSA. - if (Earlier && Later && DT.dominates(Earlier, Later)) { + if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) { if (Constant *Merged = tryToMergePartialOverlappingStores( - Earlier, Later, InstWriteOffset, DepWriteOffset, State.DL, + KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL, State.BatchAA, &DT)) { // Update stored value of earlier store to merged constant. - Earlier->setOperand(0, Merged); + DeadSI->setOperand(0, Merged); ++NumModifiedStores; MadeChange = true; Shortend = true; - // Remove later store and remove any outstanding overlap intervals - // for the updated store. - State.deleteDeadInstruction(Later); - auto I = State.IOLs.find(Earlier->getParent()); + // Remove killing store and remove any outstanding overlap + // intervals for the updated store. + State.deleteDeadInstruction(KillingSI); + auto I = State.IOLs.find(DeadSI->getParent()); if (I != State.IOLs.end()) - I->second.erase(Earlier); + I->second.erase(DeadSI); break; } } } if (OR == OW_Complete) { - LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI - << "\n KILLER: " << *SI << '\n'); - State.deleteDeadInstruction(NI); + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI + << "\n KILLER: " << *KillingI << '\n'); + State.deleteDeadInstruction(DeadI); ++NumFastStores; MadeChange = true; } @@ -2028,10 +2111,11 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, } // Check if the store is a no-op.
- if (!Shortend && isRemovable(SI) && - State.storeIsNoop(KillingDef, SILoc, SILocUnd)) { - LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *SI << '\n'); - State.deleteDeadInstruction(SI); + if (!Shortend && isRemovable(KillingI) && + State.storeIsNoop(KillingDef, KillingUndObj)) { + LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI + << '\n'); + State.deleteDeadInstruction(KillingI); NumRedundantStores++; MadeChange = true; continue; @@ -2040,8 +2124,9 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, if (EnablePartialOverwriteTracking) for (auto &KV : State.IOLs) - MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI); + MadeChange |= State.removePartiallyOverlappedStores(KV.second); + MadeChange |= State.eliminateRedundantStoresOfExistingValues(); MadeChange |= State.eliminateDeadWritesAtEndOfFunction(); return MadeChange; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 978c6a77b8dc..90f71f7729a7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -293,7 +293,7 @@ static unsigned getHashValueImpl(SimpleValue Val) { // TODO: Extend this to handle intrinsics with >2 operands where the 1st // 2 operands are commutative. auto *II = dyn_cast<IntrinsicInst>(Inst); - if (II && II->isCommutative() && II->getNumArgOperands() == 2) { + if (II && II->isCommutative() && II->arg_size() == 2) { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); if (LHS > RHS) std::swap(LHS, RHS); @@ -363,7 +363,7 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) { auto *LII = dyn_cast<IntrinsicInst>(LHSI); auto *RII = dyn_cast<IntrinsicInst>(RHSI); if (LII && RII && LII->getIntrinsicID() == RII->getIntrinsicID() && - LII->isCommutative() && LII->getNumArgOperands() == 2) { + LII->isCommutative() && LII->arg_size() == 2) { return LII->getArgOperand(0) == RII->getArgOperand(1) && LII->getArgOperand(1) == RII->getArgOperand(0); } @@ -1265,6 +1265,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // Skip pseudoprobe intrinsics, for the same reason as assume intrinsics. + if (match(&Inst, m_Intrinsic<Intrinsic::pseudoprobe>())) { + LLVM_DEBUG(dbgs() << "EarlyCSE skipping pseudoprobe: " << Inst << '\n'); + continue; + } + // We can skip all invariant.start intrinsics since they only read memory, // and we can forward values across it. For invariant starts without // invariant ends, we can use the fact that the invariantness never ends to @@ -1642,6 +1648,16 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, return PA; } +void EarlyCSEPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<EarlyCSEPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << "<"; + if (UseMemorySSA) + OS << "memssa"; + OS << ">"; +} + namespace { /// A simple and fast domtree-based CSE pass. 
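The printPipeline overloads added here for EarlyCSE (and further down for GVNPass) describe the configured pass in the textual pipeline syntax, so the configured options can be printed back out and fed in again. Illustrative invocations matching the strings printed above (spellings assumed from those strings, not taken from this diff):

  opt -passes='early-cse<memssa>' -S in.ll
  opt -passes='gvn<no-pre;no-load-pre>' -S in.ll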
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp index 8a5d4f568774..a98bb8358aef 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -256,7 +256,7 @@ void Float2IntPass::walkForwards() { Op = [](ArrayRef<ConstantRange> Ops) { assert(Ops.size() == 1 && "FNeg is a unary operator!"); unsigned Size = Ops[0].getBitWidth(); - auto Zero = ConstantRange(APInt::getNullValue(Size)); + auto Zero = ConstantRange(APInt::getZero(Size)); return Zero.sub(Ops[0]); }; break; @@ -372,7 +372,7 @@ bool Float2IntPass::validateAndTransform() { // If it does, transformation would be illegal. // // Don't count the roots, as they terminate the graphs. - if (Roots.count(I) == 0) { + if (!Roots.contains(I)) { // Set the type of the conversion while we're here. if (!ConvertedToTy) ConvertedToTy = I->getType(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp index 16368aec7c3f..00506fb86006 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp @@ -126,7 +126,7 @@ static cl::opt<uint32_t> MaxBBSpeculations( "into) when deducing if a value is fully available or not in GVN " "(default = 600)")); -struct llvm::GVN::Expression { +struct llvm::GVNPass::Expression { uint32_t opcode; bool commutative = false; Type *type = nullptr; @@ -155,17 +155,18 @@ struct llvm::GVN::Expression { namespace llvm { -template <> struct DenseMapInfo<GVN::Expression> { - static inline GVN::Expression getEmptyKey() { return ~0U; } - static inline GVN::Expression getTombstoneKey() { return ~1U; } +template <> struct DenseMapInfo<GVNPass::Expression> { + static inline GVNPass::Expression getEmptyKey() { return ~0U; } + static inline GVNPass::Expression getTombstoneKey() { return ~1U; } - static unsigned getHashValue(const GVN::Expression &e) { + static unsigned getHashValue(const GVNPass::Expression &e) { using llvm::hash_value; return static_cast<unsigned>(hash_value(e)); } - static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) { + static bool isEqual(const GVNPass::Expression &LHS, + const GVNPass::Expression &RHS) { return LHS == RHS; } }; @@ -246,7 +247,7 @@ struct llvm::gvn::AvailableValue { /// Emit code at the specified insertion point to adjust the value defined /// here to the specified type. This handles various coercion cases. Value *MaterializeAdjustedValue(LoadInst *Load, Instruction *InsertPt, - GVN &gvn) const; + GVNPass &gvn) const; }; /// Represents an AvailableValue which can be rematerialized at the end of @@ -276,7 +277,7 @@ struct llvm::gvn::AvailableValueInBlock { /// Emit code at the end of this block to adjust the value defined here to /// the specified type. This handles various coercion cases. 
- Value *MaterializeAdjustedValue(LoadInst *Load, GVN &gvn) const { + Value *MaterializeAdjustedValue(LoadInst *Load, GVNPass &gvn) const { return AV.MaterializeAdjustedValue(Load, BB->getTerminator(), gvn); } }; @@ -285,7 +286,7 @@ struct llvm::gvn::AvailableValueInBlock { // ValueTable Internal Functions //===----------------------------------------------------------------------===// -GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { +GVNPass::Expression GVNPass::ValueTable::createExpr(Instruction *I) { Expression e; e.type = I->getType(); e.opcode = I->getOpcode(); @@ -330,9 +331,8 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { return e; } -GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, - CmpInst::Predicate Predicate, - Value *LHS, Value *RHS) { +GVNPass::Expression GVNPass::ValueTable::createCmpExpr( + unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && "Not a comparison!"); Expression e; @@ -350,7 +350,8 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, return e; } -GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { +GVNPass::Expression +GVNPass::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { assert(EI && "Not an ExtractValueInst?"); Expression e; e.type = EI->getType(); @@ -382,20 +383,21 @@ GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { // ValueTable External Functions //===----------------------------------------------------------------------===// -GVN::ValueTable::ValueTable() = default; -GVN::ValueTable::ValueTable(const ValueTable &) = default; -GVN::ValueTable::ValueTable(ValueTable &&) = default; -GVN::ValueTable::~ValueTable() = default; -GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default; +GVNPass::ValueTable::ValueTable() = default; +GVNPass::ValueTable::ValueTable(const ValueTable &) = default; +GVNPass::ValueTable::ValueTable(ValueTable &&) = default; +GVNPass::ValueTable::~ValueTable() = default; +GVNPass::ValueTable & +GVNPass::ValueTable::operator=(const GVNPass::ValueTable &Arg) = default; /// add - Insert a value into the table with a specified value number. -void GVN::ValueTable::add(Value *V, uint32_t num) { +void GVNPass::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); if (PHINode *PN = dyn_cast<PHINode>(V)) NumberingPhi[num] = PN; } -uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { +uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); uint32_t e = assignExpNewValueNum(exp).first; @@ -421,13 +423,12 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { // a normal load or store instruction. 
CallInst *local_cdep = dyn_cast<CallInst>(local_dep.getInst()); - if (!local_cdep || - local_cdep->getNumArgOperands() != C->getNumArgOperands()) { + if (!local_cdep || local_cdep->arg_size() != C->arg_size()) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; } - for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { + for (unsigned i = 0, e = C->arg_size(); i < e; ++i) { uint32_t c_vn = lookupOrAdd(C->getArgOperand(i)); uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i)); if (c_vn != cd_vn) { @@ -477,11 +478,11 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { return nextValueNumber++; } - if (cdep->getNumArgOperands() != C->getNumArgOperands()) { + if (cdep->arg_size() != C->arg_size()) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; } - for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { + for (unsigned i = 0, e = C->arg_size(); i < e; ++i) { uint32_t c_vn = lookupOrAdd(C->getArgOperand(i)); uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i)); if (c_vn != cd_vn) { @@ -500,11 +501,13 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { } /// Returns true if a value number exists for the specified value. -bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } +bool GVNPass::ValueTable::exists(Value *V) const { + return valueNumbering.count(V) != 0; +} /// lookup_or_add - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. -uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { +uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V); if (VI != valueNumbering.end()) return VI->second; @@ -581,7 +584,7 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { +uint32_t GVNPass::ValueTable::lookup(Value *V, bool Verify) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); if (Verify) { assert(VI != valueNumbering.end() && "Value not numbered?"); @@ -594,15 +597,15 @@ uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { /// assigning it a new number if it did not have one before. Useful when /// we deduced the result of a comparison, but don't immediately have an /// instruction realizing that comparison to hand. -uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, - CmpInst::Predicate Predicate, - Value *LHS, Value *RHS) { +uint32_t GVNPass::ValueTable::lookupOrAddCmp(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. -void GVN::ValueTable::clear() { +void GVNPass::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); NumberingPhi.clear(); @@ -614,7 +617,7 @@ void GVN::ValueTable::clear() { } /// Remove a value from the value numbering. -void GVN::ValueTable::erase(Value *V) { +void GVNPass::ValueTable::erase(Value *V) { uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); // If V is PHINode, V <--> value number is an one-to-one mapping. @@ -624,7 +627,7 @@ void GVN::ValueTable::erase(Value *V) { /// verifyRemoved - Verify that the value is removed from all internal data /// structures. 
-void GVN::ValueTable::verifyRemoved(const Value *V) const { +void GVNPass::ValueTable::verifyRemoved(const Value *V) const { for (DenseMap<Value*, uint32_t>::const_iterator I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) { assert(I->first != V && "Inst still occurs in value numbering map!"); @@ -635,28 +638,28 @@ void GVN::ValueTable::verifyRemoved(const Value *V) const { // GVN Pass //===----------------------------------------------------------------------===// -bool GVN::isPREEnabled() const { +bool GVNPass::isPREEnabled() const { return Options.AllowPRE.getValueOr(GVNEnablePRE); } -bool GVN::isLoadPREEnabled() const { +bool GVNPass::isLoadPREEnabled() const { return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE); } -bool GVN::isLoadInLoopPREEnabled() const { +bool GVNPass::isLoadInLoopPREEnabled() const { return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE); } -bool GVN::isLoadPRESplitBackedgeEnabled() const { +bool GVNPass::isLoadPRESplitBackedgeEnabled() const { return Options.AllowLoadPRESplitBackedge.getValueOr( GVNEnableSplitBackedgeInLoadPRE); } -bool GVN::isMemDepEnabled() const { +bool GVNPass::isMemDepEnabled() const { return Options.AllowMemDep.getValueOr(GVNEnableMemDep); } -PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { +PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { // FIXME: The order of evaluation of these 'getResult' calls is very // significant! Re-ordering these variables will cause GVN when run alone to // be less effective! We should fix memdep and basic-aa to not exhibit this @@ -684,8 +687,26 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { return PA; } +void GVNPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<GVNPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + + OS << "<"; + if (Options.AllowPRE != None) + OS << (Options.AllowPRE.getValue() ? "" : "no-") << "pre;"; + if (Options.AllowLoadPRE != None) + OS << (Options.AllowLoadPRE.getValue() ? "" : "no-") << "load-pre;"; + if (Options.AllowLoadPRESplitBackedge != None) + OS << (Options.AllowLoadPRESplitBackedge.getValue() ? "" : "no-") + << "split-backedge-load-pre;"; + if (Options.AllowMemDep != None) + OS << (Options.AllowMemDep.getValue() ? "" : "no-") << "memdep"; + OS << ">"; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const { +LLVM_DUMP_METHOD void GVNPass::dump(DenseMap<uint32_t, Value *> &d) const { errs() << "{\n"; for (auto &I : d) { errs() << I.first << "\n"; @@ -835,7 +856,7 @@ static bool IsValueFullyAvailableInBlock( static Value * ConstructSSAForLoadSet(LoadInst *Load, SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, - GVN &gvn) { + GVNPass &gvn) { // Check for the fully redundant, dominating load case. In this case, we can // just use the dominating value directly. 
if (ValuesPerBlock.size() == 1 && @@ -878,7 +899,7 @@ ConstructSSAForLoadSet(LoadInst *Load, Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, Instruction *InsertPt, - GVN &gvn) const { + GVNPass &gvn) const { Value *Res; Type *LoadTy = Load->getType(); const DataLayout &DL = Load->getModule()->getDataLayout(); @@ -1002,8 +1023,8 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo, ORE->emit(R); } -bool GVN::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, - Value *Address, AvailableValue &Res) { +bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, + Value *Address, AvailableValue &Res) { assert((DepInfo.isDef() || DepInfo.isClobber()) && "expected a local dependence"); assert(Load->isUnordered() && "rules below are incorrect for ordered access"); @@ -1137,9 +1158,9 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, return false; } -void GVN::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps, - AvailValInBlkVect &ValuesPerBlock, - UnavailBlkVect &UnavailableBlocks) { +void GVNPass::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Filter out useless results (non-locals, etc). Keep track of the blocks // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call @@ -1182,7 +1203,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps, "post condition violation"); } -void GVN::eliminatePartiallyRedundantLoad( +void GVNPass::eliminatePartiallyRedundantLoad( LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, MapVector<BasicBlock *, Value *> &AvailableLoads) { for (const auto &AvailableLoad : AvailableLoads) { @@ -1212,8 +1233,7 @@ void GVN::eliminatePartiallyRedundantLoad( } // Transfer the old load's AA tags to the new load. - AAMDNodes Tags; - Load->getAAMetadata(Tags); + AAMDNodes Tags = Load->getAAMetadata(); if (Tags) NewLoad->setAAMetadata(Tags); @@ -1257,8 +1277,8 @@ void GVN::eliminatePartiallyRedundantLoad( }); } -bool GVN::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, - UnavailBlkVect &UnavailableBlocks) { +bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Okay, we have *some* definitions of the value. This means that the value // is available in some of our (transitive) predecessors. Lets think about // doing PRE of this load. This will involve inserting a new load into the @@ -1498,8 +1518,9 @@ bool GVN::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, return true; } -bool GVN::performLoopLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, - UnavailBlkVect &UnavailableBlocks) { +bool GVNPass::performLoopLoadPRE(LoadInst *Load, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { if (!LI) return false; @@ -1590,7 +1611,7 @@ static void reportLoadElim(LoadInst *Load, Value *AvailableValue, /// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. -bool GVN::processNonLocalLoad(LoadInst *Load) { +bool GVNPass::processNonLocalLoad(LoadInst *Load) { // non-local speculations are not allowed under asan. 
if (Load->getParent()->getParent()->hasFnAttribute( Attribute::SanitizeAddress) || @@ -1622,10 +1643,8 @@ bool GVN::processNonLocalLoad(LoadInst *Load) { // If this load follows a GEP, see if we can PRE the indices before analyzing. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Load->getOperand(0))) { - for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(), - OE = GEP->idx_end(); - OI != OE; ++OI) - if (Instruction *I = dyn_cast<Instruction>(OI->get())) + for (Use &U : GEP->indices()) + if (Instruction *I = dyn_cast<Instruction>(U.get())) Changed |= performScalarPRE(I); } @@ -1673,8 +1692,11 @@ bool GVN::processNonLocalLoad(LoadInst *Load) { if (!isLoadInLoopPREEnabled() && LI && LI->getLoopFor(Load->getParent())) return Changed; - return Changed || PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks) || - performLoopLoadPRE(Load, ValuesPerBlock, UnavailableBlocks); + if (performLoopLoadPRE(Load, ValuesPerBlock, UnavailableBlocks) || + PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks)) + return true; + + return Changed; } static bool impliesEquivalanceIfTrue(CmpInst* Cmp) { @@ -1738,7 +1760,7 @@ static bool hasUsersIn(Value *V, BasicBlock *BB) { return false; } -bool GVN::processAssumeIntrinsic(AssumeInst *IntrinsicI) { +bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { Value *V = IntrinsicI->getArgOperand(0); if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { @@ -1882,7 +1904,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { /// Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. -bool GVN::processLoad(LoadInst *L) { +bool GVNPass::processLoad(LoadInst *L) { if (!MD) return false; @@ -1936,7 +1958,7 @@ bool GVN::processLoad(LoadInst *L) { /// Return a pair the first field showing the value number of \p Exp and the /// second field showing whether it is a value number newly created. std::pair<uint32_t, bool> -GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { +GVNPass::ValueTable::assignExpNewValueNum(Expression &Exp) { uint32_t &e = expressionNumbering[Exp]; bool CreateNewValNum = !e; if (CreateNewValNum) { @@ -1951,8 +1973,8 @@ GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { /// Return whether all the values related with the same \p num are /// defined in \p BB. -bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, - GVN &Gvn) { +bool GVNPass::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVNPass &Gvn) { LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; while (Vals && Vals->BB == BB) Vals = Vals->Next; @@ -1960,9 +1982,9 @@ bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, } /// Wrap phiTranslateImpl to provide caching functionality. -uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, - const BasicBlock *PhiBlock, uint32_t Num, - GVN &Gvn) { +uint32_t GVNPass::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVNPass &Gvn) { auto FindRes = PhiTranslateTable.find({Num, Pred}); if (FindRes != PhiTranslateTable.end()) return FindRes->second; @@ -1973,9 +1995,10 @@ uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, // Return true if the value number \p Num and NewNum have equal value. // Return false if the result is unknown. 
-bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum, - const BasicBlock *Pred, - const BasicBlock *PhiBlock, GVN &Gvn) { +bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum, + const BasicBlock *Pred, + const BasicBlock *PhiBlock, + GVNPass &Gvn) { CallInst *Call = nullptr; LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; while (Vals) { @@ -2008,9 +2031,9 @@ bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum, /// Translate value number \p Num using phis, so that it has the values of /// the phis in BB. -uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, - const BasicBlock *PhiBlock, - uint32_t Num, GVN &Gvn) { +uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVNPass &Gvn) { if (PHINode *PN = NumberingPhi[Num]) { for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) @@ -2063,8 +2086,8 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, /// Erase stale entry from phiTranslate cache so phiTranslate can be computed /// again. -void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num, - const BasicBlock &CurrBlock) { +void GVNPass::ValueTable::eraseTranslateCacheEntry( + uint32_t Num, const BasicBlock &CurrBlock) { for (const BasicBlock *Pred : predecessors(&CurrBlock)) PhiTranslateTable.erase({Num, Pred}); } @@ -2074,7 +2097,7 @@ void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num, // and then scan the list to find one whose block dominates the block in // question. This is fast because dominator tree queries consist of only // a few comparisons of DFS numbers. -Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { +Value *GVNPass::findLeader(const BasicBlock *BB, uint32_t num) { LeaderTableEntry Vals = LeaderTable[num]; if (!Vals.Val) return nullptr; @@ -2113,7 +2136,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } -void GVN::assignBlockRPONumber(Function &F) { +void GVNPass::assignBlockRPONumber(Function &F) { BlockRPONumber.clear(); uint32_t NextBlockNumber = 1; ReversePostOrderTraversal<Function *> RPOT(&F); @@ -2122,7 +2145,7 @@ void GVN::assignBlockRPONumber(Function &F) { InvalidBlockRPONumbers = false; } -bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const { +bool GVNPass::replaceOperandsForInBlockEquality(Instruction *Instr) const { bool Changed = false; for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { Value *Operand = Instr->getOperand(OpNum); @@ -2142,8 +2165,9 @@ bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const { /// 'RHS' everywhere in the scope. Returns whether a change was made. /// If DominatesByEdge is false, then it means that we will propagate the RHS /// value starting from the end of Root.Start. 
-bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, - bool DominatesByEdge) { +bool GVNPass::propagateEquality(Value *LHS, Value *RHS, + const BasicBlockEdge &Root, + bool DominatesByEdge) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; @@ -2291,7 +2315,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, /// When calculating availability, handle an instruction /// by inserting it into the appropriate sets -bool GVN::processInstruction(Instruction *I) { +bool GVNPass::processInstruction(Instruction *I) { // Ignore dbg info intrinsics. if (isa<DbgInfoIntrinsic>(I)) return false; @@ -2432,10 +2456,10 @@ bool GVN::processInstruction(Instruction *I) { } /// runOnFunction - This is the main transformation entry point for a function. -bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, - const TargetLibraryInfo &RunTLI, AAResults &RunAA, - MemoryDependenceResults *RunMD, LoopInfo *LI, - OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) { +bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, + const TargetLibraryInfo &RunTLI, AAResults &RunAA, + MemoryDependenceResults *RunMD, LoopInfo *LI, + OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) { AC = &RunAC; DT = &RunDT; VN.setDomTree(DT); @@ -2457,10 +2481,8 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { - BasicBlock *BB = &*FI++; - - bool removedBlock = MergeBlockIntoPredecessor(BB, &DTU, LI, MSSAU, MD); + for (BasicBlock &BB : llvm::make_early_inc_range(F)) { + bool removedBlock = MergeBlockIntoPredecessor(&BB, &DTU, LI, MSSAU, MD); if (removedBlock) ++NumGVNBlocks; @@ -2502,7 +2524,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, return Changed; } -bool GVN::processBlock(BasicBlock *BB) { +bool GVNPass::processBlock(BasicBlock *BB) { // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function // (and incrementing BI before processing an instruction). assert(InstrsToErase.empty() && @@ -2563,8 +2585,8 @@ bool GVN::processBlock(BasicBlock *BB) { } // Instantiate an expression in a predecessor that lacked it. -bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, - BasicBlock *Curr, unsigned int ValNo) { +bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, + BasicBlock *Curr, unsigned int ValNo) { // Because we are going top-down through the block, all value numbers // will be available in the predecessor by the time we need them. Any // that weren't originally present will have been instantiated earlier @@ -2612,7 +2634,7 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, return true; } -bool GVN::performScalarPRE(Instruction *CurInst) { +bool GVNPass::performScalarPRE(Instruction *CurInst) { if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() || isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || @@ -2797,7 +2819,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { /// Perform a purely local form of PRE that looks for diamond /// control flow patterns and attempts to perform simple PRE at the join point. 
-bool GVN::performPRE(Function &F) { +bool GVNPass::performPRE(Function &F) { bool Changed = false; for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { // Nothing to PRE in the entry block. @@ -2824,7 +2846,7 @@ bool GVN::performPRE(Function &F) { /// Split the critical edge connecting the given two blocks, and return /// the block inserted to the critical edge. -BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { +BasicBlock *GVNPass::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { // GVN does not require loop-simplify, do not try to preserve it if it is not // possible. BasicBlock *BB = SplitCriticalEdge( @@ -2840,7 +2862,7 @@ BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { /// Split critical edges found during the previous /// iteration that may enable further optimization. -bool GVN::splitCriticalEdges() { +bool GVNPass::splitCriticalEdges() { if (toSplit.empty()) return false; @@ -2860,7 +2882,7 @@ bool GVN::splitCriticalEdges() { } /// Executes one iteration of GVN -bool GVN::iterateOnFunction(Function &F) { +bool GVNPass::iterateOnFunction(Function &F) { cleanupGlobalSets(); // Top-down walk of the dominator tree @@ -2876,7 +2898,7 @@ bool GVN::iterateOnFunction(Function &F) { return Changed; } -void GVN::cleanupGlobalSets() { +void GVNPass::cleanupGlobalSets() { VN.clear(); LeaderTable.clear(); BlockRPONumber.clear(); @@ -2887,7 +2909,7 @@ void GVN::cleanupGlobalSets() { /// Verify that the specified instruction does not occur in our /// internal data structures. -void GVN::verifyRemoved(const Instruction *Inst) const { +void GVNPass::verifyRemoved(const Instruction *Inst) const { VN.verifyRemoved(Inst); // Walk through the value number scope to make sure the instruction isn't @@ -2907,7 +2929,7 @@ void GVN::verifyRemoved(const Instruction *Inst) const { /// function is to add all these blocks to "DeadBlocks". For the dead blocks' /// live successors, update their phi nodes by replacing the operands /// corresponding to dead blocks with UndefVal. -void GVN::addDeadBlock(BasicBlock *BB) { +void GVNPass::addDeadBlock(BasicBlock *BB) { SmallVector<BasicBlock *, 4> NewDead; SmallSetVector<BasicBlock *, 4> DF; @@ -2995,7 +3017,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { // dead blocks with "UndefVal" in an hope these PHIs will optimized away. // // Return true iff *NEW* dead code are found. -bool GVN::processFoldableCondBr(BranchInst *BI) { +bool GVNPass::processFoldableCondBr(BranchInst *BI) { if (!BI || BI->isUnconditional()) return false; @@ -3023,7 +3045,7 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { // associated val-num. As it normally has far more live instructions than dead // instructions, it makes more sense just to "fabricate" a val-number for the // dead code than checking if instruction involved is dead or not. 
-void GVN::assignValNumForDeadCode() { +void GVNPass::assignValNumForDeadCode() { for (BasicBlock *BB : DeadBlocks) { for (Instruction &Inst : *BB) { unsigned ValNum = VN.lookupOrAdd(&Inst); @@ -3078,7 +3100,7 @@ public: } private: - GVN Impl; + GVNPass Impl; }; char GVNLegacyPass::ID = 0; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 790d71992da4..fdc3afd9348a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -169,7 +169,7 @@ class InsnInfo { public: // Inserts I and its value number in VNtoScalars. - void insert(Instruction *I, GVN::ValueTable &VN) { + void insert(Instruction *I, GVNPass::ValueTable &VN) { // Scalar instruction. unsigned V = VN.lookupOrAdd(I); VNtoScalars[{V, InvalidVN}].push_back(I); @@ -184,7 +184,7 @@ class LoadInfo { public: // Insert Load and the value number of its memory address in VNtoLoads. - void insert(LoadInst *Load, GVN::ValueTable &VN) { + void insert(LoadInst *Load, GVNPass::ValueTable &VN) { if (Load->isSimple()) { unsigned V = VN.lookupOrAdd(Load->getPointerOperand()); VNtoLoads[{V, InvalidVN}].push_back(Load); @@ -201,7 +201,7 @@ class StoreInfo { public: // Insert the Store and a hash number of the store address and the stored // value in VNtoStores. - void insert(StoreInst *Store, GVN::ValueTable &VN) { + void insert(StoreInst *Store, GVNPass::ValueTable &VN) { if (!Store->isSimple()) return; // Hash the store address and the stored value. @@ -221,7 +221,7 @@ class CallInfo { public: // Insert Call and its value numbering in one of the VNtoCalls* containers. - void insert(CallInst *Call, GVN::ValueTable &VN) { + void insert(CallInst *Call, GVNPass::ValueTable &VN) { // A call that doesNotAccessMemory is handled as a Scalar, // onlyReadsMemory will be handled as a Load instruction, // all other calls will be handled as stores. @@ -274,7 +274,7 @@ public: unsigned int rank(const Value *V) const; private: - GVN::ValueTable VN; + GVNPass::ValueTable VN; DominatorTree *DT; PostDominatorTree *PDT; AliasAnalysis *AA; @@ -377,12 +377,12 @@ private: if (!Root) return; // Depth first walk on PDom tree to fill the CHIargs at each PDF. - RenameStackType RenameStack; for (auto Node : depth_first(Root)) { BasicBlock *BB = Node->getBlock(); if (!BB) continue; + RenameStackType RenameStack; // Collect all values in BB and push to stack. fillRenameStack(BB, ValueBBs, RenameStack); @@ -827,6 +827,8 @@ void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs, auto it1 = ValueBBs.find(BB); if (it1 != ValueBBs.end()) { // Iterate in reverse order to keep lower ranked values on the top. 
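The GVNHoist hunk above narrows RenameStack from one container shared by the whole post-dominator walk to one constructed per visited block, so values pushed while renaming one block cannot linger into the next. The scoping difference, reduced to plain standard C++ with illustrative names:

    #include <string>
    #include <vector>

    // Loop-local state: the stack is built empty for each block and destroyed
    // at the end of the iteration, so nothing leaks across blocks.
    static void visitBlocksSketch(const std::vector<std::string> &Blocks) {
      for (const std::string &Block : Blocks) {
        std::vector<std::string> RenameStack; // fresh per block
        RenameStack.push_back(Block);         // stand-in for fillRenameStack()
        // ... consume RenameStack for this block only ...
      }
    }
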
+ LLVM_DEBUG(dbgs() << "\nVisiting: " << BB->getName() + << " for pushing instructions on stack";); for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) { // Get the value of instruction I LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 61eb4ce0ed46..82b81003ef21 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -46,6 +46,7 @@ #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" @@ -105,8 +106,10 @@ static void setCondition(Instruction *I, Value *NewCond) { } // Eliminates the guard instruction properly. -static void eliminateGuard(Instruction *GuardInst) { +static void eliminateGuard(Instruction *GuardInst, MemorySSAUpdater *MSSAU) { GuardInst->eraseFromParent(); + if (MSSAU) + MSSAU->removeMemoryAccess(GuardInst); ++GuardsEliminated; } @@ -114,6 +117,7 @@ class GuardWideningImpl { DominatorTree &DT; PostDominatorTree *PDT; LoopInfo &LI; + MemorySSAUpdater *MSSAU; /// Together, these describe the region of interest. This might be all of /// the blocks within a function, or only a given loop's blocks and preheader. @@ -269,12 +273,12 @@ class GuardWideningImpl { } public: - explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT, - LoopInfo &LI, DomTreeNode *Root, + LoopInfo &LI, MemorySSAUpdater *MSSAU, + DomTreeNode *Root, std::function<bool(BasicBlock*)> BlockFilter) - : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) - {} + : DT(DT), PDT(PDT), LI(LI), MSSAU(MSSAU), Root(Root), + BlockFilter(BlockFilter) {} /// The entry point for this pass. bool run(); @@ -313,7 +317,7 @@ bool GuardWideningImpl::run() { if (!WidenedGuards.count(I)) { assert(isa<ConstantInt>(getCondition(I)) && "Should be!"); if (isSupportedGuardInstruction(I)) - eliminateGuard(I); + eliminateGuard(I, MSSAU); else { assert(isa<BranchInst>(I) && "Eliminated something other than guard or branch?"); @@ -514,27 +518,20 @@ bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, ConstantRange CR1 = ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue()); - // SubsetIntersect is a subset of the actual mathematical intersection of - // CR0 and CR1, while SupersetIntersect is a superset of the actual - // mathematical intersection. If these two ConstantRanges are equal, then - // we know we were able to represent the actual mathematical intersection - // of CR0 and CR1, and can use the same to generate an icmp instruction. - // // Given what we're doing here and the semantics of guards, it would - // actually be correct to just use SubsetIntersect, but that may be too + // be correct to use a subset intersection, but that may be too // aggressive in cases we care about. 
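The hunk that follows drops the hand-rolled SubsetIntersect/SupersetIntersect comparison in favour of ConstantRange::exactIntersectWith, which yields a range only when the true intersection is exactly representable. A minimal sketch of recomputing a widened guard condition from two icmp regions; widenToSingleICmp is a hypothetical helper, while the ConstantRange calls are the ones used in the patch.

    #include "llvm/ADT/APInt.h"
    #include "llvm/ADT/Optional.h"
    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"

    // Try to express (LHS pred0 C0) && (LHS pred1 C1) as one icmp region.
    // C0 and C1 must have the same bit width. Returns false when the exact
    // intersection is not a single range or has no equivalent icmp.
    static bool widenToSingleICmp(llvm::CmpInst::Predicate P0, const llvm::APInt &C0,
                                  llvm::CmpInst::Predicate P1, const llvm::APInt &C1,
                                  llvm::CmpInst::Predicate &NewPred,
                                  llvm::APInt &NewRHS) {
      using llvm::ConstantRange;
      ConstantRange R0 = ConstantRange::makeExactICmpRegion(P0, C0);
      ConstantRange R1 = ConstantRange::makeExactICmpRegion(P1, C1);
      if (llvm::Optional<ConstantRange> Exact = R0.exactIntersectWith(R1))
        return Exact->getEquivalentICmp(NewPred, NewRHS);
      return false;
    }
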
- auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse(); - auto SupersetIntersect = CR0.intersectWith(CR1); - - APInt NewRHSAP; - CmpInst::Predicate Pred; - if (SubsetIntersect == SupersetIntersect && - SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) { - if (InsertPt) { - ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP); - Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk"); + if (Optional<ConstantRange> Intersect = CR0.exactIntersectWith(CR1)) { + APInt NewRHSAP; + CmpInst::Predicate Pred; + if (Intersect->getEquivalentICmp(Pred, NewRHSAP)) { + if (InsertPt) { + ConstantInt *NewRHS = + ConstantInt::get(Cond0->getContext(), NewRHSAP); + Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk"); + } + return true; } - return true; } } } @@ -766,12 +763,18 @@ PreservedAnalyses GuardWideningPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); - if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), - [](BasicBlock*) { return true; } ).run()) + auto *MSSAA = AM.getCachedResult<MemorySSAAnalysis>(F); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (MSSAA) + MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAA->getMSSA()); + if (!GuardWideningImpl(DT, &PDT, LI, MSSAU ? MSSAU.get() : nullptr, + DT.getRootNode(), [](BasicBlock *) { return true; }) + .run()) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); + PA.preserve<MemorySSAAnalysis>(); return PA; } @@ -784,11 +787,17 @@ PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM, auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L.contains(BB); }; - if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB), - BlockFilter).run()) + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (AR.MSSA) + MSSAU = std::make_unique<MemorySSAUpdater>(AR.MSSA); + if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, MSSAU ? MSSAU.get() : nullptr, + AR.DT.getNode(RootBB), BlockFilter).run()) return PreservedAnalyses::all(); - return getLoopPassPreservedAnalyses(); + auto PA = getLoopPassPreservedAnalyses(); + if (AR.MSSA) + PA.preserve<MemorySSAAnalysis>(); + return PA; } namespace { @@ -805,8 +814,14 @@ struct GuardWideningLegacyPass : public FunctionPass { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), - [](BasicBlock*) { return true; } ).run(); + auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>(); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (MSSAWP) + MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAWP->getMSSA()); + return GuardWideningImpl(DT, &PDT, LI, MSSAU ? MSSAU.get() : nullptr, + DT.getRootNode(), + [](BasicBlock *) { return true; }) + .run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -814,6 +829,7 @@ struct GuardWideningLegacyPass : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); } }; @@ -833,13 +849,18 @@ struct LoopGuardWideningLegacyPass : public LoopPass { auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); auto *PDT = PDTWP ? 
&PDTWP->getPostDomTree() : nullptr; + auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>(); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (MSSAWP) + MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAWP->getMSSA()); + BasicBlock *RootBB = L->getLoopPredecessor(); if (!RootBB) RootBB = L->getHeader(); auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L->contains(BB); }; - return GuardWideningImpl(DT, PDT, LI, + return GuardWideningImpl(DT, PDT, LI, MSSAU ? MSSAU.get() : nullptr, DT.getNode(RootBB), BlockFilter).run(); } @@ -847,6 +868,7 @@ struct LoopGuardWideningLegacyPass : public LoopPass { AU.setPreservesCFG(); getLoopAnalysisUsage(AU); AU.addPreserved<PostDominatorTreeWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); } }; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 9ee2a2d0bf08..ae2fe2767074 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -89,6 +89,7 @@ #include <utility> using namespace llvm; +using namespace PatternMatch; #define DEBUG_TYPE "indvars" @@ -155,6 +156,10 @@ class IndVarSimplify { bool rewriteNonIntegerIVs(Loop *L); bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); + /// Try to improve our exit conditions by converting condition from signed + /// to unsigned or rotating computation out of the loop. + /// (See inline comment about why this is duplicated from simplifyAndExtend) + bool canonicalizeExitCondition(Loop *L); /// Try to eliminate loop exits based on analyzeable exit counts bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); /// Try to form loop invariant tests for loop exits by changing how many @@ -494,6 +499,7 @@ bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { MadeAnyChanges = true; PN.setIncomingValue(IncomingValIdx, ExitVal->getIncomingValue(PreheaderIdx)); + SE->forgetValue(&PN); } } } @@ -541,18 +547,18 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, return; } - if (!WI.WidestNativeType) { + if (!WI.WidestNativeType || + Width > SE->getTypeSizeInBits(WI.WidestNativeType)) { WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); WI.IsSigned = IsSigned; return; } - // We extend the IV to satisfy the sign of its first user, arbitrarily. - if (WI.IsSigned != IsSigned) - return; - - if (Width > SE->getTypeSizeInBits(WI.WidestNativeType)) - WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); + // We extend the IV to satisfy the sign of its user(s), or 'signed' + // if there are multiple users with both sign- and zero extensions, + // in order not to introduce nondeterministic behaviour based on the + // unspecified order of a PHI nodes' users-iterator. + WI.IsSigned |= IsSigned; } //===----------------------------------------------------------------------===// @@ -1274,9 +1280,9 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { // Skip debug info intrinsics. 
do { --I; - } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin()); + } while (I->isDebugOrPseudoInst() && I != Preheader->begin()); - if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin()) + if (I->isDebugOrPseudoInst() && I == Preheader->begin()) Done = true; } else { Done = true; @@ -1309,6 +1315,18 @@ static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken, replaceExitCond(BI, NewCond, DeadInsts); } +static void replaceLoopPHINodesWithPreheaderValues( + Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts) { + assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!"); + auto *LoopPreheader = L->getLoopPreheader(); + auto *LoopHeader = L->getHeader(); + for (auto &PN : LoopHeader->phis()) { + auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader); + PN.replaceAllUsesWith(PreheaderIncoming); + DeadInsts.emplace_back(&PN); + } +} + static void replaceWithInvariantCond( const Loop *L, BasicBlock *ExitingBB, ICmpInst::Predicate InvariantPred, const SCEV *InvariantLHS, const SCEV *InvariantRHS, SCEVExpander &Rewriter, @@ -1333,7 +1351,6 @@ static bool optimizeLoopExitWithUnknownExitCount( SmallVectorImpl<WeakTrackingVH> &DeadInsts) { ICmpInst::Predicate Pred; Value *LHS, *RHS; - using namespace PatternMatch; BasicBlock *TrueSucc, *FalseSucc; if (!match(BI, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), m_BasicBlock(TrueSucc), m_BasicBlock(FalseSucc)))) @@ -1394,6 +1411,140 @@ static bool optimizeLoopExitWithUnknownExitCount( return true; } +bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { + // Note: This is duplicating a particular part on SimplifyIndVars reasoning. + // We need to duplicate it because given icmp zext(small-iv), C, IVUsers + // never reaches the icmp since the zext doesn't fold to an AddRec unless + // it already has flags. The alternative to this would be to extending the + // set of "interesting" IV users to include the icmp, but doing that + // regresses results in practice by querying SCEVs before trip counts which + // rely on them which results in SCEV caching sub-optimal answers. The + // concern about caching sub-optimal results is why we only query SCEVs of + // the loop invariant RHS here. + SmallVector<BasicBlock*, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + bool Changed = false; + for (auto *ExitingBB : ExitingBlocks) { + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + assert(BI->isConditional() && "exit branch must be conditional"); + + auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); + if (!ICmp || !ICmp->hasOneUse()) + continue; + + auto *LHS = ICmp->getOperand(0); + auto *RHS = ICmp->getOperand(1); + // For the range reasoning, avoid computing SCEVs in the loop to avoid + // poisoning cache with sub-optimal results. For the must-execute case, + // this is a neccessary precondition for correctness. 
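The range test used in the code below (and again for the rotate case further on) is the load-bearing fact: when the loop-invariant RHS always fits in the zero-extended image of the narrow type, both compare operands are known non-negative in the wide type, so the signed predicate can be replaced with its unsigned counterpart. Shown in isolation, with fitsInZExtOfNarrowType as a hypothetical helper and assuming OuterBits > InnerBits, as for a genuine zext:

    #include "llvm/IR/ConstantRange.h"

    // True if RHSRange (an unsigned range at OuterBits width) is contained in
    // the zero-extension of the narrow type's full range, i.e. [0, 2^InnerBits).
    static bool fitsInZExtOfNarrowType(unsigned InnerBits, unsigned OuterBits,
                                       const llvm::ConstantRange &RHSRange) {
      llvm::ConstantRange Full = llvm::ConstantRange::getFull(InnerBits);
      return Full.zeroExtend(OuterBits).contains(RHSRange);
    }
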
+ if (!L->isLoopInvariant(RHS)) { + if (!L->isLoopInvariant(LHS)) + continue; + // Same logic applies for the inverse case + std::swap(LHS, RHS); + } + + // Match (icmp signed-cond zext, RHS) + Value *LHSOp = nullptr; + if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned()) + continue; + + const DataLayout &DL = ExitingBB->getModule()->getDataLayout(); + const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); + const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); + auto FullCR = ConstantRange::getFull(InnerBitWidth); + FullCR = FullCR.zeroExtend(OuterBitWidth); + auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); + if (FullCR.contains(RHSCR)) { + // We have now matched icmp signed-cond zext(X), zext(Y'), and can thus + // replace the signed condition with the unsigned version. + ICmp->setPredicate(ICmp->getUnsignedPredicate()); + Changed = true; + // Note: No SCEV invalidation needed. We've changed the predicate, but + // have not changed exit counts, or the values produced by the compare. + continue; + } + } + + // Now that we've canonicalized the condition to match the extend, + // see if we can rotate the extend out of the loop. + for (auto *ExitingBB : ExitingBlocks) { + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + assert(BI->isConditional() && "exit branch must be conditional"); + + auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); + if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned()) + continue; + + bool Swapped = false; + auto *LHS = ICmp->getOperand(0); + auto *RHS = ICmp->getOperand(1); + if (L->isLoopInvariant(LHS) == L->isLoopInvariant(RHS)) + // Nothing to rotate + continue; + if (L->isLoopInvariant(LHS)) { + // Same logic applies for the inverse case until we actually pick + // which operand of the compare to update. + Swapped = true; + std::swap(LHS, RHS); + } + assert(!L->isLoopInvariant(LHS) && L->isLoopInvariant(RHS)); + + // Match (icmp unsigned-cond zext, RHS) + // TODO: Extend to handle corresponding sext/signed-cmp case + // TODO: Extend to other invertible functions + Value *LHSOp = nullptr; + if (!match(LHS, m_ZExt(m_Value(LHSOp)))) + continue; + + // In general, we only rotate if we can do so without increasing the number + // of instructions. The exception is when we have an zext(add-rec). The + // reason for allowing this exception is that we know we need to get rid + // of the zext for SCEV to be able to compute a trip count for said loops; + // we consider the new trip count valuable enough to increase instruction + // count by one. + if (!LHS->hasOneUse() && !isa<SCEVAddRecExpr>(SE->getSCEV(LHSOp))) + continue; + + // Given a icmp unsigned-cond zext(Op) where zext(trunc(RHS)) == RHS + // replace with an icmp of the form icmp unsigned-cond Op, trunc(RHS) + // when zext is loop varying and RHS is loop invariant. This converts + // loop varying work to loop-invariant work. + auto doRotateTransform = [&]() { + assert(ICmp->isUnsigned() && "must have proven unsigned already"); + auto *NewRHS = + CastInst::Create(Instruction::Trunc, RHS, LHSOp->getType(), "", + L->getLoopPreheader()->getTerminator()); + ICmp->setOperand(Swapped ? 1 : 0, LHSOp); + ICmp->setOperand(Swapped ? 
0 : 1, NewRHS); + if (LHS->use_empty()) + DeadInsts.push_back(LHS); + }; + + + const DataLayout &DL = ExitingBB->getModule()->getDataLayout(); + const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); + const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); + auto FullCR = ConstantRange::getFull(InnerBitWidth); + FullCR = FullCR.zeroExtend(OuterBitWidth); + auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); + if (FullCR.contains(RHSCR)) { + doRotateTransform(); + Changed = true; + // Note, we are leaving SCEV in an unfortunately imprecise case here + // as rotation tends to reveal information about trip counts not + // previously visible. + continue; + } + } + + return Changed; +} + bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -1499,20 +1650,18 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { // If we know we'd exit on the first iteration, rewrite the exit to // reflect this. This does not imply the loop must exit through this // exit; there may be an earlier one taken on the first iteration. - // TODO: Given we know the backedge can't be taken, we should go ahead - // and break it. Or at least, kill all the header phis and simplify. + // We know that the backedge can't be taken, so we replace all + // the header PHIs with values coming from the preheader. if (ExitCount->isZero()) { foldExit(L, ExitingBB, true, DeadInsts); + replaceLoopPHINodesWithPreheaderValues(L, DeadInsts); Changed = true; continue; } - // If we end up with a pointer exit count, bail. Note that we can end up - // with a pointer exit count for one exiting block, and not for another in - // the same loop. - if (!ExitCount->getType()->isIntegerTy() || - !MaxExitCount->getType()->isIntegerTy()) - continue; + assert(ExitCount->getType()->isIntegerTy() && + MaxExitCount->getType()->isIntegerTy() && + "Exit counts must be integers"); Type *WiderType = SE->getWiderType(MaxExitCount->getType(), ExitCount->getType()); @@ -1569,14 +1718,11 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { // through *explicit* control flow. We have to eliminate the possibility of // implicit exits (see below) before we know it's truly exact. const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); - if (isa<SCEVCouldNotCompute>(ExactBTC) || - !SE->isLoopInvariant(ExactBTC, L) || - !isSafeToExpand(ExactBTC, *SE)) + if (isa<SCEVCouldNotCompute>(ExactBTC) || !isSafeToExpand(ExactBTC, *SE)) return false; - // If we end up with a pointer exit count, bail. It may be unsized. - if (!ExactBTC->getType()->isIntegerTy()) - return false; + assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant"); + assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer"); auto BadExit = [&](BasicBlock *ExitingBB) { // If our exiting block exits multiple loops, we can only rewrite the @@ -1603,15 +1749,12 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { return true; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); - if (isa<SCEVCouldNotCompute>(ExitCount) || - !SE->isLoopInvariant(ExitCount, L) || - !isSafeToExpand(ExitCount, *SE)) - return true; - - // If we end up with a pointer exit count, bail. It may be unsized. 
- if (!ExitCount->getType()->isIntegerTy()) + if (isa<SCEVCouldNotCompute>(ExitCount) || !isSafeToExpand(ExitCount, *SE)) return true; + assert(SE->isLoopInvariant(ExitCount, L) && + "Exit count must be loop invariant"); + assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer"); return false; }; @@ -1781,7 +1924,11 @@ bool IndVarSimplify::run(Loop *L) { } // Eliminate redundant IV cycles. - NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI); + + // Try to convert exit conditions to unsigned and rotate computation + // out of the loop. Note: Handles invalidation internally if needed. + Changed |= canonicalizeExitCondition(L); // Try to eliminate loop exits based on analyzeable exit counts if (optimizeLoopExits(L, Rewriter)) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index f7d631f5e785..883d4afff3bd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -96,10 +96,13 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" @@ -115,6 +118,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -146,6 +150,14 @@ static const unsigned UninitializedAddressSpace = namespace { using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; +// Different from ValueToAddrSpaceMapTy, where a new addrspace is inferred on +// the *def* of a value, PredicatedAddrSpaceMapTy is map where a new +// addrspace is inferred on the *use* of a pointer. This map is introduced to +// infer addrspace from the addrspace predicate assumption built from assume +// intrinsic. In that scenario, only specific uses (under valid assumption +// context) could be inferred with a new addrspace. +using PredicatedAddrSpaceMapTy = + DenseMap<std::pair<const Value *, const Value *>, unsigned>; using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>; class InferAddressSpaces : public FunctionPass { @@ -160,6 +172,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } @@ -167,6 +181,8 @@ public: }; class InferAddressSpacesImpl { + AssumptionCache &AC; + DominatorTree *DT = nullptr; const TargetTransformInfo *TTI = nullptr; const DataLayout *DL = nullptr; @@ -174,21 +190,24 @@ class InferAddressSpacesImpl { /// possible. unsigned FlatAddrSpace = 0; - // Returns the new address space of V if updated; otherwise, returns None. - Optional<unsigned> - updateAddressSpace(const Value &V, - const ValueToAddrSpaceMapTy &InferredAddrSpace) const; + // Try to update the address space of V. If V is updated, returns true and + // false otherwise. 
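As the new comment above stresses, the predicated map is keyed by a (user, pointer operand) pair rather than by the pointer alone, because an assume only justifies the narrower address space at uses it dominates. A hedged sketch of that keying; lookupPredicatedAS is an illustrative helper, and the map's element type mirrors the typedef introduced above.

    #include <utility>
    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/Value.h"

    // Address spaces proven by an assume are recorded per use, not per def.
    using UseKeyedASMap =
        llvm::DenseMap<std::pair<const llvm::Value *, const llvm::Value *>, unsigned>;

    // Return the address space proven for Ptr at this particular user, or
    // DefaultAS (e.g. the flat address space) if nothing was proven here.
    static unsigned lookupPredicatedAS(const UseKeyedASMap &Map,
                                       const llvm::Value *UserInst,
                                       const llvm::Value *Ptr,
                                       unsigned DefaultAS) {
      auto It = Map.find(std::make_pair(UserInst, Ptr));
      return It == Map.end() ? DefaultAS : It->second;
    }
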
+ bool updateAddressSpace(const Value &V, + ValueToAddrSpaceMapTy &InferredAddrSpace, + PredicatedAddrSpaceMapTy &PredicatedAS) const; // Tries to infer the specific address space of each address expression in // Postorder. void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder, - ValueToAddrSpaceMapTy *InferredAddrSpace) const; + ValueToAddrSpaceMapTy &InferredAddrSpace, + PredicatedAddrSpaceMapTy &PredicatedAS) const; bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const; Value *cloneInstructionWithNewAddressSpace( Instruction *I, unsigned NewAddrSpace, const ValueToValueMapTy &ValueWithNewAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, SmallVectorImpl<const Use *> *UndefUsesToFix) const; // Changes the flat address expressions in function F to point to specific @@ -196,7 +215,8 @@ class InferAddressSpacesImpl { // all flat expressions in the use-def graph of function F. bool rewriteWithNewAddressSpaces( const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, - const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const; + const ValueToAddrSpaceMapTy &InferredAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const; void appendsFlatAddressExpressionToPostorderStack( Value *V, PostorderStackTy &PostorderStack, @@ -211,14 +231,18 @@ class InferAddressSpacesImpl { std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const; Value *cloneValueWithNewAddressSpace( - Value *V, unsigned NewAddrSpace, - const ValueToValueMapTy &ValueWithNewAddrSpace, - SmallVectorImpl<const Use *> *UndefUsesToFix) const; + Value *V, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, + SmallVectorImpl<const Use *> *UndefUsesToFix) const; unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const; + unsigned getPredicatedAddrSpace(const Value &V, Value *Opnd) const; + public: - InferAddressSpacesImpl(const TargetTransformInfo *TTI, unsigned FlatAddrSpace) - : TTI(TTI), FlatAddrSpace(FlatAddrSpace) {} + InferAddressSpacesImpl(AssumptionCache &AC, DominatorTree *DT, + const TargetTransformInfo *TTI, unsigned FlatAddrSpace) + : AC(AC), DT(DT), TTI(TTI), FlatAddrSpace(FlatAddrSpace) {} bool run(Function &F); }; @@ -232,8 +256,12 @@ void initializeInferAddressSpacesPass(PassRegistry &); } // end namespace llvm -INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", - false, false) +INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", + false, false) // Check whether that's no-op pointer bicast using a pair of // `ptrtoint`/`inttoptr` due to the missing no-op pointer bitcast over @@ -505,6 +533,7 @@ InferAddressSpacesImpl::collectFlatAddressExpressions(Function &F) const { static Value *operandWithNewAddressSpaceOrCreateUndef( const Use &OperandUse, unsigned NewAddrSpace, const ValueToValueMapTy &ValueWithNewAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, SmallVectorImpl<const Use *> *UndefUsesToFix) { Value *Operand = OperandUse.get(); @@ -517,6 +546,18 @@ static Value *operandWithNewAddressSpaceOrCreateUndef( if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) return NewOperand; + Instruction *Inst = cast<Instruction>(OperandUse.getUser()); + auto I = PredicatedAS.find(std::make_pair(Inst, Operand)); 
+ if (I != PredicatedAS.end()) { + // Insert an addrspacecast on that operand before the user. + unsigned NewAS = I->second; + Type *NewPtrTy = PointerType::getWithSamePointeeType( + cast<PointerType>(Operand->getType()), NewAS); + auto *NewI = new AddrSpaceCastInst(Operand, NewPtrTy); + NewI->insertBefore(Inst); + return NewI; + } + UndefUsesToFix->push_back(&OperandUse); return UndefValue::get(NewPtrTy); } @@ -536,6 +577,7 @@ static Value *operandWithNewAddressSpaceOrCreateUndef( Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace( Instruction *I, unsigned NewAddrSpace, const ValueToValueMapTy &ValueWithNewAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, SmallVectorImpl<const Use *> *UndefUsesToFix) const { Type *NewPtrType = PointerType::getWithSamePointeeType( cast<PointerType>(I->getType()), NewAddrSpace); @@ -557,7 +599,7 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace( assert(II->getIntrinsicID() == Intrinsic::ptrmask); Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef( II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace, - UndefUsesToFix); + PredicatedAS, UndefUsesToFix); Value *Rewrite = TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr); if (Rewrite) { @@ -586,7 +628,8 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace( NewPointerOperands.push_back(nullptr); else NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef( - OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix)); + OperandUse, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, + UndefUsesToFix)); } switch (I->getOpcode()) { @@ -708,9 +751,8 @@ static Value *cloneConstantExprWithNewAddressSpace( if (CE->getOpcode() == Instruction::GetElementPtr) { // Needs to specify the source type while constructing a getelementptr // constant expression. - return CE->getWithOperands( - NewOperands, TargetType, /*OnlyIfReduced=*/false, - NewOperands[0]->getType()->getPointerElementType()); + return CE->getWithOperands(NewOperands, TargetType, /*OnlyIfReduced=*/false, + cast<GEPOperator>(CE)->getSourceElementType()); } return CE->getWithOperands(NewOperands, TargetType); @@ -724,6 +766,7 @@ static Value *cloneConstantExprWithNewAddressSpace( Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace( Value *V, unsigned NewAddrSpace, const ValueToValueMapTy &ValueWithNewAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, SmallVectorImpl<const Use *> *UndefUsesToFix) const { // All values in Postorder are flat address expressions. assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace && @@ -731,7 +774,7 @@ Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace( if (Instruction *I = dyn_cast<Instruction>(V)) { Value *NewV = cloneInstructionWithNewAddressSpace( - I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix); + I, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, UndefUsesToFix); if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) { if (NewI->getParent() == nullptr) { NewI->insertBefore(I); @@ -779,46 +822,43 @@ bool InferAddressSpacesImpl::run(Function &F) { // Runs a data-flow analysis to refine the address spaces of every expression // in Postorder. ValueToAddrSpaceMapTy InferredAddrSpace; - inferAddressSpaces(Postorder, &InferredAddrSpace); + PredicatedAddrSpaceMapTy PredicatedAS; + inferAddressSpaces(Postorder, InferredAddrSpace, PredicatedAS); // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. 
- return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F); + return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, + PredicatedAS, &F); } // Constants need to be tracked through RAUW to handle cases with nested // constant expressions, so wrap values in WeakTrackingVH. void InferAddressSpacesImpl::inferAddressSpaces( ArrayRef<WeakTrackingVH> Postorder, - ValueToAddrSpaceMapTy *InferredAddrSpace) const { + ValueToAddrSpaceMapTy &InferredAddrSpace, + PredicatedAddrSpaceMapTy &PredicatedAS) const { SetVector<Value *> Worklist(Postorder.begin(), Postorder.end()); // Initially, all expressions are in the uninitialized address space. for (Value *V : Postorder) - (*InferredAddrSpace)[V] = UninitializedAddressSpace; + InferredAddrSpace[V] = UninitializedAddressSpace; while (!Worklist.empty()) { Value *V = Worklist.pop_back_val(); - // Tries to update the address space of the stack top according to the + // Try to update the address space of the stack top according to the // address spaces of its operands. - LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n'); - Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace); - if (!NewAS.hasValue()) + if (!updateAddressSpace(*V, InferredAddrSpace, PredicatedAS)) continue; - // If any updates are made, grabs its users to the worklist because - // their address spaces can also be possibly updated. - LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n'); - (*InferredAddrSpace)[V] = NewAS.getValue(); for (Value *User : V->users()) { // Skip if User is already in the worklist. if (Worklist.count(User)) continue; - auto Pos = InferredAddrSpace->find(User); + auto Pos = InferredAddrSpace.find(User); // Our algorithm only updates the address spaces of flat address // expressions, which are those in InferredAddrSpace. - if (Pos == InferredAddrSpace->end()) + if (Pos == InferredAddrSpace.end()) continue; // Function updateAddressSpace moves the address space down a lattice @@ -832,10 +872,37 @@ void InferAddressSpacesImpl::inferAddressSpaces( } } -Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace( - const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const { +unsigned InferAddressSpacesImpl::getPredicatedAddrSpace(const Value &V, + Value *Opnd) const { + const Instruction *I = dyn_cast<Instruction>(&V); + if (!I) + return UninitializedAddressSpace; + + Opnd = Opnd->stripInBoundsOffsets(); + for (auto &AssumeVH : AC.assumptionsFor(Opnd)) { + if (!AssumeVH) + continue; + CallInst *CI = cast<CallInst>(AssumeVH); + if (!isValidAssumeForContext(CI, I, DT)) + continue; + + const Value *Ptr; + unsigned AS; + std::tie(Ptr, AS) = TTI->getPredicatedAddrSpace(CI->getArgOperand(0)); + if (Ptr) + return AS; + } + + return UninitializedAddressSpace; +} + +bool InferAddressSpacesImpl::updateAddressSpace( + const Value &V, ValueToAddrSpaceMapTy &InferredAddrSpace, + PredicatedAddrSpaceMapTy &PredicatedAS) const { assert(InferredAddrSpace.count(&V)); + LLVM_DEBUG(dbgs() << "Updating the address space of\n " << V << '\n'); + // The new inferred address space equals the join of the address spaces // of all its pointer operands. unsigned NewAS = UninitializedAddressSpace; @@ -861,7 +928,7 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace( // address space is known. 
if ((C1 && Src0AS == UninitializedAddressSpace) || (C0 && Src1AS == UninitializedAddressSpace)) - return None; + return false; if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS)) NewAS = Src1AS; @@ -878,10 +945,23 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace( // Otherwise, infer the address space from its pointer operands. for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) { auto I = InferredAddrSpace.find(PtrOperand); - unsigned OperandAS = - I != InferredAddrSpace.end() - ? I->second - : PtrOperand->getType()->getPointerAddressSpace(); + unsigned OperandAS; + if (I == InferredAddrSpace.end()) { + OperandAS = PtrOperand->getType()->getPointerAddressSpace(); + if (OperandAS == FlatAddrSpace) { + // Check AC for assumption dominating V. + unsigned AS = getPredicatedAddrSpace(V, PtrOperand); + if (AS != UninitializedAddressSpace) { + LLVM_DEBUG(dbgs() + << " deduce operand AS from the predicate addrspace " + << AS << '\n'); + OperandAS = AS; + // Record this use with the predicated AS. + PredicatedAS[std::make_pair(&V, PtrOperand)] = OperandAS; + } + } + } else + OperandAS = I->second; // join(flat, *) = flat. So we can break if NewAS is already flat. NewAS = joinAddressSpaces(NewAS, OperandAS); @@ -894,8 +974,13 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace( unsigned OldAS = InferredAddrSpace.lookup(&V); assert(OldAS != FlatAddrSpace); if (OldAS == NewAS) - return None; - return NewAS; + return false; + + // If any updates are made, grabs its users to the worklist because + // their address spaces can also be possibly updated. + LLVM_DEBUG(dbgs() << " to " << NewAS << '\n'); + InferredAddrSpace[&V] = NewAS; + return true; } /// \p returns true if \p U is the pointer operand of a memory instruction with @@ -1026,7 +1111,8 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I, bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, - const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const { + const ValueToAddrSpaceMapTy &InferredAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const { // For each address expression to be modified, creates a clone of it with its // pointer operands converted to the new address space. 
Since the pointer // operands are converted, the clone is naturally in the new address space by @@ -1042,8 +1128,9 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( continue; if (V->getType()->getPointerAddressSpace() != NewAddrSpace) { - Value *New = cloneValueWithNewAddressSpace( - V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix); + Value *New = + cloneValueWithNewAddressSpace(V, NewAddrSpace, ValueWithNewAddrSpace, + PredicatedAS, &UndefUsesToFix); if (New) ValueWithNewAddrSpace[V] = New; } @@ -1155,8 +1242,9 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) { unsigned NewAS = NewV->getType()->getPointerAddressSpace(); if (ASC->getDestAddressSpace() == NewAS) { - if (ASC->getType()->getPointerElementType() != - NewV->getType()->getPointerElementType()) { + if (!cast<PointerType>(ASC->getType()) + ->hasSameElementTypeAs( + cast<PointerType>(NewV->getType()))) { NewV = CastInst::Create(Instruction::BitCast, NewV, ASC->getType(), "", ASC); } @@ -1199,7 +1287,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) { if (skipFunction(F)) return false; + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; return InferAddressSpacesImpl( + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), DT, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F), FlatAddrSpace) .run(F); @@ -1217,11 +1308,14 @@ InferAddressSpacesPass::InferAddressSpacesPass(unsigned AddressSpace) PreservedAnalyses InferAddressSpacesPass::run(Function &F, FunctionAnalysisManager &AM) { bool Changed = - InferAddressSpacesImpl(&AM.getResult<TargetIRAnalysis>(F), FlatAddrSpace) + InferAddressSpacesImpl(AM.getResult<AssumptionAnalysis>(F), + AM.getCachedResult<DominatorTreeAnalysis>(F), + &AM.getResult<TargetIRAnalysis>(F), FlatAddrSpace) .run(F); if (Changed) { PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); + PA.preserve<DominatorTreeAnalysis>(); return PA; } return PreservedAnalyses::all(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9dc3b0351346..fe9a7211967c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -331,7 +331,7 @@ bool JumpThreading::runOnFunction(Function &F) { BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(), + bool Changed = Impl.runImpl(F, TLI, TTI, LVI, AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (PrintLVIAfterJumpThreading) { dbgs() << "LVI for function '" << F.getName() << "':\n"; @@ -360,7 +360,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(), + bool Changed = runImpl(F, &TLI, &TTI, &LVI, &AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (PrintLVIAfterJumpThreading) { @@ -377,12 +377,14 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, } bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, - LazyValueInfo *LVI_, AliasAnalysis *AA_, - DomTreeUpdater *DTU_, bool HasProfileData_, + TargetTransformInfo *TTI_, LazyValueInfo *LVI_, + AliasAnalysis *AA_, DomTreeUpdater *DTU_, + bool HasProfileData_, std::unique_ptr<BlockFrequencyInfo> BFI_, 
std::unique_ptr<BranchProbabilityInfo> BPI_) { LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = TLI_; + TTI = TTI_; LVI = LVI_; AA = AA_; DTU = DTU_; @@ -514,7 +516,8 @@ static void replaceFoldableUses(Instruction *Cond, Value *ToVal) { /// Return the cost of duplicating a piece of this block from first non-phi /// and before StopAt instruction to thread across it. Stop scanning the block /// when exceeding the threshold. If duplication is impossible, returns ~0U. -static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, +static unsigned getJumpThreadDuplicationCost(const TargetTransformInfo *TTI, + BasicBlock *BB, Instruction *StopAt, unsigned Threshold) { assert(StopAt->getParent() == BB && "Not an instruction from proper BB?"); @@ -550,26 +553,21 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, if (Size > Threshold) return Size; - // Debugger intrinsics don't incur code size. - if (isa<DbgInfoIntrinsic>(I)) continue; - - // Pseudo-probes don't incur code size. - if (isa<PseudoProbeInst>(I)) - continue; - - // If this is a pointer->pointer bitcast, it is free. - if (isa<BitCastInst>(I) && I->getType()->isPointerTy()) - continue; - - // Freeze instruction is free, too. - if (isa<FreezeInst>(I)) - continue; - // Bail out if this instruction gives back a token type, it is not possible // to duplicate it if it is used outside this BB. if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) return ~0U; + // Blocks with NoDuplicate are modelled as having infinite cost, so they + // are never duplicated. + if (const CallInst *CI = dyn_cast<CallInst>(I)) + if (CI->cannotDuplicate() || CI->isConvergent()) + return ~0U; + + if (TTI->getUserCost(&*I, TargetTransformInfo::TCK_SizeAndLatency) + == TargetTransformInfo::TCC_Free) + continue; + // All other instructions count for at least one unit. ++Size; @@ -578,11 +576,7 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->cannotDuplicate() || CI->isConvergent()) - // Blocks with NoDuplicate are modelled as having infinite cost, so they - // are never duplicated. - return ~0U; - else if (!isa<IntrinsicInst>(CI)) + if (!isa<IntrinsicInst>(CI)) Size += 3; else if (!CI->getType()->isVectorTy()) Size += 1; @@ -1363,8 +1357,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // If all of the loads and stores that feed the value have the same AA tags, // then we can propagate them onto any newly inserted loads. - AAMDNodes AATags; - LoadI->getAAMetadata(AATags); + AAMDNodes AATags = LoadI->getAAMetadata(); SmallPtrSet<BasicBlock*, 8> PredsScanned; @@ -2235,10 +2228,10 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB, } // Compute the cost of duplicating BB and PredBB. - unsigned BBCost = - getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); + unsigned BBCost = getJumpThreadDuplicationCost( + TTI, BB, BB->getTerminator(), BBDupThreshold); unsigned PredBBCost = getJumpThreadDuplicationCost( - PredBB, PredBB->getTerminator(), BBDupThreshold); + TTI, PredBB, PredBB->getTerminator(), BBDupThreshold); // Give up if costs are too high. 
We need to check BBCost and PredBBCost // individually before checking their sum because getJumpThreadDuplicationCost @@ -2346,8 +2339,8 @@ bool JumpThreadingPass::tryThreadEdge( return false; } - unsigned JumpThreadCost = - getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); + unsigned JumpThreadCost = getJumpThreadDuplicationCost( + TTI, BB, BB->getTerminator(), BBDupThreshold); if (JumpThreadCost > BBDupThreshold) { LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName() << "' - Cost is too high: " << JumpThreadCost << "\n"); @@ -2615,8 +2608,8 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( return false; } - unsigned DuplicationCost = - getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); + unsigned DuplicationCost = getJumpThreadDuplicationCost( + TTI, BB, BB->getTerminator(), BBDupThreshold); if (DuplicationCost > BBDupThreshold) { LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() << "' - Cost is too high: " << DuplicationCost << "\n"); @@ -3032,7 +3025,8 @@ bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard, ValueToValueMapTy UnguardedMapping, GuardedMapping; Instruction *AfterGuard = Guard->getNextNode(); - unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold); + unsigned Cost = + getJumpThreadDuplicationCost(TTI, BB, AfterGuard, BBDupThreshold); if (Cost > BBDupThreshold) return false; // Duplicate all instructions before the guard and the guard itself to the diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index 30058df3ded5..bf714d167670 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -117,13 +117,6 @@ static cl::opt<uint32_t> MaxNumUsesTraversed( cl::desc("Max num uses visited for identifying load " "invariance in loop using invariant start (default = 8)")); -// Default value of zero implies we use the regular alias set tracker mechanism -// instead of the cross product using AA to identify aliasing of the memory -// location we are interested in. -static cl::opt<int> -LICMN2Theshold("licm-n2-threshold", cl::Hidden, cl::init(0), - cl::desc("How many instruction to cross product using AA")); - // Experimental option to allow imprecision in LICM in pathological cases, in // exchange for faster compile. This is to be removed if MemorySSA starts to // address the same issue. 
This flag applies only when LICM uses MemorySSA @@ -151,7 +144,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap( static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, - TargetTransformInfo *TTI, bool &FreeInLoop); + TargetTransformInfo *TTI, bool &FreeInLoop, + bool LoopNestMode); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, ScalarEvolution *SE, @@ -180,7 +174,7 @@ static Instruction *cloneInstructionInExitBlock( const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU); static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, - AliasSetTracker *AST, MemorySSAUpdater *MSSAU); + MemorySSAUpdater *MSSAU); static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, @@ -206,9 +200,6 @@ struct LoopInvariantCodeMotion { private: unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; - - std::unique_ptr<AliasSetTracker> - collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AAResults *AA); }; struct LegacyLICMPass : public LoopPass { @@ -228,9 +219,7 @@ struct LegacyLICMPass : public LoopPass { << L->getHeader()->getNameOrAsOperand() << "\n"); auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); - MemorySSA *MSSA = EnableMSSALoopDependency - ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA()) - : nullptr; + MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); bool hasProfileData = L->getHeader()->getParent()->hasProfileData(); BlockFrequencyInfo *BFI = hasProfileData ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() @@ -257,10 +246,8 @@ struct LegacyLICMPass : public LoopPass { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - if (EnableMSSALoopDependency) { - AU.addRequired<MemorySSAWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); getLoopAnalysisUsage(AU); LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); @@ -275,6 +262,9 @@ private: PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { + if (!AR.MSSA) + report_fatal_error("LICM requires MemorySSA (loop-mssa)"); + // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis // pass. Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). @@ -289,8 +279,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, PA.preserve<DominatorTreeAnalysis>(); PA.preserve<LoopAnalysis>(); - if (AR.MSSA) - PA.preserve<MemorySSAAnalysis>(); + PA.preserve<MemorySSAAnalysis>(); return PA; } @@ -298,6 +287,9 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { + if (!AR.MSSA) + report_fatal_error("LNICM requires MemorySSA (loop-mssa)"); + // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis // pass. Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). 
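Both new-PM entry points above now issue report_fatal_error when MemorySSA is missing, so LICM has to be scheduled under the loop-mssa adaptor; a typical invocation would look roughly like opt -passes='loop-mssa(licm)' input.ll -S, though the exact pipeline spelling may vary between releases.
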
@@ -316,8 +308,7 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, PA.preserve<DominatorTreeAnalysis>(); PA.preserve<LoopAnalysis>(); - if (AR.MSSA) - PA.preserve<MemorySSAAnalysis>(); + PA.preserve<MemorySSAAnalysis>(); return PA; } @@ -386,10 +377,6 @@ bool LoopInvariantCodeMotion::runOnLoop( return false; } - std::unique_ptr<AliasSetTracker> CurAST; - std::unique_ptr<MemorySSAUpdater> MSSAU; - std::unique_ptr<SinkAndHoistLICMFlags> Flags; - // Don't sink stores from loops with coroutine suspend instructions. // LICM would sink instructions into the default destination of // the coroutine switch. The default destination of the switch is to @@ -406,17 +393,9 @@ bool LoopInvariantCodeMotion::runOnLoop( }); }); - if (!MSSA) { - LLVM_DEBUG(dbgs() << "LICM: Using Alias Set Tracker.\n"); - CurAST = collectAliasInfoForLoop(L, LI, AA); - Flags = std::make_unique<SinkAndHoistLICMFlags>( - LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true); - } else { - LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n"); - MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); - Flags = std::make_unique<SinkAndHoistLICMFlags>( - LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true, L, MSSA); - } + MemorySSAUpdater MSSAU(MSSA); + SinkAndHoistLICMFlags Flags(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*IsSink=*/true, L, MSSA); // Get the preheader block to move instructions into... BasicBlock *Preheader = L->getLoopPreheader(); @@ -435,14 +414,16 @@ bool LoopInvariantCodeMotion::runOnLoop( // us to sink instructions in one pass, without iteration. After sinking // instructions, we perform another pass to hoist them out of the loop. if (L->hasDedicatedExits()) - Changed |= - sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L, - CurAST.get(), MSSAU.get(), &SafetyInfo, *Flags.get(), ORE); - Flags->setIsSink(false); + Changed |= LoopNestMode + ? sinkRegionForLoopNest(DT->getNode(L->getHeader()), AA, LI, + DT, BFI, TLI, TTI, L, &MSSAU, + &SafetyInfo, Flags, ORE) + : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, + TLI, TTI, L, &MSSAU, &SafetyInfo, Flags, ORE); + Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, - CurAST.get(), MSSAU.get(), SE, &SafetyInfo, - *Flags.get(), ORE, LoopNestMode); + &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -452,7 +433,7 @@ bool LoopInvariantCodeMotion::runOnLoop( // preheader for SSA updater, so also avoid sinking when no preheader // is available. 
if (!DisablePromotion && Preheader && L->hasDedicatedExits() && - !Flags->tooManyMemoryAccesses() && !HasCoroSuspendInst) { + !Flags.tooManyMemoryAccesses() && !HasCoroSuspendInst) { // Figure out the loop exits and their insertion points SmallVector<BasicBlock *, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -466,55 +447,29 @@ bool LoopInvariantCodeMotion::runOnLoop( SmallVector<Instruction *, 8> InsertPts; SmallVector<MemoryAccess *, 8> MSSAInsertPts; InsertPts.reserve(ExitBlocks.size()); - if (MSSAU) - MSSAInsertPts.reserve(ExitBlocks.size()); + MSSAInsertPts.reserve(ExitBlocks.size()); for (BasicBlock *ExitBlock : ExitBlocks) { InsertPts.push_back(&*ExitBlock->getFirstInsertionPt()); - if (MSSAU) - MSSAInsertPts.push_back(nullptr); + MSSAInsertPts.push_back(nullptr); } PredIteratorCache PIC; + // Promoting one set of accesses may make the pointers for another set + // loop invariant, so run this in a loop (with the MaybePromotable set + // decreasing in size over time). bool Promoted = false; - if (CurAST.get()) { - // Loop over all of the alias sets in the tracker object. - for (AliasSet &AS : *CurAST) { - // We can promote this alias set if it has a store, if it is a "Must" - // alias set, if the pointer is loop invariant, and if we are not - // eliminating any volatile loads or stores. - if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || - !L->isLoopInvariant(AS.begin()->getValue())) - continue; - - assert( - !AS.empty() && - "Must alias set should have at least one pointer element in it!"); - - SmallSetVector<Value *, 8> PointerMustAliases; - for (const auto &ASI : AS) - PointerMustAliases.insert(ASI.getValue()); - - Promoted |= promoteLoopAccessesToScalars( - PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, - DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE); + bool LocalPromoted; + do { + LocalPromoted = false; + for (const SmallSetVector<Value *, 8> &PointerMustAliases : + collectPromotionCandidates(MSSA, AA, L)) { + LocalPromoted |= promoteLoopAccessesToScalars( + PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, + LI, DT, TLI, L, &MSSAU, &SafetyInfo, ORE); } - } else { - // Promoting one set of accesses may make the pointers for another set - // loop invariant, so run this in a loop (with the MaybePromotable set - // decreasing in size over time). 
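The comment kept above is the rationale for the do/while that replaces the AliasSetTracker walk: every successful promotion can make further pointers loop invariant, so candidate collection is repeated until a sweep makes no change. Reduced to its shape, with runToFixedPoint as a hypothetical driver:

    #include <functional>

    // Re-run one promotion sweep until it stops reporting progress; return
    // whether anything changed at all.
    static bool runToFixedPoint(const std::function<bool()> &Sweep) {
      bool AnyChange = false;
      bool Changed;
      do {
        Changed = Sweep(); // e.g. one pass over collectPromotionCandidates(...)
        AnyChange |= Changed;
      } while (Changed);
      return AnyChange;
    }
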
- bool LocalPromoted; - do { - LocalPromoted = false; - for (const SmallSetVector<Value *, 8> &PointerMustAliases : - collectPromotionCandidates(MSSA, AA, L)) { - LocalPromoted |= promoteLoopAccessesToScalars( - PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, - LI, DT, TLI, L, /*AST*/nullptr, MSSAU.get(), &SafetyInfo, ORE); - } - Promoted |= LocalPromoted; - } while (LocalPromoted); - } + Promoted |= LocalPromoted; + } while (LocalPromoted); // Once we have promoted values across the loop body we have to // recursively reform LCSSA as any nested loop may now have values defined @@ -536,8 +491,8 @@ bool LoopInvariantCodeMotion::runOnLoop( assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) && "Parent loop not left in LCSSA form after LICM!"); - if (MSSAU.get() && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + if (VerifyMemorySSA) + MSSA->verifyMemorySSA(); if (Changed && SE) SE->forgetLoopDispositions(L); @@ -552,17 +507,15 @@ bool LoopInvariantCodeMotion::runOnLoop( bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - Loop *CurLoop, AliasSetTracker *CurAST, - MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, + Loop *CurLoop, MemorySSAUpdater *MSSAU, + ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, - OptimizationRemarkEmitter *ORE) { + OptimizationRemarkEmitter *ORE, Loop *OutermostLoop) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && "Unexpected input to sinkRegion."); - assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) && - "Either AliasSetTracker or MemorySSA should be initialized."); // We want to visit children before parents. We will enque all the parents // before their children in the worklist and process the worklist in reverse @@ -587,7 +540,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, salvageKnowledge(&I); salvageDebugInfo(I); ++II; - eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); + eraseInstruction(I, *SafetyInfo, MSSAU); Changed = true; continue; } @@ -598,26 +551,46 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // operands of the instruction are loop invariant. // bool FreeInLoop = false; + bool LoopNestMode = OutermostLoop != nullptr; if (!I.mayHaveSideEffects() && - isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, - ORE)) { + isNotUsedOrFreeInLoop(I, LoopNestMode ? 
OutermostLoop : CurLoop, + SafetyInfo, TTI, FreeInLoop, LoopNestMode) && + canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/nullptr, MSSAU, true, + &Flags, ORE)) { if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FreeInLoop) { ++II; salvageDebugInfo(I); - eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); + eraseInstruction(I, *SafetyInfo, MSSAU); } Changed = true; } } } } - if (MSSAU && VerifyMemorySSA) + if (VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); return Changed; } +bool llvm::sinkRegionForLoopNest( + DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, + SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { + + bool Changed = false; + SmallPriorityWorklist<Loop *, 4> Worklist; + Worklist.insert(CurLoop); + appendLoopsToWorklist(*CurLoop, Worklist); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, + TTI, L, MSSAU, SafetyInfo, Flags, ORE, CurLoop); + } + return Changed; +} + namespace { // This is a helper class for hoistRegion to make it able to hoist control flow // in order to be able to hoist phis. The way this works is that we initially @@ -820,9 +793,8 @@ public: if (HoistTarget == InitialPreheader) { // Phis in the loop header now need to use the new preheader. InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc); - if (MSSAU) - MSSAU->wireOldPredecessorsToNewImmediatePredecessor( - HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget}); + MSSAU->wireOldPredecessorsToNewImmediatePredecessor( + HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget}); // The new preheader dominates the loop header. DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc); DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader()); @@ -884,16 +856,14 @@ static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE, bool LoopNestMode) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && "Unexpected input to hoistRegion."); - assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) && - "Either AliasSetTracker or MemorySSA should be initialized."); ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU); @@ -913,8 +883,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, if (!LoopNestMode && inSubLoop(BB, CurLoop, LI)) continue; - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { - Instruction &I = *II++; + for (Instruction &I : llvm::make_early_inc_range(*BB)) { // Try constant folding this instruction. If all the operands are // constants, it is technically hoistable, but it would be better to // just fold it. 
@@ -922,12 +891,10 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, &I, I.getModule()->getDataLayout(), TLI)) { LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); - if (CurAST) - CurAST->copyValue(&I, C); // FIXME MSSA: Such replacements may make accesses unoptimized (D51960). I.replaceAllUsesWith(C); if (isInstructionTriviallyDead(&I, TLI)) - eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); + eraseInstruction(I, *SafetyInfo, MSSAU); Changed = true; continue; } @@ -940,8 +907,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // and we have accurately duplicated the control flow from the loop header // to that block. if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, - ORE) && + canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU, + true, &Flags, ORE) && worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, @@ -970,7 +937,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, SafetyInfo->insertInstructionTo(Product, I.getParent()); Product->insertAfter(&I); I.replaceAllUsesWith(Product); - eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); + eraseInstruction(I, *SafetyInfo, MSSAU); hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); @@ -1049,7 +1016,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, } } } - if (MSSAU && VerifyMemorySSA) + if (VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); // Now that we've finished hoisting make sure that LI and DT are still @@ -1101,6 +1068,10 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, return false; Addr = BC->getOperand(0); } + // If we've ended up at a global/constant, bail. We shouldn't be looking at + // uselists for non-local Values in a loop pass. + if (isa<Constant>(Addr)) + return false; unsigned UsesVisited = 0; // Traverse all uses of the load operand value, to see if invariant.start is @@ -1273,7 +1244,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // writes to this memory in the loop, we can hoist or sink. if (AAResults::onlyAccessesArgPointees(Behavior)) { // TODO: expand to writeable arguments - for (Value *Op : CI->arg_operands()) + for (Value *Op : CI->args()) if (Op->getType()->isPointerTy()) { bool Invalidated; if (CurAST) @@ -1443,7 +1414,8 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop, /// (e.g., a GEP can be folded into a load as an addressing mode in the loop). 
static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, - TargetTransformInfo *TTI, bool &FreeInLoop) { + TargetTransformInfo *TTI, bool &FreeInLoop, + bool LoopNestMode) { const auto &BlockColors = SafetyInfo->getBlockColors(); bool IsFree = isFreeInLoop(I, CurLoop, TTI); for (const User *U : I.users()) { @@ -1460,6 +1432,15 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, if (!BlockColors.empty() && BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1) return false; + + if (LoopNestMode) { + while (isa<PHINode>(UI) && UI->hasOneUser() && + UI->getNumOperands() == 1) { + if (!CurLoop->contains(UI)) + break; + UI = cast<Instruction>(UI->user_back()); + } + } } if (CurLoop->contains(UI)) { @@ -1546,9 +1527,7 @@ static Instruction *cloneInstructionInExitBlock( } static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, - AliasSetTracker *AST, MemorySSAUpdater *MSSAU) { - if (AST) - AST->deleteValue(&I); + MemorySSAUpdater *MSSAU) { if (MSSAU) MSSAU->removeMemoryAccess(&I); SafetyInfo.removeInstruction(&I); @@ -1599,8 +1578,7 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) { // predecessor fairly simple. if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad()) return false; - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *BBPred = *PI; + for (BasicBlock *BBPred : predecessors(BB)) { if (isa<IndirectBrInst>(BBPred->getTerminator()) || isa<CallBrInst>(BBPred->getTerminator())) return false; @@ -1786,7 +1764,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, Instruction *New = sinkThroughTriviallyReplaceablePHI( PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU); PN->replaceAllUsesWith(New); - eraseInstruction(*PN, *SafetyInfo, nullptr, nullptr); + eraseInstruction(*PN, *SafetyInfo, nullptr); Changed = true; } return Changed; @@ -1875,11 +1853,10 @@ class LoopPromoter : public LoadAndStorePromoter { SmallVectorImpl<Instruction *> &LoopInsertPts; SmallVectorImpl<MemoryAccess *> &MSSAInsertPts; PredIteratorCache &PredCache; - AliasSetTracker *AST; MemorySSAUpdater *MSSAU; LoopInfo &LI; DebugLoc DL; - int Alignment; + Align Alignment; bool UnorderedAtomic; AAMDNodes AATags; ICFLoopSafetyInfo &SafetyInfo; @@ -1907,13 +1884,13 @@ public: SmallVectorImpl<BasicBlock *> &LEB, SmallVectorImpl<Instruction *> &LIP, SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC, - AliasSetTracker *ast, MemorySSAUpdater *MSSAU, LoopInfo &li, - DebugLoc dl, int alignment, bool UnorderedAtomic, - const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo) + MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, + Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags, + ICFLoopSafetyInfo &SafetyInfo) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP), - PredCache(PIC), AST(ast), MSSAU(MSSAU), LI(li), DL(std::move(dl)), - Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags), + PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)), + Alignment(Alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags), SafetyInfo(SafetyInfo) {} bool isInstInList(Instruction *I, @@ -1940,39 +1917,29 @@ public: StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); if (UnorderedAtomic) NewSI->setOrdering(AtomicOrdering::Unordered); - NewSI->setAlignment(Align(Alignment)); + 
NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); if (AATags) NewSI->setAAMetadata(AATags); - if (MSSAU) { - MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i]; - MemoryAccess *NewMemAcc; - if (!MSSAInsertPoint) { - NewMemAcc = MSSAU->createMemoryAccessInBB( - NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning); - } else { - NewMemAcc = - MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint); - } - MSSAInsertPts[i] = NewMemAcc; - MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true); - // FIXME: true for safety, false may still be correct. + MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i]; + MemoryAccess *NewMemAcc; + if (!MSSAInsertPoint) { + NewMemAcc = MSSAU->createMemoryAccessInBB( + NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning); + } else { + NewMemAcc = + MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint); } + MSSAInsertPts[i] = NewMemAcc; + MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true); + // FIXME: true for safety, false may still be correct. } } - void replaceLoadWithValue(LoadInst *LI, Value *V) const override { - // Update alias analysis. - if (AST) - AST->copyValue(LI, V); - } void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); - if (AST) - AST->deleteValue(I); - if (MSSAU) - MSSAU->removeMemoryAccess(I); + MSSAU->removeMemoryAccess(I); } }; @@ -2023,8 +1990,8 @@ bool llvm::promoteLoopAccessesToScalars( SmallVectorImpl<Instruction *> &InsertPts, SmallVectorImpl<MemoryAccess *> &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, - Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) { + Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -2189,9 +2156,9 @@ bool llvm::promoteLoopAccessesToScalars( // Merge the AA tags. if (LoopUses.empty()) { // On the first load/store, just take its AA tags. - UI->getAAMetadata(AATags); + AATags = UI->getAAMetadata(); } else if (AATags) { - UI->getAAMetadata(AATags, /* Merge = */ true); + AATags = AATags.merge(UI->getAAMetadata()); } LoopUses.push_back(UI); @@ -2256,9 +2223,8 @@ bool llvm::promoteLoopAccessesToScalars( SmallVector<PHINode *, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - InsertPts, MSSAInsertPts, PIC, CurAST, MSSAU, *LI, DL, - Alignment.value(), SawUnorderedAtomic, AATags, - *SafetyInfo); + InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL, + Alignment, SawUnorderedAtomic, AATags, *SafetyInfo); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
@@ -2273,24 +2239,22 @@ bool llvm::promoteLoopAccessesToScalars( PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); - if (MSSAU) { - MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( - PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); - MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess); - MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); - } + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( + PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); + MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess); + MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); - if (MSSAU && VerifyMemorySSA) + if (VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); // Rewrite all the loads in the loop and remember all the definitions from // stores in the loop. Promoter.run(LoopUses); - if (MSSAU && VerifyMemorySSA) + if (VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); // If the SSAUpdater didn't use the load in the preheader, just zap it now. if (PreheaderLoad->use_empty()) - eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST, MSSAU); + eraseInstruction(*PreheaderLoad, *SafetyInfo, MSSAU); return true; } @@ -2356,71 +2320,10 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) { return Result; } -/// Returns an owning pointer to an alias set which incorporates aliasing info -/// from L and all subloops of L. -std::unique_ptr<AliasSetTracker> -LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, - AAResults *AA) { - auto CurAST = std::make_unique<AliasSetTracker>(*AA); - - // Add everything from all the sub loops. - for (Loop *InnerL : L->getSubLoops()) - for (BasicBlock *BB : InnerL->blocks()) - CurAST->add(*BB); - - // And merge in this loop (without anything from inner loops). - for (BasicBlock *BB : L->blocks()) - if (LI->getLoopFor(BB) == L) - CurAST->add(*BB); - - return CurAST; -} - static bool pointerInvalidatedByLoop(MemoryLocation MemLoc, AliasSetTracker *CurAST, Loop *CurLoop, AAResults *AA) { - // First check to see if any of the basic blocks in CurLoop invalidate *V. - bool isInvalidatedAccordingToAST = CurAST->getAliasSetFor(MemLoc).isMod(); - - if (!isInvalidatedAccordingToAST || !LICMN2Theshold) - return isInvalidatedAccordingToAST; - - // Check with a diagnostic analysis if we can refine the information above. - // This is to identify the limitations of using the AST. - // The alias set mechanism used by LICM has a major weakness in that it - // combines all things which may alias into a single set *before* asking - // modref questions. As a result, a single readonly call within a loop will - // collapse all loads and stores into a single alias set and report - // invalidation if the loop contains any store. For example, readonly calls - // with deopt states have this form and create a general alias set with all - // loads and stores. In order to get any LICM in loops containing possible - // deopt states we need a more precise invalidation of checking the mod ref - // info of each instruction within the loop and LI. This has a complexity of - // O(N^2), so currently, it is used only as a diagnostic tool since the - // default value of LICMN2Threshold is zero. - - // Don't look at nested loops. 
- if (CurLoop->begin() != CurLoop->end()) - return true; - - int N = 0; - for (BasicBlock *BB : CurLoop->getBlocks()) - for (Instruction &I : *BB) { - if (N >= LICMN2Theshold) { - LLVM_DEBUG(dbgs() << "Alasing N2 threshold exhausted for " - << *(MemLoc.Ptr) << "\n"); - return true; - } - N++; - auto Res = AA->getModRefInfo(&I, MemLoc); - if (isModSet(Res)) { - LLVM_DEBUG(dbgs() << "Aliasing failed on " << I << " for " - << *(MemLoc.Ptr) << "\n"); - return true; - } - } - LLVM_DEBUG(dbgs() << "Aliasing okay for " << *(MemLoc.Ptr) << "\n"); - return false; + return CurAST->getAliasSetFor(MemLoc).isMod(); } bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp index 993b154dc9a8..d438d56e38ca 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopBoundSplit.h" +#include "llvm/ADT/Sequence.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" @@ -39,10 +40,12 @@ struct ConditionInfo { ICmpInst::Predicate Pred; /// AddRec llvm value Value *AddRecValue; + /// Non PHI AddRec llvm value + Value *NonPHIAddRecValue; /// Bound llvm value Value *BoundValue; /// AddRec SCEV - const SCEV *AddRecSCEV; + const SCEVAddRecExpr *AddRecSCEV; /// Bound SCEV const SCEV *BoundSCEV; @@ -54,19 +57,31 @@ struct ConditionInfo { } // namespace static void analyzeICmp(ScalarEvolution &SE, ICmpInst *ICmp, - ConditionInfo &Cond) { + ConditionInfo &Cond, const Loop &L) { Cond.ICmp = ICmp; if (match(ICmp, m_ICmp(Cond.Pred, m_Value(Cond.AddRecValue), m_Value(Cond.BoundValue)))) { - Cond.AddRecSCEV = SE.getSCEV(Cond.AddRecValue); - Cond.BoundSCEV = SE.getSCEV(Cond.BoundValue); + const SCEV *AddRecSCEV = SE.getSCEV(Cond.AddRecValue); + const SCEV *BoundSCEV = SE.getSCEV(Cond.BoundValue); + const SCEVAddRecExpr *LHSAddRecSCEV = dyn_cast<SCEVAddRecExpr>(AddRecSCEV); + const SCEVAddRecExpr *RHSAddRecSCEV = dyn_cast<SCEVAddRecExpr>(BoundSCEV); // Locate AddRec in LHSSCEV and Bound in RHSSCEV. - if (isa<SCEVAddRecExpr>(Cond.BoundSCEV) && - !isa<SCEVAddRecExpr>(Cond.AddRecSCEV)) { + if (!LHSAddRecSCEV && RHSAddRecSCEV) { std::swap(Cond.AddRecValue, Cond.BoundValue); - std::swap(Cond.AddRecSCEV, Cond.BoundSCEV); + std::swap(AddRecSCEV, BoundSCEV); Cond.Pred = ICmpInst::getSwappedPredicate(Cond.Pred); } + + Cond.AddRecSCEV = dyn_cast<SCEVAddRecExpr>(AddRecSCEV); + Cond.BoundSCEV = BoundSCEV; + Cond.NonPHIAddRecValue = Cond.AddRecValue; + + // If the Cond.AddRecValue is PHI node, update Cond.NonPHIAddRecValue with + // value from backedge. + if (Cond.AddRecSCEV && isa<PHINode>(Cond.AddRecValue)) { + PHINode *PN = cast<PHINode>(Cond.AddRecValue); + Cond.NonPHIAddRecValue = PN->getIncomingValueForBlock(L.getLoopLatch()); + } } } @@ -118,21 +133,20 @@ static bool calculateUpperBound(const Loop &L, ScalarEvolution &SE, static bool hasProcessableCondition(const Loop &L, ScalarEvolution &SE, ICmpInst *ICmp, ConditionInfo &Cond, bool IsExitCond) { - analyzeICmp(SE, ICmp, Cond); + analyzeICmp(SE, ICmp, Cond, L); // The BoundSCEV should be evaluated at loop entry. 
if (!SE.isAvailableAtLoopEntry(Cond.BoundSCEV, &L)) return false; - const SCEVAddRecExpr *AddRecSCEV = dyn_cast<SCEVAddRecExpr>(Cond.AddRecSCEV); // Allowed AddRec as induction variable. - if (!AddRecSCEV) + if (!Cond.AddRecSCEV) return false; - if (!AddRecSCEV->isAffine()) + if (!Cond.AddRecSCEV->isAffine()) return false; - const SCEV *StepRecSCEV = AddRecSCEV->getStepRecurrence(SE); + const SCEV *StepRecSCEV = Cond.AddRecSCEV->getStepRecurrence(SE); // Allowed constant step. if (!isa<SCEVConstant>(StepRecSCEV)) return false; @@ -264,6 +278,14 @@ static BranchInst *findSplitCandidate(const Loop &L, ScalarEvolution &SE, SplitCandidateCond.BoundSCEV->getType()) continue; + // After transformation, we assume the split condition of the pre-loop is + // always true. In order to guarantee it, we need to check the start value + // of the split cond AddRec satisfies the split condition. + if (!SE.isLoopEntryGuardedByCond(&L, SplitCandidateCond.Pred, + SplitCandidateCond.AddRecSCEV->getStart(), + SplitCandidateCond.BoundSCEV)) + continue; + SplitCandidateCond.BI = BI; return BI; } @@ -341,13 +363,45 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI, ".split", &LI, &DT, PostLoopBlocks); remapInstructionsInBlocks(PostLoopBlocks, VMap); - // Add conditional branch to check we can skip post-loop in its preheader. BasicBlock *PostLoopPreHeader = PostLoop->getLoopPreheader(); - IRBuilder<> Builder(PostLoopPreHeader); + IRBuilder<> Builder(&PostLoopPreHeader->front()); + + // Update phi nodes in header of post-loop. + bool isExitingLatch = + (L.getExitingBlock() == L.getLoopLatch()) ? true : false; + Value *ExitingCondLCSSAPhi = nullptr; + for (PHINode &PN : L.getHeader()->phis()) { + // Create LCSSA phi node in preheader of post-loop. + PHINode *LCSSAPhi = + Builder.CreatePHI(PN.getType(), 1, PN.getName() + ".lcssa"); + LCSSAPhi->setDebugLoc(PN.getDebugLoc()); + // If the exiting block is loop latch, the phi does not have the update at + // last iteration. In this case, update lcssa phi with value from backedge. + LCSSAPhi->addIncoming( + isExitingLatch ? PN.getIncomingValueForBlock(L.getLoopLatch()) : &PN, + L.getExitingBlock()); + + // Update the start value of phi node in post-loop with the LCSSA phi node. + PHINode *PostLoopPN = cast<PHINode>(VMap[&PN]); + PostLoopPN->setIncomingValueForBlock(PostLoopPreHeader, LCSSAPhi); + + // Find PHI with exiting condition from pre-loop. The PHI should be + // SCEVAddRecExpr and have same incoming value from backedge with + // ExitingCond. + if (!SE.isSCEVable(PN.getType())) + continue; + + const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN)); + if (PhiSCEV && ExitingCond.NonPHIAddRecValue == + PN.getIncomingValueForBlock(L.getLoopLatch())) + ExitingCondLCSSAPhi = LCSSAPhi; + } + + // Add conditional branch to check we can skip post-loop in its preheader. Instruction *OrigBI = PostLoopPreHeader->getTerminator(); ICmpInst::Predicate Pred = ICmpInst::ICMP_NE; Value *Cond = - Builder.CreateICmp(Pred, ExitingCond.AddRecValue, ExitingCond.BoundValue); + Builder.CreateICmp(Pred, ExitingCondLCSSAPhi, ExitingCond.BoundValue); Builder.CreateCondBr(Cond, PostLoop->getHeader(), PostLoop->getExitBlock()); OrigBI->eraseFromParent(); @@ -368,21 +422,6 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI, // Replace exiting bound value of pre-loop NewBound. ExitingCond.ICmp->setOperand(1, NewBoundValue); - // Replace IV's start value of post-loop by NewBound. 
- for (PHINode &PN : L.getHeader()->phis()) { - // Find PHI with exiting condition from pre-loop. - if (SE.isSCEVable(PN.getType()) && isa<SCEVAddRecExpr>(SE.getSCEV(&PN))) { - for (Value *Op : PN.incoming_values()) { - if (Op == ExitingCond.AddRecValue) { - // Find cloned PHI for post-loop. - PHINode *PostLoopPN = cast<PHINode>(VMap[&PN]); - PostLoopPN->setIncomingValueForBlock(PostLoopPreHeader, - NewBoundValue); - } - } - } - } - // Replace SplitCandidateCond.BI's condition of pre-loop by True. LLVMContext &Context = PreHeader->getContext(); SplitCandidateCond.BI->setCondition(ConstantInt::getTrue(Context)); @@ -398,6 +437,30 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI, else ExitingCond.BI->setSuccessor(1, PostLoopPreHeader); + // Update phi node in exit block of post-loop. + Builder.SetInsertPoint(&PostLoopPreHeader->front()); + for (PHINode &PN : PostLoop->getExitBlock()->phis()) { + for (auto i : seq<int>(0, PN.getNumOperands())) { + // Check incoming block is pre-loop's exiting block. + if (PN.getIncomingBlock(i) == L.getExitingBlock()) { + Value *IncomingValue = PN.getIncomingValue(i); + + // Create LCSSA phi node for incoming value. + PHINode *LCSSAPhi = + Builder.CreatePHI(PN.getType(), 1, PN.getName() + ".lcssa"); + LCSSAPhi->setDebugLoc(PN.getDebugLoc()); + LCSSAPhi->addIncoming(IncomingValue, PN.getIncomingBlock(i)); + + // Replace pre-loop's exiting block by post-loop's preheader. + PN.setIncomingBlock(i, PostLoopPreHeader); + // Replace incoming value by LCSSAPhi. + PN.setIncomingValue(i, LCSSAPhi); + // Add a new incoming value with post-loop's exiting block. + PN.addIncoming(VMap[IncomingValue], PostLoop->getExitingBlock()); + } + } + } + // Update dominator tree. DT.changeImmediateDominator(PostLoopPreHeader, L.getExitingBlock()); DT.changeImmediateDominator(PostLoop->getExitBlock(), PostLoopPreHeader); @@ -406,10 +469,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI, SE.forgetLoop(&L); // Canonicalize loops. - // TODO: Try to update LCSSA information according to above change. - formLCSSA(L, DT, &LI, &SE); simplifyLoop(&L, &DT, &LI, &SE, nullptr, nullptr, true); - formLCSSA(*PostLoop, DT, &LI, &SE); simplifyLoop(PostLoop, &DT, &LI, &SE, nullptr, nullptr, true); // Add new post-loop to loop pass manager. 
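To make the LoopBoundSplit hunks easier to follow: the patch replaces the old formLCSSA calls with explicit LCSSA phi construction in the post-loop preheader and exit block, and the post-loop is now entered through a guard on the pre-loop's final induction value. A rough C++-level sketch of the overall shape follows (a hypothetical example, not the pass code; the pass works on IR, and the names n, m, f, g, before, after are purely illustrative):

  #include <algorithm>

  void f(int);
  void g(int);

  // Before: one loop with a branch whose condition ("i < m") compares the
  // affine IV against a loop-invariant bound.
  void before(int n, int m) {
    for (int i = 0; i < n; ++i) {
      if (i < m) f(i); else g(i);
    }
  }

  // After, conceptually: a pre-loop in which the split condition is folded
  // to true, followed by a guarded post-loop that resumes from the
  // pre-loop's final value of i (the role of the new LCSSA phi created in
  // the post-loop preheader).
  void after(int n, int m) {
    int i = 0;
    for (; i < std::min(m, n); ++i)  // pre-loop: "i < m" always holds here
      f(i);
    if (i != n)                      // guard emitted in the post-loop preheader
      for (; i < n; ++i)
        g(i);
  }

The new isLoopEntryGuardedByCond check in findSplitCandidate corresponds to requiring that the IV's start value (0 in this sketch) already satisfies the split condition at loop entry, which is what makes folding the pre-loop branch to true safe.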
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index a5d7835bd094..77d76609c926 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -127,6 +128,8 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addPreserved<ScalarEvolutionWrapperPass>(); @@ -143,6 +146,7 @@ INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch", diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index f7e8442fae81..5814e2f043d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -36,6 +36,8 @@ using namespace llvm; #define DEBUG_TYPE "loop-delete" STATISTIC(NumDeleted, "Number of loops deleted"); +STATISTIC(NumBackedgesBroken, + "Number of loops for which we managed to break the backedge"); static cl::opt<bool> EnableSymbolicExecution( "loop-deletion-enable-symbolic-execution", cl::Hidden, cl::init(true), @@ -191,6 +193,20 @@ getValueOnFirstIteration(Value *V, DenseMap<Value *, Value *> &FirstIterValue, Value *RHS = getValueOnFirstIteration(BO->getOperand(1), FirstIterValue, SQ); FirstIterV = SimplifyBinOp(BO->getOpcode(), LHS, RHS, SQ); + } else if (auto *Cmp = dyn_cast<ICmpInst>(V)) { + Value *LHS = + getValueOnFirstIteration(Cmp->getOperand(0), FirstIterValue, SQ); + Value *RHS = + getValueOnFirstIteration(Cmp->getOperand(1), FirstIterValue, SQ); + FirstIterV = SimplifyICmpInst(Cmp->getPredicate(), LHS, RHS, SQ); + } else if (auto *Select = dyn_cast<SelectInst>(V)) { + Value *Cond = + getValueOnFirstIteration(Select->getCondition(), FirstIterValue, SQ); + if (auto *C = dyn_cast<ConstantInt>(Cond)) { + auto *Selected = C->isAllOnesValue() ? 
Select->getTrueValue() + : Select->getFalseValue(); + FirstIterV = getValueOnFirstIteration(Selected, FirstIterValue, SQ); + } } if (!FirstIterV) FirstIterV = V; @@ -314,22 +330,20 @@ static bool canProveExitOnFirstIteration(Loop *L, DominatorTree &DT, } using namespace PatternMatch; - ICmpInst::Predicate Pred; - Value *LHS, *RHS; + Value *Cond; BasicBlock *IfTrue, *IfFalse; auto *Term = BB->getTerminator(); - if (match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), + if (match(Term, m_Br(m_Value(Cond), m_BasicBlock(IfTrue), m_BasicBlock(IfFalse)))) { - if (!LHS->getType()->isIntegerTy()) { + auto *ICmp = dyn_cast<ICmpInst>(Cond); + if (!ICmp || !ICmp->getType()->isIntegerTy()) { MarkAllSuccessorsLive(BB); continue; } // Can we prove constant true or false for this condition? - LHS = getValueOnFirstIteration(LHS, FirstIterValue, SQ); - RHS = getValueOnFirstIteration(RHS, FirstIterValue, SQ); - auto *KnownCondition = SimplifyICmpInst(Pred, LHS, RHS, SQ); - if (!KnownCondition) { + auto *KnownCondition = getValueOnFirstIteration(ICmp, FirstIterValue, SQ); + if (KnownCondition == ICmp) { // Failed to simplify. MarkAllSuccessorsLive(BB); continue; @@ -393,14 +407,25 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE, if (!L->getLoopLatch()) return LoopDeletionResult::Unmodified; - auto *BTC = SE.getBackedgeTakenCount(L); - if (!isa<SCEVCouldNotCompute>(BTC) && SE.isKnownNonZero(BTC)) - return LoopDeletionResult::Unmodified; - if (!BTC->isZero() && !canProveExitOnFirstIteration(L, DT, LI)) - return LoopDeletionResult::Unmodified; + auto *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); + if (BTC->isZero()) { + // SCEV knows this backedge isn't taken! + breakLoopBackedge(L, DT, SE, LI, MSSA); + ++NumBackedgesBroken; + return LoopDeletionResult::Deleted; + } - breakLoopBackedge(L, DT, SE, LI, MSSA); - return LoopDeletionResult::Deleted; + // If SCEV leaves open the possibility of a zero trip count, see if + // symbolically evaluating the first iteration lets us prove the backedge + // unreachable. + if (isa<SCEVCouldNotCompute>(BTC) || !SE.isKnownNonZero(BTC)) + if (canProveExitOnFirstIteration(L, DT, LI)) { + breakLoopBackedge(L, DT, SE, LI, MSSA); + ++NumBackedgesBroken; + return LoopDeletionResult::Deleted; + } + + return LoopDeletionResult::Unmodified; } /// Remove a loop if it is dead. 
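The LoopDeletion hunks above do two related things: breakBackedgeIfNotTaken now consults getSymbolicMaxBackedgeTakenCount, and getValueOnFirstIteration learns to fold icmp and select instructions. A small hypothetical C++-level sketch of the kind of backedge this combination could plausibly remove (work is an illustrative external placeholder, and the claim that SCEV cannot compute an exact trip count here is an assumption about how the select-based step is modelled):

  void work(int);  // external, side-effecting placeholder

  void runs_once() {
    int i = 0;
    do {
      work(i);
      int bump = (i > 100) ? 1 : 0;  // icmp + select: both fold when i == 0
      i += bump;                     // i stays 0 on the first iteration
    } while (i != 0);                // 0 != 0 -> exit taken on iteration one
  }

Evaluating the first iteration symbolically (header phi = 0 from the preheader, then icmp, select, and add all simplify) proves the exit is taken immediately, so the backedge is never taken and can be broken; the new NumBackedgesBroken statistic counts exactly this case.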
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index bac3dc0f3fb9..0f4c767c1e4c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -1057,8 +1057,8 @@ PreservedAnalyses LoopDistributePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, nullptr}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, nullptr, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index f54289f85ef5..965d1575518e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -27,6 +27,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopFlatten.h" + +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -49,11 +51,13 @@ #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" -#define DEBUG_TYPE "loop-flatten" - using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "loop-flatten" + +STATISTIC(NumFlattened, "Number of loops flattened"); + static cl::opt<unsigned> RepeatedInstructionThreshold( "loop-flatten-cost-threshold", cl::Hidden, cl::init(2), cl::desc("Limit on the cost of instructions that can be repeated due to " @@ -90,9 +94,33 @@ struct FlattenInfo { // Whether this holds the flatten info before or after widening. bool Widened = false; + // Holds the old/narrow induction phis, i.e. the Phis before IV widening has + // been applied. This bookkeeping is used so we can skip some checks on these + // phi nodes. + PHINode *NarrowInnerInductionPHI = nullptr; + PHINode *NarrowOuterInductionPHI = nullptr; + FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {}; + + bool isNarrowInductionPhi(PHINode *Phi) { + // This can't be the narrow phi if we haven't widened the IV first. + if (!Widened) + return false; + return NarrowInnerInductionPHI == Phi || NarrowOuterInductionPHI == Phi; + } }; +static bool +setLoopComponents(Value *&TC, Value *&TripCount, BinaryOperator *&Increment, + SmallPtrSetImpl<Instruction *> &IterationInstructions) { + TripCount = TC; + IterationInstructions.insert(Increment); + LLVM_DEBUG(dbgs() << "Found Increment: "; Increment->dump()); + LLVM_DEBUG(dbgs() << "Found trip count: "; TripCount->dump()); + LLVM_DEBUG(dbgs() << "Successfully found all loop components\n"); + return true; +} + // Finds the induction variable, increment and trip count for a simple loop that // we can flatten. static bool findLoopComponents( @@ -164,36 +192,68 @@ static bool findLoopComponents( return false; } // The trip count is the RHS of the compare. 
If this doesn't match the trip - // count computed by SCEV then this is either because the trip count variable - // has been widened (then leave the trip count as it is), or because it is a - // constant and another transformation has changed the compare, e.g. - // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, then we don't flatten - // the loop (yet). - TripCount = Compare->getOperand(1); + // count computed by SCEV then this is because the trip count variable + // has been widened so the types don't match, or because it is a constant and + // another transformation has changed the compare (e.g. icmp ult %inc, + // tripcount -> icmp ult %j, tripcount-1), or both. + Value *RHS = Compare->getOperand(1); + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n"); + return false; + } + // The use of the Extend=false flag on getTripCountFromExitCount was added + // during a refactoring to preserve existing behavior. However, there's + // nothing obvious in the surrounding code when handles the overflow case. + // FIXME: audit code to establish whether there's a latent bug here. const SCEV *SCEVTripCount = - SE->getTripCountFromExitCount(SE->getBackedgeTakenCount(L)); - if (SE->getSCEV(TripCount) != SCEVTripCount) { - if (!IsWidened) { - LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); - return false; - } - auto TripCountInst = dyn_cast<Instruction>(TripCount); - if (!TripCountInst) { - LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n"); - return false; + SE->getTripCountFromExitCount(BackedgeTakenCount, false); + const SCEV *SCEVRHS = SE->getSCEV(RHS); + if (SCEVRHS == SCEVTripCount) + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); + ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS); + if (ConstantRHS) { + const SCEV *BackedgeTCExt = nullptr; + if (IsWidened) { + const SCEV *SCEVTripCountExt; + // Find the extended backedge taken count and extended trip count using + // SCEV. One of these should now match the RHS of the compare. + BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType()); + SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false); + if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } } - if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) || - SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) { - LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n"); - return false; + // If the RHS of the compare is equal to the backedge taken count we need + // to add one to get the trip count. 
+ if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) { + ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1); + Value *NewRHS = ConstantInt::get( + ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue()); + return setLoopComponents(NewRHS, TripCount, Increment, + IterationInstructions); } + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); } - IterationInstructions.insert(Increment); - LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump()); - LLVM_DEBUG(dbgs() << "Found trip count: "; TripCount->dump()); - - LLVM_DEBUG(dbgs() << "Successfully found all loop components\n"); - return true; + // If the RHS isn't a constant then check that the reason it doesn't match + // the SCEV trip count is because the RHS is a ZExt or SExt instruction + // (and take the trip count to be the RHS). + if (!IsWidened) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + auto *TripCountInst = dyn_cast<Instruction>(RHS); + if (!TripCountInst) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) || + SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) { + LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n"); + return false; + } + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); } static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) { @@ -221,6 +281,8 @@ static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) { // them specially when doing the transformation. if (&InnerPHI == FI.InnerInductionPHI) continue; + if (FI.isNarrowInductionPhi(&InnerPHI)) + continue; // Each inner loop PHI node must have two incoming values/blocks - one // from the pre-header, and one from the latch. @@ -266,6 +328,8 @@ static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) { } for (PHINode &OuterPHI : FI.OuterLoop->getHeader()->phis()) { + if (FI.isNarrowInductionPhi(&OuterPHI)) + continue; if (!SafeOuterPHIs.count(&OuterPHI)) { LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump()); return false; @@ -356,18 +420,25 @@ static bool checkIVUsers(FlattenInfo &FI) { if (U == FI.InnerIncrement) continue; - // After widening the IVs, a trunc instruction might have been introduced, so - // look through truncs. + // After widening the IVs, a trunc instruction might have been introduced, + // so look through truncs. if (isa<TruncInst>(U)) { if (!U->hasOneUse()) return false; U = *U->user_begin(); } + // If the use is in the compare (which is also the condition of the inner + // branch) then the compare has been altered by another transformation e.g + // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is + // a constant. Ignore this use as the compare gets removed later anyway. + if (U == FI.InnerBranch->getCondition()) + continue; + LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump()); - Value *MatchedMul; - Value *MatchedItCount; + Value *MatchedMul = nullptr; + Value *MatchedItCount = nullptr; bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI), m_Value(MatchedMul))) && match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI), @@ -375,11 +446,23 @@ static bool checkIVUsers(FlattenInfo &FI) { // Matches the same pattern as above, except it also looks for truncs // on the phi, which can be the result of widening the induction variables. 
- bool IsAddTrunc = match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)), - m_Value(MatchedMul))) && - match(MatchedMul, - m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)), - m_Value(MatchedItCount))); + bool IsAddTrunc = + match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)), + m_Value(MatchedMul))) && + match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)), + m_Value(MatchedItCount))); + + if (!MatchedItCount) + return false; + // Look through extends if the IV has been widened. + if (FI.Widened && + (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) { + assert(MatchedItCount->getType() == FI.InnerInductionPHI->getType() && + "Unexpected type mismatch in types after widening"); + MatchedItCount = isa<SExtInst>(MatchedItCount) + ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0) + : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0); + } if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) { LLVM_DEBUG(dbgs() << "Use is optimisable\n"); @@ -451,17 +534,27 @@ static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT, for (Value *V : FI.LinearIVUses) { for (Value *U : V->users()) { if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) { - // The IV is used as the operand of a GEP, and the IV is at least as - // wide as the address space of the GEP. In this case, the GEP would - // wrap around the address space before the IV increment wraps, which - // would be UB. - if (GEP->isInBounds() && - V->getType()->getIntegerBitWidth() >= - DL.getPointerTypeSizeInBits(GEP->getType())) { - LLVM_DEBUG( - dbgs() << "use of linear IV would be UB if overflow occurred: "; - GEP->dump()); - return OverflowResult::NeverOverflows; + for (Value *GEPUser : U->users()) { + Instruction *GEPUserInst = dyn_cast<Instruction>(GEPUser); + if (!isa<LoadInst>(GEPUserInst) && + !(isa<StoreInst>(GEPUserInst) && + GEP == GEPUserInst->getOperand(1))) + continue; + if (!isGuaranteedToExecuteForEveryIteration(GEPUserInst, + FI.InnerLoop)) + continue; + // The IV is used as the operand of a GEP which dominates the loop + // latch, and the IV is at least as wide as the address space of the + // GEP. In this case, the GEP would wrap around the address space + // before the IV increment wraps, which would be UB. + if (GEP->isInBounds() && + V->getType()->getIntegerBitWidth() >= + DL.getPointerTypeSizeInBits(GEP->getType())) { + LLVM_DEBUG( + dbgs() << "use of linear IV would be UB if overflow occurred: "; + GEP->dump()); + return OverflowResult::NeverOverflows; + } } } } @@ -518,7 +611,7 @@ static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, LPMUpdater *U) { Function *F = FI.OuterLoop->getHeader()->getParent(); LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n"); { @@ -574,7 +667,13 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // deleted, and any information that have about the outer loop invalidated. SE->forgetLoop(FI.OuterLoop); SE->forgetLoop(FI.InnerLoop); + if (U) + U->markLoopAsDeleted(*FI.InnerLoop, FI.InnerLoop->getName()); LI->erase(FI.InnerLoop); + + // Increment statistic value. 
+ NumFlattened++; + return true; } @@ -605,14 +704,11 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, } SCEVExpander Rewriter(*SE, DL, "loopflatten"); - SmallVector<WideIVInfo, 2> WideIVs; SmallVector<WeakTrackingVH, 4> DeadInsts; - WideIVs.push_back( {FI.InnerInductionPHI, MaxLegalType, false }); - WideIVs.push_back( {FI.OuterInductionPHI, MaxLegalType, false }); unsigned ElimExt = 0; unsigned Widened = 0; - for (const auto &WideIV : WideIVs) { + auto CreateWideIV = [&] (WideIVInfo WideIV, bool &Deleted) -> bool { PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts, ElimExt, Widened, true /* HasGuards */, true /* UsePostIncrementRanges */); @@ -620,17 +716,35 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump()); LLVM_DEBUG(dbgs() << "Deleting old phi: "; WideIV.NarrowIV->dump()); - RecursivelyDeleteDeadPHINode(WideIV.NarrowIV); - } - // After widening, rediscover all the loop components. + Deleted = RecursivelyDeleteDeadPHINode(WideIV.NarrowIV); + return true; + }; + + bool Deleted; + if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false }, Deleted)) + return false; + // Add the narrow phi to list, so that it will be adjusted later when the + // the transformation is performed. + if (!Deleted) + FI.InnerPHIsToTransform.insert(FI.InnerInductionPHI); + + if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false }, Deleted)) + return false; + assert(Widened && "Widened IV expected"); FI.Widened = true; + + // Save the old/narrow induction phis, which we need to ignore in CheckPHIs. + FI.NarrowInnerInductionPHI = FI.InnerInductionPHI; + FI.NarrowOuterInductionPHI = FI.OuterInductionPHI; + + // After widening, rediscover all the loop components. return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI); } static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, LPMUpdater *U) { LLVM_DEBUG( dbgs() << "Loop flattening running on outer loop " << FI.OuterLoop->getHeader()->getName() << " and inner loop " @@ -641,12 +755,30 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; // Check if we can widen the induction variables to avoid overflow checks. - if (CanWidenIV(FI, DT, LI, SE, AC, TTI)) - return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI); - - // Check if the new iteration variable might overflow. In this case, we - // need to version the loop, and select the original version at runtime if - // the iteration space is too large. + bool CanFlatten = CanWidenIV(FI, DT, LI, SE, AC, TTI); + + // It can happen that after widening of the IV, flattening may not be + // possible/happening, e.g. when it is deemed unprofitable. So bail here if + // that is the case. + // TODO: IV widening without performing the actual flattening transformation + // is not ideal. While this codegen change should not matter much, it is an + // unnecessary change which is better to avoid. It's unlikely this happens + // often, because if it's unprofitibale after widening, it should be + // unprofitabe before widening as checked in the first round of checks. But + // 'RepeatedInstructionThreshold' is set to only 2, which can probably be + // relaxed. Because this is making a code change (the IV widening, but not + // the flattening), we return true here. 
+ if (FI.Widened && !CanFlatten) + return true; + + // If we have widened and can perform the transformation, do that here. + if (CanFlatten) + return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); + + // Otherwise, if we haven't widened the IV, check if the new iteration + // variable might overflow. In this case, we need to version the loop, and + // select the original version at runtime if the iteration space is too + // large. // TODO: We currently don't version the loop. OverflowResult OR = checkOverflow(FI, DT, AC); if (OR == OverflowResult::AlwaysOverflowsHigh || @@ -659,18 +791,18 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, } LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); - return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI); + return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); } bool Flatten(LoopNest &LN, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, - AssumptionCache *AC, TargetTransformInfo *TTI) { + AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U) { bool Changed = false; for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) continue; FlattenInfo FI(OuterLoop, InnerLoop); - Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI); + Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); } return Changed; } @@ -685,12 +817,12 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // in simplified form, and also needs LCSSA. Running // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. - Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI); + Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U); if (!Changed) return PreservedAnalyses::all(); - return PreservedAnalyses::none(); + return getLoopPassPreservedAnalyses(); } namespace { @@ -735,7 +867,7 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) { bool Changed = false; for (Loop *L : *LI) { auto LN = LoopNest::getLoopNest(*L, *SE); - Changed |= Flatten(*LN, DT, LI, SE, AC, TTI); + Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr); } return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a153f393448c..42da86a9ecf5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -217,15 +217,15 @@ private: bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, - bool NegStride, bool IsLoopMemset = false); + bool IsNegStride, bool IsLoopMemset = false); bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr, - unsigned StoreSize, MaybeAlign StoreAlign, + const SCEV *StoreSize, MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad, const SCEVAddRecExpr *StoreEv, @@ -625,8 +625,8 @@ bool LoopIdiomRecognize::runOnLoopBlock( // We can only promote stores in this block if they are 
unconditionally // executed in the loop. For a block to be unconditionally executed, it has // to dominate all the exit blocks of the loop. Verify this now. - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (!DT->dominates(BB, ExitBlocks[i])) + for (BasicBlock *ExitBlock : ExitBlocks) + if (!DT->dominates(BB, ExitBlock)) return false; bool MadeChange = false; @@ -750,16 +750,13 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, bool Changed = false; // For stores that start but don't end a link in the chain: - for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) + for (StoreInst *I : Heads) { + if (Tails.count(I)) continue; // We found a store instr that starts a chain. Now follow the chain and try // to transform it. SmallPtrSet<Instruction *, 8> AdjacentStores; - StoreInst *I = *it; - StoreInst *HeadStore = I; unsigned StoreSize = 0; @@ -784,12 +781,14 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, if (StoreSize != Stride && StoreSize != -Stride) continue; - bool NegStride = StoreSize == -Stride; + bool IsNegStride = StoreSize == -Stride; - if (processLoopStridedStore(StorePtr, StoreSize, + Type *IntIdxTy = DL->getIndexType(StorePtr->getType()); + const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize); + if (processLoopStridedStore(StorePtr, StoreSizeSCEV, MaybeAlign(HeadStore->getAlignment()), StoredVal, HeadStore, AdjacentStores, StoreEv, - BECount, NegStride)) { + BECount, IsNegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); Changed = true; } @@ -857,15 +856,15 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI, // Check if the stride matches the size of the memcpy. If so, then we know // that every byte is touched in the loop. - const SCEVConstant *StoreStride = + const SCEVConstant *ConstStoreStride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); - const SCEVConstant *LoadStride = + const SCEVConstant *ConstLoadStride = dyn_cast<SCEVConstant>(LoadEv->getOperand(1)); - if (!StoreStride || !LoadStride) + if (!ConstStoreStride || !ConstLoadStride) return false; - APInt StoreStrideValue = StoreStride->getAPInt(); - APInt LoadStrideValue = LoadStride->getAPInt(); + APInt StoreStrideValue = ConstStoreStride->getAPInt(); + APInt LoadStrideValue = ConstLoadStride->getAPInt(); // Huge stride value - give up if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64) return false; @@ -875,7 +874,7 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI, return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI) << ore::NV("Inst", "memcpy") << " in " << ore::NV("Function", MCI->getFunction()) - << " function will not be hoised: " + << " function will not be hoisted: " << ore::NV("Reason", "memcpy size is not equal to stride"); }); return false; @@ -887,16 +886,17 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI, if (StoreStrideInt != LoadStrideInt) return false; - return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes, - MCI->getDestAlign(), MCI->getSourceAlign(), - MCI, MCI, StoreEv, LoadEv, BECount); + return processLoopStoreOfLoopLoad( + Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes), + MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv, + BECount); } /// processLoopMemSet - See if this memset can be promoted to a large memset. 
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { - // We can only handle non-volatile memsets with a constant size. - if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) + // We can only handle non-volatile memsets. + if (MSI->isVolatile()) return false; // If we're not allowed to hack on memset, we fail. @@ -909,23 +909,72 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, // loop, which indicates a strided store. If we have something else, it's a // random store we can't handle. const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); - if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) + if (!Ev || Ev->getLoop() != CurLoop) return false; - - // Reject memsets that are so large that they overflow an unsigned. - uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); - if ((SizeInBytes >> 32) != 0) + if (!Ev->isAffine()) { + LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n"); return false; + } - // Check to see if the stride matches the size of the memset. If so, then we - // know that every byte is touched in the loop. - const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); - if (!ConstStride) + const SCEV *PointerStrideSCEV = Ev->getOperand(1); + const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength()); + if (!PointerStrideSCEV || !MemsetSizeSCEV) return false; - APInt Stride = ConstStride->getAPInt(); - if (SizeInBytes != Stride && SizeInBytes != -Stride) - return false; + bool IsNegStride = false; + const bool IsConstantSize = isa<ConstantInt>(MSI->getLength()); + + if (IsConstantSize) { + // Memset size is constant. + // Check if the pointer stride matches the memset size. If so, then + // we know that every byte is touched in the loop. + LLVM_DEBUG(dbgs() << " memset size is constant\n"); + uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + if (!ConstStride) + return false; + + APInt Stride = ConstStride->getAPInt(); + if (SizeInBytes != Stride && SizeInBytes != -Stride) + return false; + + IsNegStride = SizeInBytes == -Stride; + } else { + // Memset size is non-constant. + // Check if the pointer stride matches the memset size. + // To be conservative, the pass would not promote pointers that aren't in + // address space zero. Also, the pass only handles memset length and stride + // that are invariant for the top level loop. + LLVM_DEBUG(dbgs() << " memset size is non-constant\n"); + if (Pointer->getType()->getPointerAddressSpace() != 0) { + LLVM_DEBUG(dbgs() << " pointer is not in address space zero, " + << "abort\n"); + return false; + } + if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) { + LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, " + << "abort\n"); + return false; + } + + // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV + IsNegStride = PointerStrideSCEV->isNonConstantNegative(); + const SCEV *PositiveStrideSCEV = + IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV) + : PointerStrideSCEV; + LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n" + << " PositiveStrideSCEV: " << *PositiveStrideSCEV + << "\n"); + + if (PositiveStrideSCEV != MemsetSizeSCEV) { + // TODO: folding can be done to the SCEVs + // The folding is to fold expressions that is covered by the loop guard + // at loop entry. After the folding, compare again and proceed + // optimization if equal. 
+ LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); + return false; + } + } // Verify that the memset value is loop invariant. If not, we can't promote // the memset. @@ -935,10 +984,10 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet<Instruction *, 1> MSIs; MSIs.insert(MSI); - bool NegStride = SizeInBytes == -Stride; - return processLoopStridedStore( - Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()), - SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true); + return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()), + MaybeAlign(MSI->getDestAlignment()), + SplatValue, MSI, MSIs, Ev, BECount, + IsNegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -946,9 +995,9 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, /// argument specifies what the verboten forms of access are (read or write). static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, - const SCEV *BECount, unsigned StoreSize, + const SCEV *BECount, const SCEV *StoreSizeSCEV, AliasAnalysis &AA, - SmallPtrSetImpl<Instruction *> &IgnoredStores) { + SmallPtrSetImpl<Instruction *> &IgnoredInsts) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts // at the pointer and has infinite size. @@ -956,9 +1005,11 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, // If the loop iterates a fixed number of times, we can refine the access size // to be exactly the size of the memset, which is (BECount+1)*StoreSize - if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount); + const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); + if (BECst && ConstSize) AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) * - StoreSize); + ConstSize->getValue()->getZExtValue()); // TODO: For this to be really effective, we have to dive into the pointer // operand in the store. Store to &A[i] of 100 will always return may alias @@ -966,14 +1017,12 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, // which will then no-alias a store to &A[100]. MemoryLocation StoreLoc(Ptr, AccessSize); - for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; - ++BI) - for (Instruction &I : **BI) - if (IgnoredStores.count(&I) == 0 && + for (BasicBlock *B : L->blocks()) + for (Instruction &I : *B) + if (!IgnoredInsts.contains(&I) && isModOrRefSet( intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access))) return true; - return false; } @@ -981,57 +1030,67 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, // we're trying to memset. Therefore, we need to recompute the base pointer, // which is just Start - BECount*Size. 
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, - Type *IntPtr, unsigned StoreSize, + Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE) { const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); - if (StoreSize != 1) - Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), + if (!StoreSizeSCEV->isOne()) { + // index = back edge count * store size + Index = SE->getMulExpr(Index, + SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr), SCEV::FlagNUW); + } + // base pointer = start - index * store size return SE->getMinusSCEV(Start, Index); } -/// Compute the number of bytes as a SCEV from the backedge taken count. -/// -/// This also maps the SCEV into the provided type and tries to handle the -/// computation in a way that will fold cleanly. -static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, - unsigned StoreSize, Loop *CurLoop, - const DataLayout *DL, ScalarEvolution *SE) { - const SCEV *NumBytesS; - // The # stored bytes is (BECount+1)*Size. Expand the trip count out to +/// Compute trip count from the backedge taken count. +static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr, + Loop *CurLoop, const DataLayout *DL, + ScalarEvolution *SE) { + const SCEV *TripCountS = nullptr; + // The # stored bytes is (BECount+1). Expand the trip count out to // pointer size if it isn't already. // // If we're going to need to zero extend the BE count, check if we can add // one to it prior to zero extending without overflow. Provided this is safe, // it allows better simplification of the +1. - if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() < - DL->getTypeSizeInBits(IntPtr).getFixedSize() && + if (DL->getTypeSizeInBits(BECount->getType()) < + DL->getTypeSizeInBits(IntPtr) && SE->isLoopEntryGuardedByCond( CurLoop, ICmpInst::ICMP_NE, BECount, SE->getNegativeSCEV(SE->getOne(BECount->getType())))) { - NumBytesS = SE->getZeroExtendExpr( + TripCountS = SE->getZeroExtendExpr( SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW), IntPtr); } else { - NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr), - SE->getOne(IntPtr), SCEV::FlagNUW); + TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr), + SE->getOne(IntPtr), SCEV::FlagNUW); } - // And scale it based on the store size. - if (StoreSize != 1) { - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), - SCEV::FlagNUW); - } - return NumBytesS; + return TripCountS; +} + +/// Compute the number of bytes as a SCEV from the backedge taken count. +/// +/// This also maps the SCEV into the provided type and tries to handle the +/// computation in a way that will fold cleanly. +static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, + const SCEV *StoreSizeSCEV, Loop *CurLoop, + const DataLayout *DL, ScalarEvolution *SE) { + const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE); + + return SE->getMulExpr(TripCountSCEV, + SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr), + SCEV::FlagNUW); } /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. 
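Before the processLoopStridedStore hunk below: the getTripCount and getNumBytes helpers just introduced split the byte-count computation into trip count = BECount + 1, then bytes = trip count * store size, with the store size now an arbitrary loop-invariant SCEV instead of a fixed unsigned. A standalone sketch of the arithmetic those helpers build symbolically (plain integers, no overflow handling; names are illustrative):

#include <cassert>
#include <cstdint>

static uint64_t tripCount(uint64_t backedgeTakenCount) {
  // The body executes once more than the back edge is taken.
  return backedgeTakenCount + 1;
}

static uint64_t numBytes(uint64_t backedgeTakenCount, uint64_t storeSize) {
  // Total bytes written by a strided store loop that touches every byte.
  return tripCount(backedgeTakenCount) * storeSize;
}

int main() {
  // Seven iterations of an 8-byte store become a 56-byte memset/memcpy.
  assert(numBytes(/*backedgeTakenCount=*/6, /*storeSize=*/8) == 56);
  return 0;
}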
bool LoopIdiomRecognize::processLoopStridedStore( - Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment, + Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride, bool IsLoopMemset) { + const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { Value *SplatValue = isBytewiseValue(StoredVal, *DL); Constant *PatternValue = nullptr; @@ -1056,8 +1115,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( bool Changed = false; const SCEV *Start = Ev->getStart(); // Handle negative strided loops. - if (NegStride) - Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE); + if (IsNegStride) + Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -1082,7 +1141,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Changed = true; if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount, - StoreSize, *AA, Stores)) + StoreSizeSCEV, *AA, Stores)) return Changed; if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset)) @@ -1091,7 +1150,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, everything looks good, insert the memset. const SCEV *NumBytesS = - getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -1138,13 +1197,20 @@ bool LoopIdiomRecognize::processLoopStridedStore( << "\n"); ORE.emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore", - NewCall->getDebugLoc(), Preheader) - << "Transformed loop-strided store in " - << ore::NV("Function", TheStore->getFunction()) - << " function into a call to " - << ore::NV("NewFunction", NewCall->getCalledFunction()) - << "() intrinsic"; + OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore", + NewCall->getDebugLoc(), Preheader); + R << "Transformed loop-strided store in " + << ore::NV("Function", TheStore->getFunction()) + << " function into a call to " + << ore::NV("NewFunction", NewCall->getCalledFunction()) + << "() intrinsic"; + if (!Stores.empty()) + R << ore::setExtraArgs(); + for (auto *I : Stores) { + R << ore::NV("FromBlock", I->getParent()->getName()) + << ore::NV("ToBlock", Preheader->getName()); + } + return R; }); // Okay, the memset has been formed. Zap the original store and anything that @@ -1181,16 +1247,63 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // random load we can't handle. 
Value *LoadPtr = LI->getPointerOperand(); const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr)); - return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize, + + const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize); + return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV, SI->getAlign(), LI->getAlign(), SI, LI, StoreEv, LoadEv, BECount); } +class MemmoveVerifier { +public: + explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr, + const DataLayout &DL) + : DL(DL), LoadOff(0), StoreOff(0), + BP1(llvm::GetPointerBaseWithConstantOffset( + LoadBasePtr.stripPointerCasts(), LoadOff, DL)), + BP2(llvm::GetPointerBaseWithConstantOffset( + StoreBasePtr.stripPointerCasts(), StoreOff, DL)), + IsSameObject(BP1 == BP2) {} + + bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride, + const Instruction &TheLoad, + bool IsMemCpy) const { + if (IsMemCpy) { + // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr + // for negative stride. + if ((!IsNegStride && LoadOff <= StoreOff) || + (IsNegStride && LoadOff >= StoreOff)) + return false; + } else { + // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr + // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr. + int64_t LoadSize = + DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8; + if (BP1 != BP2 || LoadSize != int64_t(StoreSize)) + return false; + if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) || + (IsNegStride && LoadOff + LoadSize > StoreOff)) + return false; + } + return true; + } + +private: + const DataLayout &DL; + int64_t LoadOff; + int64_t StoreOff; + const Value *BP1; + const Value *BP2; + +public: + const bool IsSameObject; +}; + bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( - Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign, - MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad, - const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv, - const SCEV *BECount) { + Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV, + MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore, + Instruction *TheLoad, const SCEVAddRecExpr *StoreEv, + const SCEVAddRecExpr *LoadEv, const SCEV *BECount) { // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to // conservatively bail here, since otherwise we may have to transform @@ -1213,11 +1326,18 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); APInt Stride = getStoreStride(StoreEv); - bool NegStride = StoreSize == -Stride; + const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); + + // TODO: Deal with non-constant size; Currently expect constant store size + assert(ConstStoreSize && "store size is expected to be a constant"); + + int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue(); + bool IsNegStride = StoreSize == -Stride; // Handle negative strided loops. - if (NegStride) - StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE); + if (IsNegStride) + StrStart = + getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1237,24 +1357,24 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // the return value will read this comment, and leave them alone. 
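The MemmoveVerifier class added above packages the overlap rules for turning a loop copy into memmove: for a plain load/store pair the two base pointers must resolve to the same underlying object and the load width must match the store size, and in all cases the load must stay ahead of (or, for a negative stride, behind) the store so no iteration reads bytes the single library call would already have overwritten. A standalone sketch of the offset comparison for the load/store case, with the constant offsets already extracted (the helper and its parameters are mine, not the pass's API):

#include <cassert>
#include <cstdint>

// Offsets are relative to the common underlying object. Returns true when a
// strided copy loop may be lowered to one memmove call without an iteration
// reading bytes that the call has already overwritten.
static bool mayFormMemmove(int64_t loadOff, int64_t storeOff, int64_t elemSize,
                           bool isNegStride) {
  if (!isNegStride)
    return loadOff >= storeOff + elemSize; // forward loop: load stays ahead
  return loadOff + elemSize <= storeOff;   // backward loop: load stays behind
}

int main() {
  // p[i] = p[i + 1]: the load runs one element ahead of the store -> allowed.
  assert(mayFormMemmove(/*loadOff=*/4, /*storeOff=*/0, /*elemSize=*/4, false));
  // p[i] = p[i]: the regions coincide, so the check conservatively rejects.
  assert(!mayFormMemmove(0, 0, 4, false));
  // Backward loop p[i] = p[i - 1]: the load trails the store -> allowed.
  assert(mayFormMemmove(/*loadOff=*/0, /*storeOff=*/4, /*elemSize=*/4, true));
  return 0;
}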
Changed = true; - SmallPtrSet<Instruction *, 2> Stores; - Stores.insert(TheStore); + SmallPtrSet<Instruction *, 2> IgnoredInsts; + IgnoredInsts.insert(TheStore); bool IsMemCpy = isa<MemCpyInst>(TheStore); const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store"; - bool UseMemMove = + bool LoopAccessStore = mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount, - StoreSize, *AA, Stores); - if (UseMemMove) { + StoreSizeSCEV, *AA, IgnoredInsts); + if (LoopAccessStore) { // For memmove case it's not enough to guarantee that loop doesn't access // TheStore and TheLoad. Additionally we need to make sure that TheStore is // the only user of TheLoad. if (!TheLoad->hasOneUse()) return Changed; - Stores.insert(TheLoad); + IgnoredInsts.insert(TheLoad); if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, - BECount, StoreSize, *AA, Stores)) { + BECount, StoreSizeSCEV, *AA, IgnoredInsts)) { ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore", TheStore) @@ -1265,15 +1385,16 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( }); return Changed; } - Stores.erase(TheLoad); + IgnoredInsts.erase(TheLoad); } const SCEV *LdStart = LoadEv->getStart(); unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace(); // Handle negative strided loops. - if (NegStride) - LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE); + if (IsNegStride) + LdStart = + getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE); // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. @@ -1283,42 +1404,40 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // If the store is a memcpy instruction, we must check if it will write to // the load memory locations. So remove it from the ignored stores. if (IsMemCpy) - Stores.erase(TheStore); + IgnoredInsts.erase(TheStore); + MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL); if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount, - StoreSize, *AA, Stores)) { - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad) - << ore::NV("Inst", InstRemark) << " in " - << ore::NV("Function", TheStore->getFunction()) - << " function will not be hoisted: " - << ore::NV("Reason", "The loop may access load location"); - }); - return Changed; - } - if (UseMemMove) { - // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr for - // negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr. - int64_t LoadOff = 0, StoreOff = 0; - const Value *BP1 = llvm::GetPointerBaseWithConstantOffset( - LoadBasePtr->stripPointerCasts(), LoadOff, *DL); - const Value *BP2 = llvm::GetPointerBaseWithConstantOffset( - StoreBasePtr->stripPointerCasts(), StoreOff, *DL); - int64_t LoadSize = - DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8; - if (BP1 != BP2 || LoadSize != int64_t(StoreSize)) + StoreSizeSCEV, *AA, IgnoredInsts)) { + if (!IsMemCpy) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", + TheLoad) + << ore::NV("Inst", InstRemark) << " in " + << ore::NV("Function", TheStore->getFunction()) + << " function will not be hoisted: " + << ore::NV("Reason", "The loop may access load location"); + }); return Changed; - if ((!NegStride && LoadOff < StoreOff + int64_t(StoreSize)) || - (NegStride && LoadOff + LoadSize > StoreOff)) + } + // At this point loop may access load only for memcpy in same underlying + // object. 
If that's not the case bail out. + if (!Verifier.IsSameObject) return Changed; } + bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore; + if (UseMemMove) + if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad, + IsMemCpy)) + return Changed; + if (avoidLIRForMultiBlockLoop()) return Changed; // Okay, everything is safe, we can transform this! const SCEV *NumBytesS = - getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); @@ -1380,11 +1499,14 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( << ore::NV("NewFunction", NewCall->getCalledFunction()) << "() intrinsic from " << ore::NV("Inst", InstRemark) << " instruction in " << ore::NV("Function", TheStore->getFunction()) - << " function"; + << " function" + << ore::setExtraArgs() + << ore::NV("FromBlock", TheStore->getParent()->getName()) + << ore::NV("ToBlock", Preheader->getName()); }); - // Okay, the memcpy has been formed. Zap the original store and anything that - // feeds into it. + // Okay, a new call to memcpy/memmove has been formed. Zap the original store + // and anything that feeds into it. if (MSSAU) MSSAU->removeMemoryAccess(TheStore, true); deleteDeadInstruction(TheStore); @@ -1549,24 +1671,22 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { CountInst = nullptr; - for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), - IterE = LoopEntry->end(); - Iter != IterE; Iter++) { - Instruction *Inst = &*Iter; - if (Inst->getOpcode() != Instruction::Add) + for (Instruction &Inst : llvm::make_range( + LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + if (Inst.getOpcode() != Instruction::Add) continue; - ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1)); if (!Inc || !Inc->isOne()) continue; - PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry); if (!Phi) continue; // Check if the result of the instruction is live of the loop. bool LiveOutLoop = false; - for (User *U : Inst->users()) { + for (User *U : Inst.users()) { if ((cast<Instruction>(U))->getParent() != LoopEntry) { LiveOutLoop = true; break; @@ -1574,7 +1694,7 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, } if (LiveOutLoop) { - CountInst = Inst; + CountInst = &Inst; CountPhi = Phi; break; } @@ -1675,22 +1795,20 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, // plus "cnt0". Currently it is not optimized. 
// This step could be used to detect POPCNT instruction: // cnt.next = cnt + (x.next & 1) - for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), - IterE = LoopEntry->end(); - Iter != IterE; Iter++) { - Instruction *Inst = &*Iter; - if (Inst->getOpcode() != Instruction::Add) + for (Instruction &Inst : llvm::make_range( + LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + if (Inst.getOpcode() != Instruction::Add) continue; - ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1)); if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) continue; - PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry); if (!Phi) continue; - CntInst = Inst; + CntInst = &Inst; CntPhi = Phi; break; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 3153a8721193..b9e63a4bc06f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -105,9 +105,7 @@ static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, if (!V || !LI.replacementPreservesLCSSAForm(&I, V)) continue; - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); - UI != UE;) { - Use &U = *UI++; + for (Use &U : llvm::make_early_inc_range(I.uses())) { auto *UserI = cast<Instruction>(U.getUser()); U.set(V); @@ -195,15 +193,10 @@ public: const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( *L->getHeader()->getParent()); - MemorySSA *MSSA = nullptr; - Optional<MemorySSAUpdater> MSSAU; - if (EnableMSSALoopDependency) { - MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); - } + MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); + MemorySSAUpdater MSSAU(MSSA); - return simplifyLoopInst(*L, DT, LI, AC, TLI, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); + return simplifyLoopInst(*L, DT, LI, AC, TLI, &MSSAU); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -211,10 +204,8 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.setPreservesCFG(); - if (EnableMSSALoopDependency) { - AU.addRequired<MemorySSAWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); getLoopAnalysisUsage(AU); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 34545f35b3c3..9f605b4ac4ad 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -1710,16 +1710,12 @@ bool LoopInterchangeTransform::adjustLoopBranches() { auto &OuterInnerReductions = LIL.getOuterInnerReductions(); // Now update the reduction PHIs in the inner and outer loop headers. 
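For reference on the LoopIdiomRecognize hunks above: the loops in detectPopcountIdiom (and its shift-until-zero sibling) that were converted to range-based for are searching the loop body for the cnt = cnt + 1 recurrence paired with x = x & (x - 1). This is the source-level idiom the pass rewrites into a single population-count intrinsic; a small standalone version of it for illustration (the function name is mine):

#include <cassert>
#include <cstdint>

// The classic "clear the lowest set bit" loop. LoopIdiomRecognize matches the
// counter recurrence plus the x &= x - 1 update and replaces the whole loop
// with one ctpop call.
static unsigned popcountIdiom(uint64_t x) {
  unsigned cnt = 0;
  while (x) {
    x &= x - 1; // drop the lowest set bit
    ++cnt;
  }
  return cnt;
}

int main() {
  assert(popcountIdiom(0) == 0);
  assert(popcountIdiom(0b1011) == 3);
  assert(popcountIdiom(~uint64_t{0}) == 64);
  return 0;
}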
SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; - for (PHINode &PHI : InnerLoopHeader->phis()) { - if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) - continue; - InnerLoopPHIs.push_back(cast<PHINode>(&PHI)); - } - for (PHINode &PHI : OuterLoopHeader->phis()) { - if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) - continue; - OuterLoopPHIs.push_back(cast<PHINode>(&PHI)); - } + for (PHINode &PHI : InnerLoopHeader->phis()) + if (OuterInnerReductions.contains(&PHI)) + InnerLoopPHIs.push_back(cast<PHINode>(&PHI)); + for (PHINode &PHI : OuterLoopHeader->phis()) + if (OuterInnerReductions.contains(&PHI)) + OuterLoopPHIs.push_back(cast<PHINode>(&PHI)); // Now move the remaining reduction PHIs from outer to inner loop header and // vice versa. The PHI nodes must be part of a reduction across the inner and @@ -1767,6 +1763,7 @@ bool LoopInterchangeTransform::adjustLoopLinks() { return Changed; } +namespace { /// Main LoopInterchange Pass. struct LoopInterchangeLegacyPass : public LoopPass { static char ID; @@ -1795,6 +1792,7 @@ struct LoopInterchangeLegacyPass : public LoopPass { return LoopInterchange(SE, LI, DI, DT, ORE).run(L); } }; +} // namespace char LoopInterchangeLegacyPass::ID = 0; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index aaf586173e44..21d59936616b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -34,7 +34,6 @@ #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -109,8 +108,8 @@ struct StoreToLoadForwardingCandidate { // Currently we only support accesses with unit stride. FIXME: we should be // able to handle non unit stirde as well as long as the stride is equal to // the dependence distance. - if (getPtrStride(PSE, LoadPtr, L) != 1 || - getPtrStride(PSE, StorePtr, L) != 1) + if (getPtrStride(PSE, LoadType, LoadPtr, L) != 1 || + getPtrStride(PSE, LoadType, StorePtr, L) != 1) return false; auto &DL = Load->getParent()->getModule()->getDataLayout(); @@ -718,15 +717,12 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F, auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr; - MemorySSA *MSSA = EnableMSSALoopDependency - ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() - : nullptr; auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); bool Changed = eliminateLoadsAcrossLoops( F, LI, DT, BFI, PSI, &SE, &AC, [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, nullptr, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index f4fce4871331..3df4cfe8e4c1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -10,6 +10,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" @@ -44,6 +45,18 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, return PA; } +void PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, + LPMUpdater &>::printPipeline(raw_ostream &OS, + function_ref<StringRef(StringRef)> + MapClassName2PassName) { + for (unsigned Idx = 0, Size = LoopPasses.size(); Idx != Size; ++Idx) { + auto *P = LoopPasses[Idx].get(); + P->printPipeline(OS, MapClassName2PassName); + if (Idx + 1 < Size) + OS << ","; + } +} + // Run both loop passes and loop-nest passes on top-level loop \p L. PreservedAnalyses LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM, @@ -112,12 +125,6 @@ LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM, // notify the updater, otherwise U.ParentL might gets outdated and triggers // assertion failures in addSiblingLoops and addChildLoops. U.setParentLoop(L.getParentLoop()); - - // FIXME: Historically, the pass managers all called the LLVM context's - // yield function here. We don't have a generic way to acquire the - // context and it isn't yet clear what the right pattern is for yielding - // in the new pass manager so it is currently omitted. - // ...getContext().yield(); } return PA; } @@ -161,17 +168,17 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM, // notify the updater, otherwise U.ParentL might gets outdated and triggers // assertion failures in addSiblingLoops and addChildLoops. U.setParentLoop(L.getParentLoop()); - - // FIXME: Historically, the pass managers all called the LLVM context's - // yield function here. We don't have a generic way to acquire the - // context and it isn't yet clear what the right pattern is for yielding - // in the new pass manager so it is currently omitted. - // ...getContext().yield(); } return PA; } } // namespace llvm +void FunctionToLoopPassAdaptor::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + OS << (UseMemorySSA ? 
"loop-mssa(" : "loop("); + Pass->printPipeline(OS, MapClassName2PassName); + OS << ")"; +} PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, FunctionAnalysisManager &AM) { // Before we even compute any loop analyses, first run a miniature function @@ -201,6 +208,10 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData() ? (&AM.getResult<BlockFrequencyAnalysis>(F)) : nullptr; + BranchProbabilityInfo *BPI = + UseBranchProbabilityInfo && F.hasProfileData() + ? (&AM.getResult<BranchProbabilityAnalysis>(F)) + : nullptr; LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F), AM.getResult<AssumptionAnalysis>(F), AM.getResult<DominatorTreeAnalysis>(F), @@ -209,6 +220,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, AM.getResult<TargetLibraryAnalysis>(F), AM.getResult<TargetIRAnalysis>(F), BFI, + BPI, MSSA}; // Setup the loop analysis manager from its proxy. It is important that @@ -285,6 +297,10 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, else PI.runAfterPass<Loop>(*Pass, *L, PassPA); + if (LAR.MSSA && !PassPA.getChecker<MemorySSAAnalysis>().preserved()) + report_fatal_error("Loop pass manager using MemorySSA contains a pass " + "that does not preserve MemorySSA"); + #ifndef NDEBUG // LoopAnalysisResults should always be valid. // Note that we don't LAR.SE.verify() because that can change observed SE @@ -325,6 +341,8 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, PA.preserve<ScalarEvolutionAnalysis>(); if (UseBlockFrequencyInfo && F.hasProfileData()) PA.preserve<BlockFrequencyAnalysis>(); + if (UseBranchProbabilityInfo && F.hasProfileData()) + PA.preserve<BranchProbabilityAnalysis>(); if (UseMemorySSA) PA.preserve<MemorySSAAnalysis>(); return PA; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 4f97641e2027..aa7e79a589f2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -183,6 +183,8 @@ #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Function.h" @@ -254,7 +256,7 @@ class LoopPredication { DominatorTree *DT; ScalarEvolution *SE; LoopInfo *LI; - BranchProbabilityInfo *BPI; + MemorySSAUpdater *MSSAU; Loop *L; const DataLayout *DL; @@ -302,16 +304,15 @@ class LoopPredication { // If the loop always exits through another block in the loop, we should not // predicate based on the latch check. For example, the latch check can be a // very coarse grained check and there can be more fine grained exit checks - // within the loop. We identify such unprofitable loops through BPI. + // within the loop. 
bool isLoopProfitableToPredicate(); bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); public: - LoopPredication(AliasAnalysis *AA, DominatorTree *DT, - ScalarEvolution *SE, LoopInfo *LI, - BranchProbabilityInfo *BPI) - : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {}; + LoopPredication(AliasAnalysis *AA, DominatorTree *DT, ScalarEvolution *SE, + LoopInfo *LI, MemorySSAUpdater *MSSAU) + : AA(AA), DT(DT), SE(SE), LI(LI), MSSAU(MSSAU){}; bool runOnLoop(Loop *L); }; @@ -325,6 +326,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<BranchProbabilityInfoWrapperPass>(); getLoopAnalysisUsage(AU); + AU.addPreserved<MemorySSAWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override { @@ -333,10 +335,12 @@ public: auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - BranchProbabilityInfo &BPI = - getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); + auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>(); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (MSSAWP) + MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAWP->getMSSA()); auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - LoopPredication LP(AA, DT, SE, LI, &BPI); + LoopPredication LP(AA, DT, SE, LI, MSSAU ? MSSAU.get() : nullptr); return LP.runOnLoop(L); } }; @@ -358,16 +362,18 @@ Pass *llvm::createLoopPredicationPass() { PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U) { - Function *F = L.getHeader()->getParent(); - // For the new PM, we also can't use BranchProbabilityInfo as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but BPI is not preserved, hence a newly built one is needed. - BranchProbabilityInfo BPI(*F, AR.LI, &AR.TLI, &AR.DT, nullptr); - LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (AR.MSSA) + MSSAU = std::make_unique<MemorySSAUpdater>(AR.MSSA); + LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, + MSSAU ? 
MSSAU.get() : nullptr); if (!LP.runOnLoop(&L)) return PreservedAnalyses::all(); - return getLoopPassPreservedAnalyses(); + auto PA = getLoopPassPreservedAnalyses(); + if (AR.MSSA) + PA.preserve<MemorySSAAnalysis>(); + return PA; } Optional<LoopICmp> @@ -809,7 +815,7 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, Value *AllChecks = Builder.CreateAnd(Checks); auto *OldCond = Guard->getOperand(0); Guard->setOperand(0, AllChecks); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); + RecursivelyDeleteTriviallyDeadInstructions(OldCond, nullptr /* TLI */, MSSAU); LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); return true; @@ -835,7 +841,7 @@ bool LoopPredication::widenWidenableBranchGuardConditions( Value *AllChecks = Builder.CreateAnd(Checks); auto *OldCond = BI->getCondition(); BI->setCondition(AllChecks); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); + RecursivelyDeleteTriviallyDeadInstructions(OldCond, nullptr /* TLI */, MSSAU); assert(isGuardAsWidenableBranch(BI) && "Stopped being a guard after transform?"); @@ -912,7 +918,7 @@ Optional<LoopICmp> LoopPredication::parseLoopLatchICmp() { bool LoopPredication::isLoopProfitableToPredicate() { - if (SkipProfitabilityChecks || !BPI) + if (SkipProfitabilityChecks) return true; SmallVector<std::pair<BasicBlock *, BasicBlock *>, 8> ExitEdges; @@ -934,8 +940,61 @@ bool LoopPredication::isLoopProfitableToPredicate() { "expected to be an exiting block with 2 succs!"); unsigned LatchBrExitIdx = LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0; + // We compute branch probabilities without BPI. We do not rely on BPI since + // Loop predication is usually run in an LPM and BPI is only preserved + // lossily within loop pass managers, while BPI has an inherent notion of + // being complete for an entire function. + + // If the latch exits into a deoptimize or an unreachable block, do not + // predicate on that latch check. + auto *LatchExitBlock = LatchTerm->getSuccessor(LatchBrExitIdx); + if (isa<UnreachableInst>(LatchTerm) || + LatchExitBlock->getTerminatingDeoptimizeCall()) + return false; + + auto IsValidProfileData = [](MDNode *ProfileData, const Instruction *Term) { + if (!ProfileData || !ProfileData->getOperand(0)) + return false; + if (MDString *MDS = dyn_cast<MDString>(ProfileData->getOperand(0))) + if (!MDS->getString().equals("branch_weights")) + return false; + if (ProfileData->getNumOperands() != 1 + Term->getNumSuccessors()) + return false; + return true; + }; + MDNode *LatchProfileData = LatchTerm->getMetadata(LLVMContext::MD_prof); + // Latch terminator has no valid profile data, so nothing to check + // profitability on. 
+ if (!IsValidProfileData(LatchProfileData, LatchTerm)) + return true; + + auto ComputeBranchProbability = + [&](const BasicBlock *ExitingBlock, + const BasicBlock *ExitBlock) -> BranchProbability { + auto *Term = ExitingBlock->getTerminator(); + MDNode *ProfileData = Term->getMetadata(LLVMContext::MD_prof); + unsigned NumSucc = Term->getNumSuccessors(); + if (IsValidProfileData(ProfileData, Term)) { + uint64_t Numerator = 0, Denominator = 0, ProfVal = 0; + for (unsigned i = 0; i < NumSucc; i++) { + ConstantInt *CI = + mdconst::extract<ConstantInt>(ProfileData->getOperand(i + 1)); + ProfVal = CI->getValue().getZExtValue(); + if (Term->getSuccessor(i) == ExitBlock) + Numerator += ProfVal; + Denominator += ProfVal; + } + return BranchProbability::getBranchProbability(Numerator, Denominator); + } else { + assert(LatchBlock != ExitingBlock && + "Latch term should always have profile data!"); + // No profile data, so we choose the weight as 1/num_of_succ(Src) + return BranchProbability::getBranchProbability(1, NumSucc); + } + }; + BranchProbability LatchExitProbability = - BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx); + ComputeBranchProbability(LatchBlock, LatchExitBlock); // Protect against degenerate inputs provided by the user. Providing a value // less than one, can invert the definition of profitable loop predication. @@ -948,18 +1007,18 @@ bool LoopPredication::isLoopProfitableToPredicate() { LLVM_DEBUG(dbgs() << "The value is set to 1.0\n"); ScaleFactor = 1.0; } - const auto LatchProbabilityThreshold = - LatchExitProbability * ScaleFactor; + const auto LatchProbabilityThreshold = LatchExitProbability * ScaleFactor; for (const auto &ExitEdge : ExitEdges) { BranchProbability ExitingBlockProbability = - BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second); + ComputeBranchProbability(ExitEdge.first, ExitEdge.second); // Some exiting edge has higher probability than the latch exiting edge. // No longer profitable to predicate. if (ExitingBlockProbability > LatchProbabilityThreshold) return false; } - // Using BPI, we have concluded that the most probable way to exit from the + + // We have concluded that the most probable way to exit from the // loop is through the latch (or there's no profile information and all // exits are equally likely). return true; @@ -1071,28 +1130,26 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { // widen so that we gain ability to analyze it's exit count and perform this // transform. TODO: It'd be nice to know for sure the exit became // analyzeable after dropping widenability. 
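Backing up to the isLoopProfitableToPredicate change above: the pass no longer queries BranchProbabilityInfo and instead derives edge probabilities directly from branch_weights metadata, falling back to 1/num_successors when no valid weights exist. A standalone sketch of that computation, with a plain vector standing in for the metadata operands (names are illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

// Probability of branching to the successor at `exitIdx`, computed from raw
// branch weights; with missing or malformed weights, assume all successors
// are equally likely (the fallback used in the patch).
static double exitProbability(const std::vector<uint64_t> &weights,
                              unsigned exitIdx, unsigned numSuccessors) {
  if (numSuccessors == 0)
    return 0.0;
  if (weights.size() != numSuccessors)
    return 1.0 / numSuccessors;
  uint64_t numerator = 0, denominator = 0;
  for (unsigned i = 0; i < numSuccessors; ++i) {
    if (i == exitIdx)
      numerator += weights[i];
    denominator += weights[i];
  }
  return denominator ? static_cast<double>(numerator) / denominator : 0.0;
}

int main() {
  // branch_weights {25, 75}: the exit edge (index 0) is taken 25% of the time.
  assert(exitProbability({25, 75}, /*exitIdx=*/0, /*numSuccessors=*/2) == 0.25);
  // No profile data: assume 1/2 for a two-way branch.
  assert(exitProbability({}, 0, 2) == 0.5);
  return 0;
}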
- { - bool Invalidate = false; + bool ChangedLoop = false; - for (auto *ExitingBB : ExitingBlocks) { - if (LI->getLoopFor(ExitingBB) != L) - continue; + for (auto *ExitingBB : ExitingBlocks) { + if (LI->getLoopFor(ExitingBB) != L) + continue; - auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); - if (!BI) - continue; + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; - Use *Cond, *WC; - BasicBlock *IfTrueBB, *IfFalseBB; - if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) && - L->contains(IfTrueBB)) { - WC->set(ConstantInt::getTrue(IfTrueBB->getContext())); - Invalidate = true; - } + Use *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) && + L->contains(IfTrueBB)) { + WC->set(ConstantInt::getTrue(IfTrueBB->getContext())); + ChangedLoop = true; } - if (Invalidate) - SE->forgetLoop(L); } + if (ChangedLoop) + SE->forgetLoop(L); // The use of umin(all analyzeable exits) instead of latch is subtle, but // important for profitability. We may have a loop which hasn't been fully @@ -1104,18 +1161,24 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() || !SE->isLoopInvariant(MinEC, L) || !isSafeToExpandAt(MinEC, WidenableBR, *SE)) - return false; + return ChangedLoop; // Subtlety: We need to avoid inserting additional uses of the WC. We know // that it can only have one transitive use at the moment, and thus moving // that use to just before the branch and inserting code before it and then // modifying the operand is legal. auto *IP = cast<Instruction>(WidenableBR->getCondition()); + // Here we unconditionally modify the IR, so after this point we should return + // only `true`! IP->moveBefore(WidenableBR); + if (MSSAU) + if (auto *MUD = MSSAU->getMemorySSA()->getMemoryAccess(IP)) + MSSAU->moveToPlace(MUD, WidenableBR->getParent(), + MemorySSA::BeforeTerminator); Rewriter.setInsertPoint(IP); IRBuilder<> B(IP); - bool Changed = false; + bool InvalidateLoop = false; Value *MinECV = nullptr; // lazily generated if needed for (BasicBlock *ExitingBB : ExitingBlocks) { // If our exiting block exits multiple loops, we can only rewrite the @@ -1172,16 +1235,18 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { Value *OldCond = BI->getCondition(); BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue)); - Changed = true; + InvalidateLoop = true; } - if (Changed) + if (InvalidateLoop) // We just mutated a bunch of loop exits changing there exit counts // widely. We need to force recomputation of the exit counts given these // changes. Note that all of the inserted exits are never taken, and // should be removed next time the CFG is modified. SE->forgetLoop(L); - return Changed; + + // Always return `true` since we have moved the WidenableBR's condition. 
+ return true; } bool LoopPredication::runOnLoop(Loop *Loop) { @@ -1242,5 +1307,8 @@ bool LoopPredication::runOnLoop(Loop *Loop) { for (auto *Guard : GuardsAsWidenableBranches) Changed |= widenWidenableBranchGuardConditions(Guard, Expander); Changed |= predicateLoopExits(L, Expander); + + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 6d5b19443c76..5ba137b1c85f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -99,8 +99,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - if (EnableMSSALoopDependency) - AU.addPreserved<MemorySSAWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); getLoopAnalysisUsage(AU); // Lazy BFI and BPI are marked as preserved here so LoopRotate @@ -121,13 +120,11 @@ public: auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional<MemorySSAUpdater> MSSAU; - if (EnableMSSALoopDependency) { - // Not requiring MemorySSA and getting it only if available will split - // the loop pass pipeline when LoopRotate is being run first. - auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>(); - if (MSSAA) - MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); - } + // Not requiring MemorySSA and getting it only if available will split + // the loop pass pipeline when LoopRotate is being run first. + auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>(); + if (MSSAA) + MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); // Vectorization requires loop-rotation. Use default threshold for loops the // user explicitly marked for vectorization, even when header duplication is // disabled. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index cc6d11220807..a87843d658a9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -733,13 +733,12 @@ public: DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>(); Optional<MemorySSAUpdater> MSSAU; - if (EnableMSSALoopDependency) { - MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); - if (VerifyMemorySSA) - MSSA->verifyMemorySSA(); - } + if (MSSAA) + MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); + if (MSSAA && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); bool DeleteCurrentLoop = false; bool Changed = simplifyLoopCFG( *L, DT, LI, SE, MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr, @@ -750,10 +749,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - if (EnableMSSALoopDependency) { - AU.addRequired<MemorySSAWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addPreserved<MemorySSAWrapperPass>(); AU.addPreserved<DependenceAnalysisWrapperPass>(); getLoopAnalysisUsage(AU); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp index a01287f587d7..c9c9e60d0921 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -323,15 +323,14 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // Traverse preheader's instructions in reverse order becaue if A depends // on B (A appears after B), A needs to be sinked first before B can be // sinked. - for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) { - Instruction *I = &*II++; + for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { // No need to check for instruction's operands are loop invariant. - assert(L.hasLoopInvariantOperands(I) && + assert(L.hasLoopInvariantOperands(&I) && "Insts in a loop's preheader should have loop invariant operands!"); - if (!canSinkOrHoistInst(*I, &AA, &DT, &L, CurAST, MSSAU.get(), false, + if (!canSinkOrHoistInst(I, &AA, &DT, &L, CurAST, MSSAU.get(), false, LICMFlags.get())) continue; - if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI, + if (sinkInstruction(L, I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI, MSSAU.get())) Changed = true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 404852f1dd4d..a9a2266e1196 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -136,6 +136,12 @@ using namespace llvm; /// worst cases before LSR burns too much compile time and stack space. static const unsigned MaxIVUsers = 200; +/// Limit the size of expression that SCEV-based salvaging will attempt to +/// translate into a DIExpression. +/// Choose a maximum size such that debuginfo is not excessively increased and +/// the salvaging is not too expensive for the compiler. +static const unsigned MaxSCEVSalvageExpressionSize = 64; + // Temporary flag to cleanup congruent phis after LSR phi expansion. // It's currently disabled until we can determine whether it's truly useful or // not. The flag should be removed after the v3.0 release. @@ -689,7 +695,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, const APInt &RA = RC->getAPInt(); // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do // some folding. - if (RA.isAllOnesValue()) { + if (RA.isAllOnes()) { if (LHS->getType()->isPointerTy()) return nullptr; return SE.getMulExpr(LHS, RC); @@ -2816,9 +2822,7 @@ static const SCEV *getExprBase(const SCEV *S) { // there's nothing more complex. // FIXME: not sure if we want to recognize negation. 
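Several hunks in this file set (ADCE, CallSiteSplitting, LoopInstSimplify, the LoopSink change above, and the getExprBase loop that resumes below) replace hand-rolled iterator loops with llvm::reverse and llvm::make_early_inc_range. The point of the early-increment pattern is that the iterator is advanced before the current element may be erased, so deletion cannot invalidate the traversal. A minimal illustration with std::list standing in for LLVM's intrusive lists (purely illustrative, not the LLVM ADT):

#include <cassert>
#include <list>

int main() {
  std::list<int> insts = {1, 2, 3, 4, 5, 6};

  // Early-increment traversal: grab the next position before possibly erasing
  // the current element (this is what make_early_inc_range automates).
  for (auto it = insts.begin(); it != insts.end();) {
    auto cur = it++;
    if (*cur % 2 == 0)
      insts.erase(cur); // safe: `it` already points past the erased node
  }

  assert(insts == std::list<int>({1, 3, 5}));
  return 0;
}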
const SCEVAddExpr *Add = cast<SCEVAddExpr>(S); - for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()), - E(Add->op_begin()); I != E; ++I) { - const SCEV *SubExpr = *I; + for (const SCEV *SubExpr : reverse(Add->operands())) { if (SubExpr->getSCEVType() == scAddExpr) return getExprBase(SubExpr); @@ -3150,7 +3154,7 @@ void LSRInstance::CollectChains() { void LSRInstance::FinalizeChain(IVChain &Chain) { assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); - + for (const IVInc &Inc : Chain) { LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n"); auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand); @@ -3385,7 +3389,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { // Mark uses whose expressions cannot be expanded. - if (!isSafeToExpand(S, SE)) + if (!isSafeToExpand(S, SE, /*CanonicalMode*/ false)) LU.RigidFormula = true; Formula F; @@ -3934,6 +3938,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Check each interesting stride. for (int64_t Factor : Factors) { + // Check that Factor can be represented by IntTy + if (!ConstantInt::isValueValidForType(IntTy, Factor)) + continue; // Check that the multiplication doesn't overflow. if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1) continue; @@ -4082,6 +4089,14 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { if (DstTy->isPointerTy()) return; + // It is invalid to extend a pointer type so exit early if ScaledReg or + // any of the BaseRegs are pointers. + if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy()) + return; + if (any_of(Base.BaseRegs, + [](const SCEV *S) { return S->getType()->isPointerTy(); })) + return; + for (Type *SrcTy : Types) { if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { Formula F = Base; @@ -5689,23 +5704,6 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, } } -#ifndef NDEBUG - // All dominating loops must have preheaders, or SCEVExpander may not be able - // to materialize an AddRecExpr whose Start is an outer AddRecExpr. - // - // IVUsers analysis should only create users that are dominated by simple loop - // headers. Since this loop should dominate all of its users, its user list - // should be empty if this loop itself is not within a simple loop nest. 
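Among the LSR hunks above, GenerateICmpZeroScales now rejects scale factors that cannot be represented in the use's integer type, and GenerateTruncates bails out on pointer-typed registers. A standalone sketch of the kind of signed range check the first of these performs, with plain integers in place of ConstantInt::isValueValidForType (the helper is hypothetical):

#include <cassert>
#include <cstdint>

// True if the signed value `v` fits in a two's-complement integer that is
// `bits` wide (1 <= bits <= 64).
static bool fitsInSignedBits(int64_t v, unsigned bits) {
  if (bits >= 64)
    return true;
  const int64_t maxVal = (int64_t{1} << (bits - 1)) - 1;
  const int64_t minVal = -maxVal - 1;
  return v >= minVal && v <= maxVal;
}

int main() {
  assert(fitsInSignedBits(127, 8));                // i8 holds 127
  assert(!fitsInSignedBits(128, 8));               // ...but not 128
  assert(fitsInSignedBits(-128, 8));               // two's-complement minimum
  assert(!fitsInSignedBits(int64_t{1} << 40, 32)); // too wide for i32
  return 0;
}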
- for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader()); - Rung; Rung = Rung->getIDom()) { - BasicBlock *BB = Rung->getBlock(); - const Loop *DomLoop = LI.getLoopFor(BB); - if (DomLoop && DomLoop->getHeader() == BB) { - assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest"); - } - } -#endif // DEBUG - LLVM_DEBUG(dbgs() << "\nLSR on loop "; L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false); dbgs() << ":\n"); @@ -5870,6 +5868,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<MemorySSAWrapperPass>(); } +namespace { struct SCEVDbgValueBuilder { SCEVDbgValueBuilder() = default; SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { @@ -6117,14 +6116,15 @@ struct DVIRecoveryRec { Metadata *LocationOp; const llvm::SCEV *SCEV; }; +} // namespace -static bool RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI, +static void RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI, const SCEVDbgValueBuilder &IterationCount, ScalarEvolution &SE) { // LSR may add locations to previously single location-op DVIs which // are currently not supported. if (CachedDVI.DVI->getNumVariableLocationOps() != 1) - return false; + return; // SCEVs for SSA values are most frquently of the form // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..). @@ -6132,48 +6132,70 @@ static bool RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI, // SCEVs have not been observed to result in debuginfo-lossy optimisations, // so its not expected this point will be reached. if (!isa<SCEVAddRecExpr>(CachedDVI.SCEV)) - return false; + return; LLVM_DEBUG(dbgs() << "scev-salvage: Value to salvage SCEV: " << *CachedDVI.SCEV << '\n'); const auto *Rec = cast<SCEVAddRecExpr>(CachedDVI.SCEV); if (!Rec->isAffine()) - return false; + return; + + if (CachedDVI.SCEV->getExpressionSize() > MaxSCEVSalvageExpressionSize) + return; // Initialise a new builder with the iteration count expression. In // combination with the value's SCEV this enables recovery. 
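The hunks that resume below add RewriteDVIUsingOffset for the easy salvage case: when the SCEV of the value a dbg.value described and the SCEV of the surviving induction variable differ only by a constant, the debug location is rewritten as the IV plus a DIExpression offset, keeping the full iteration-count rewrite as the fallback. A rough standalone sketch of that decision, with a toy affine pair standing in for a SCEVAddRecExpr (all names here are illustrative, not LLVM's API):

#include <cassert>
#include <cstdint>
#include <optional>

// Toy stand-in for an affine add-recurrence: value(i) = start + i * stride.
struct Affine {
  int64_t start;
  int64_t stride;
};

// Mirrors the idea behind computeConstantDifference: two affine expressions
// differ by a constant only when their strides match.
static std::optional<int64_t> constantDifference(Affine a, Affine b) {
  if (a.stride != b.stride)
    return std::nullopt;
  return a.start - b.start;
}

int main() {
  Affine iv{/*start=*/0, /*stride=*/4};
  Affine dyingValue{/*start=*/16, /*stride=*/4};

  // Same stride, starts 16 apart: rewrite the dbg.value as IV + 16.
  auto diff = constantDifference(dyingValue, iv);
  assert(diff && *diff == 16);

  // Different strides: no constant difference, so fall back to rebuilding the
  // value from the iteration count.
  assert(!constantDifference(Affine{0, 8}, iv));
  return 0;
}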
SCEVDbgValueBuilder RecoverValue(IterationCount); if (!RecoverValue.SCEVToValueExpr(*Rec, SE)) - return false; + return; LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *CachedDVI.DVI << '\n'); RecoverValue.applyExprToDbgValue(*CachedDVI.DVI, CachedDVI.Expr); LLVM_DEBUG(dbgs() << "scev-salvage: to: " << *CachedDVI.DVI << '\n'); - return true; } -static bool +static void RewriteDVIUsingOffset(DVIRecoveryRec &DVIRec, llvm::PHINode &IV, + int64_t Offset) { + assert(!DVIRec.DVI->hasArgList() && "Expected single location-op dbg.value."); + DbgValueInst *DVI = DVIRec.DVI; + SmallVector<uint64_t, 8> Ops; + DIExpression::appendOffset(Ops, Offset); + DIExpression *Expr = DIExpression::prependOpcodes(DVIRec.Expr, Ops, true); + LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *DVIRec.DVI << '\n'); + DVI->setExpression(Expr); + llvm::Value *ValIV = dyn_cast<llvm::Value>(&IV); + DVI->replaceVariableLocationOp( + 0u, llvm::MetadataAsValue::get(DVI->getContext(), + llvm::ValueAsMetadata::get(ValIV))); + LLVM_DEBUG(dbgs() << "scev-salvage: updated with offset to IV: " + << *DVIRec.DVI << '\n'); +} + +static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector<DVIRecoveryRec, 2> &DVIToUpdate) { if (DVIToUpdate.empty()) - return false; + return; const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar); assert(SCEVInductionVar && "Anticipated a SCEV for the post-LSR induction variable"); - bool Changed = false; if (const SCEVAddRecExpr *IVAddRec = dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) { if (!IVAddRec->isAffine()) - return false; + return; + if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize) + return; + + // The iteration count is required to recover location values. SCEVDbgValueBuilder IterCountExpr; IterCountExpr.pushValue(LSRInductionVar); if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE)) - return false; + return; LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar << '\n'); @@ -6196,14 +6218,26 @@ DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, DVIRec.DVI->setExpression(DVIRec.Expr); } - Changed |= RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE); + LLVM_DEBUG(dbgs() << "scev-salvage: value to recover SCEV: " + << *DVIRec.SCEV << '\n'); + + // Create a simple expression if the IV and value to salvage SCEVs + // start values differ by only a constant value. + if (Optional<APInt> Offset = + SE.computeConstantDifference(DVIRec.SCEV, SCEVInductionVar)) { + if (Offset.getValue().getMinSignedBits() <= 64) + RewriteDVIUsingOffset(DVIRec, *LSRInductionVar, + Offset.getValue().getSExtValue()); + } else { + RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE); + } } } - return Changed; } /// Identify and cache salvageable DVI locations and expressions along with the -/// corresponding SCEV(s). Also ensure that the DVI is not deleted before +/// corresponding SCEV(s). Also ensure that the DVI is not deleted between +/// cacheing and salvaging. static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector<DVIRecoveryRec, 2> &SalvageableDVISCEVs, @@ -6214,6 +6248,9 @@ DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, if (!DVI) continue; + if (DVI->isUndef()) + continue; + if (DVI->hasArgList()) continue; @@ -6221,6 +6258,16 @@ DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, !SE.isSCEVable(DVI->getVariableLocationOp(0)->getType())) continue; + // SCEVUnknown wraps an llvm::Value, it does not have a start and stride. 
+ // Therefore no translation to DIExpression is performed. + const SCEV *S = SE.getSCEV(DVI->getVariableLocationOp(0)); + if (isa<SCEVUnknown>(S)) + continue; + + // Avoid wasting resources generating an expression containing undef. + if (SE.containsUndefs(S)) + continue; + SalvageableDVISCEVs.push_back( {DVI, DVI->getExpression(), DVI->getRawLocation(), SE.getSCEV(DVI->getVariableLocationOp(0))}); @@ -6234,33 +6281,32 @@ DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, /// surviving subsequent transforms. static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR) { - // For now, just pick the first IV generated and inserted. Ideally pick an IV - // that is unlikely to be optimised away by subsequent transforms. + + auto IsSuitableIV = [&](PHINode *P) { + if (!SE.isSCEVable(P->getType())) + return false; + if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P))) + return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P)); + return false; + }; + + // For now, just pick the first IV that was generated and inserted by + // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away + // by subsequent transforms. for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) { if (!IV) continue; - assert(isa<PHINode>(&*IV) && "Expected PhI node."); - if (SE.isSCEVable((*IV).getType())) { - PHINode *Phi = dyn_cast<PHINode>(&*IV); - LLVM_DEBUG(dbgs() << "scev-salvage: IV : " << *IV - << "with SCEV: " << *SE.getSCEV(Phi) << "\n"); - return Phi; - } - } + // There should only be PHI node IVs. + PHINode *P = cast<PHINode>(&*IV); - for (PHINode &Phi : L.getHeader()->phis()) { - if (!SE.isSCEVable(Phi.getType())) - continue; - - const llvm::SCEV *PhiSCEV = SE.getSCEV(&Phi); - if (const llvm::SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(PhiSCEV)) - if (!Rec->isAffine()) - continue; + if (IsSuitableIV(P)) + return P; + } - LLVM_DEBUG(dbgs() << "scev-salvage: Selected IV from loop header: " << Phi - << " with SCEV: " << *PhiSCEV << "\n"); - return Phi; + for (PHINode &P : L.getHeader()->phis()) { + if (IsSuitableIV(&P)) + return &P; } return nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 71eb393fcdd7..1ecbb86724e1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -286,8 +286,8 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, AssumptionCache &AC, DependenceInfo &DI, OptimizationRemarkEmitter &ORE, int OptLevel) { TargetTransformInfo::UnrollingPreferences UP = - gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None, - None, None, None, None, None); + gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, ORE, OptLevel, + None, None, None, None, None, None); TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(L, SE, TTI, None, None); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 49501f324a49..67702520511b 100--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -184,7 +184,8 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max(); /// flags, TTI overrides and user specified parameters.
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + OptimizationRemarkEmitter &ORE, int OptLevel, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) { @@ -214,7 +215,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze; // Override with any target specific settings - TTI.getUnrollingPreferences(L, SE, UP); + TTI.getUnrollingPreferences(L, SE, UP, &ORE); // Apply size attributes bool OptForSize = L->getHeader()->getParent()->hasOptSize() || @@ -318,6 +319,16 @@ struct EstimatedUnrollCost { unsigned RolledDynamicCost; }; +struct PragmaInfo { + PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU) + : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC), + PragmaEnableUnroll(PEU) {} + const bool UserUnrollCount; + const bool PragmaFullUnroll; + const unsigned PragmaCount; + const bool PragmaEnableUnroll; +}; + } // end anonymous namespace /// Figure out if the loop is worth full unrolling. @@ -746,13 +757,132 @@ public: // Returns loop size estimation for unrolled loop, given the unrolling // configuration specified by UP. - uint64_t getUnrolledLoopSize(TargetTransformInfo::UnrollingPreferences &UP) { + uint64_t + getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP, + const unsigned CountOverwrite = 0) const { assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!"); - return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns; + if (CountOverwrite) + return static_cast<uint64_t>(LoopSize - UP.BEInsns) * CountOverwrite + + UP.BEInsns; + else + return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + + UP.BEInsns; } }; +static Optional<unsigned> +shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, + const unsigned TripMultiple, const unsigned TripCount, + const UnrollCostEstimator UCE, + const TargetTransformInfo::UnrollingPreferences &UP) { + + // Using unroll pragma + // 1st priority is unroll count set by "unroll-count" option. + + if (PInfo.UserUnrollCount) { + if (UP.AllowRemainder && + UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold) + return (unsigned)UnrollCount; + } + + // 2nd priority is unroll count set by pragma. + if (PInfo.PragmaCount > 0) { + if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)) && + UCE.getUnrolledLoopSize(UP, PInfo.PragmaCount) < PragmaUnrollThreshold) + return PInfo.PragmaCount; + } + + if (PInfo.PragmaFullUnroll && TripCount != 0) { + if (UCE.getUnrolledLoopSize(UP, TripCount) < PragmaUnrollThreshold) + return TripCount; + } + // if didn't return until here, should continue to other priorties + return None; +} + +static Optional<unsigned> shouldFullUnroll( + Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, + ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, + const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, + const TargetTransformInfo::UnrollingPreferences &UP) { + + if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { + // When computing the unrolled size, note that BEInsns are not replicated + // like the rest of the loop body. 
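As background for the threshold check that follows in this hunk: getUnrolledLoopSize() uses a linear size model in which the backedge instructions (BEInsns) are counted once and the rest of the loop body is counted Count times. A minimal standalone sketch of that model, with illustrative numbers and no LLVM dependencies:

#include <cassert>
#include <cstdint>
#include <iostream>

// Body instructions are replicated Count times, backedge instructions only once.
static std::uint64_t unrolledLoopSize(unsigned LoopSize, unsigned BEInsns,
                                      unsigned Count) {
  assert(LoopSize >= BEInsns && "LoopSize should not be less than BEInsns");
  return static_cast<std::uint64_t>(LoopSize - BEInsns) * Count + BEInsns;
}

int main() {
  // A 12-instruction loop with 2 backedge instructions, unrolled by 4:
  // (12 - 2) * 4 + 2 = 42 estimated instructions.
  std::cout << unrolledLoopSize(12, 2, 4) << '\n';
}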
+ if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) { + return FullUnrollTripCount; + + } else { + // The loop isn't that small, but we still can fully unroll it if that + // helps to remove a significant number of instructions. + // To check that, run additional analysis on the loop. + if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( + L, FullUnrollTripCount, DT, SE, EphValues, TTI, + UP.Threshold * UP.MaxPercentThresholdBoost / 100, + UP.MaxIterationsCountToAnalyze)) { + unsigned Boost = + getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); + if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { + return FullUnrollTripCount; + } + } + } + } + return None; +} + +static Optional<unsigned> +shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, + const UnrollCostEstimator UCE, + const TargetTransformInfo::UnrollingPreferences &UP) { + + unsigned count = UP.Count; + if (TripCount) { + if (!UP.Partial) { + LLVM_DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); + count = 0; + return count; + } + if (count == 0) + count = TripCount; + if (UP.PartialThreshold != NoThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. + if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / + (LoopSize - UP.BEInsns); + if (count > UP.MaxCount) + count = UP.MaxCount; + while (count != 0 && TripCount % count != 0) + count--; + if (UP.AllowRemainder && count <= 1) { + // If there is no Count that is modulo of TripCount, set Count to + // largest power-of-two factor that satisfies the threshold limit. + // As we'll create fixup loop, do the type of unrolling only if + // remainder loop is allowed. + count = UP.DefaultUnrollRuntimeCount; + while (count != 0 && + UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count >>= 1; + } + if (count < 2) { + count = 0; + } + } else { + count = TripCount; + } + if (count > UP.MaxCount) + count = UP.MaxCount; + + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n"); + + return count; + } + + // if didn't return until here, should continue to other priorties + return None; +} // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. // Unless IgnoreUser is true, will also use metadata and command-line options @@ -770,7 +900,18 @@ bool llvm::computeUnrollCount( TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { UnrollCostEstimator UCE(*L, LoopSize); + Optional<unsigned> UnrollFactor; + + const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; + const bool PragmaFullUnroll = hasUnrollFullPragma(L); + const unsigned PragmaCount = unrollCountPragmaValue(L); + const bool PragmaEnableUnroll = hasUnrollEnablePragma(L); + const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || + PragmaEnableUnroll || UserUnrollCount; + + PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount, + PragmaEnableUnroll); // Use an explicit peel count that has been specified for testing. In this // case it's not permitted to also specify an explicit unroll count. if (PP.PeelCount) { @@ -782,47 +923,29 @@ bool llvm::computeUnrollCount( UP.Runtime = false; return true; } - // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. 
- bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; - if (UserUnrollCount) { - UP.Count = UnrollCount; - UP.AllowExpensiveTripCount = true; - UP.Force = true; - if (UP.AllowRemainder && UCE.getUnrolledLoopSize(UP) < UP.Threshold) - return true; - } - // 2nd priority is unroll count set by pragma. - unsigned PragmaCount = unrollCountPragmaValue(L); - if (PragmaCount > 0) { - UP.Count = PragmaCount; - UP.Runtime = true; - UP.AllowExpensiveTripCount = true; - UP.Force = true; - if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) && - UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold) - return true; - } - bool PragmaFullUnroll = hasUnrollFullPragma(L); - if (PragmaFullUnroll && TripCount != 0) { - UP.Count = TripCount; - if (UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold) - return false; - } + UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, UCE, UP); + + if (UnrollFactor) { + UP.Count = *UnrollFactor; - bool PragmaEnableUnroll = hasUnrollEnablePragma(L); - bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || - PragmaEnableUnroll || UserUnrollCount; - - if (ExplicitUnroll && TripCount != 0) { - // If the loop has an unrolling pragma, we want to be more aggressive with - // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold - // value which is larger than the default limits. - UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); - UP.PartialThreshold = - std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); + if (UserUnrollCount || (PragmaCount > 0)) { + UP.AllowExpensiveTripCount = true; + UP.Force = true; + } + UP.Runtime |= (PragmaCount > 0); + return ExplicitUnroll; + } else { + if (ExplicitUnroll && TripCount != 0) { + // If the loop has an unrolling pragma, we want to be more aggressive with + // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold + // value which is larger than the default limits. + UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); + UP.PartialThreshold = + std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); + } } // 3rd priority is full unroll count. @@ -852,71 +975,55 @@ bool llvm::computeUnrollCount( unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount; UP.Count = FullUnrollTripCount; - if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { - // When computing the unrolled size, note that BEInsns are not replicated - // like the rest of the loop body. - if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) { - UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); - return ExplicitUnroll; - } else { - // The loop isn't that small, but we still can fully unroll it if that - // helps to remove a significant number of instructions. - // To check that, run additional analysis on the loop. 
- if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, SE, EphValues, TTI, - UP.Threshold * UP.MaxPercentThresholdBoost / 100, - UP.MaxIterationsCountToAnalyze)) { - unsigned Boost = - getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); - if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { - UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); - return ExplicitUnroll; - } - } - } + + UnrollFactor = + shouldFullUnroll(L, TTI, DT, SE, EphValues, FullUnrollTripCount, UCE, UP); + + // if shouldFullUnroll can do the unrolling, some side parameteres should be + // set + if (UnrollFactor) { + UP.Count = *UnrollFactor; + UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; + return ExplicitUnroll; + } else { + UP.Count = FullUnrollTripCount; } // 4th priority is loop peeling. - computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold); + computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold); if (PP.PeelCount) { UP.Runtime = false; UP.Count = 1; return ExplicitUnroll; } + // Before starting partial unrolling, set up.partial to true, + // if user explicitly asked for unrolling + if (TripCount) + UP.Partial |= ExplicitUnroll; + // 5th priority is partial unrolling. // Try partial unroll only when TripCount could be statically calculated. - if (TripCount) { - UP.Partial |= ExplicitUnroll; - if (!UP.Partial) { - LLVM_DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - UP.Count = 0; - return false; - } - if (UP.Count == 0) - UP.Count = TripCount; + UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP); + + if (UnrollFactor) { + UP.Count = *UnrollFactor; + + if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + UP.Count != TripCount) + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "FullUnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to fully unroll loop as directed by unroll pragma " + "because " + "unrolled size is too large."; + }); + if (UP.PartialThreshold != NoThreshold) { - // Reduce unroll count to be modulo of TripCount for partial unrolling. - if (UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold) - UP.Count = - (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / - (LoopSize - UP.BEInsns); - if (UP.Count > UP.MaxCount) - UP.Count = UP.MaxCount; - while (UP.Count != 0 && TripCount % UP.Count != 0) - UP.Count--; - if (UP.AllowRemainder && UP.Count <= 1) { - // If there is no Count that is modulo of TripCount, set Count to - // largest power-of-two factor that satisfies the threshold limit. - // As we'll create fixup loop, do the type of unrolling only if - // remainder loop is allowed. 
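The removed inline code continues below; both it and the new shouldPartialUnroll() follow the same count-selection strategy. A standalone sketch of that strategy (the MaxCount clamp and the missed-optimization remarks are omitted; all numbers are illustrative):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Prefer an unroll count that evenly divides the trip count; if none fits
// under the partial threshold and a remainder (fixup) loop is allowed, fall
// back to the largest power of two that still fits.
static std::uint64_t unrolledSize(unsigned LoopSize, unsigned BEInsns,
                                  unsigned Count) {
  return static_cast<std::uint64_t>(LoopSize - BEInsns) * Count + BEInsns;
}

static unsigned pickPartialCount(unsigned TripCount, unsigned LoopSize,
                                 unsigned BEInsns, unsigned PartialThreshold,
                                 unsigned DefaultRuntimeCount,
                                 bool AllowRemainder) {
  unsigned Count = TripCount;
  if (unrolledSize(LoopSize, BEInsns, Count) > PartialThreshold)
    Count = (std::max(PartialThreshold, BEInsns + 1) - BEInsns) /
            (LoopSize - BEInsns);
  // Reduce the count until it divides the trip count.
  while (Count != 0 && TripCount % Count != 0)
    --Count;
  // No divisor fits: if a fixup loop is allowed, pick a power-of-two count.
  if (AllowRemainder && Count <= 1) {
    Count = DefaultRuntimeCount;
    while (Count != 0 &&
           unrolledSize(LoopSize, BEInsns, Count) > PartialThreshold)
      Count >>= 1;
  }
  return Count < 2 ? 0 : Count;
}

int main() {
  // Trip count 17 is prime, so no divisor of it fits under the threshold;
  // with a remainder loop allowed, the fallback picks a power of two (4 here).
  std::cout << pickPartialCount(/*TripCount=*/17, /*LoopSize=*/12,
                                /*BEInsns=*/2, /*PartialThreshold=*/60,
                                /*DefaultRuntimeCount=*/8,
                                /*AllowRemainder=*/true)
            << '\n';
}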
- UP.Count = UP.DefaultUnrollRuntimeCount; - while (UP.Count != 0 && - UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold) - UP.Count >>= 1; - } - if (UP.Count < 2) { + if (UP.Count == 0) { if (PragmaEnableUnroll) ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, @@ -926,25 +1033,8 @@ bool llvm::computeUnrollCount( "pragma " "because unrolled size is too large."; }); - UP.Count = 0; } - } else { - UP.Count = TripCount; } - if (UP.Count > UP.MaxCount) - UP.Count = UP.MaxCount; - if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && - UP.Count != TripCount) - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, - "FullUnrollAsDirectedTooLarge", - L->getStartLoc(), L->getHeader()) - << "Unable to fully unroll loop as directed by unroll pragma " - "because " - "unrolled size is too large."; - }); - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count - << "\n"); return ExplicitUnroll; } assert(TripCount == 0 && @@ -981,8 +1071,6 @@ bool llvm::computeUnrollCount( UP.AllowExpensiveTripCount = true; } } - - // Reduce count based on the type of unrolling and the threshold values. UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount; if (!UP.Runtime) { LLVM_DEBUG( @@ -1017,7 +1105,7 @@ bool llvm::computeUnrollCount( using namespace ore; - if (PragmaCount > 0 && !UP.AllowRemainder) + if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder) ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "DifferentUnrollCountFromDirected", @@ -1079,7 +1167,7 @@ static LoopUnrollResult tryToUnrollLoop( bool NotDuplicatable; bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount, + L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedFullUnrollMaxCount); TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences( @@ -1529,3 +1617,25 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, return getLoopPassPreservedAnalyses(); } + +void LoopUnrollPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << "<"; + if (UnrollOpts.AllowPartial != None) + OS << (UnrollOpts.AllowPartial.getValue() ? "" : "no-") << "partial;"; + if (UnrollOpts.AllowPeeling != None) + OS << (UnrollOpts.AllowPeeling.getValue() ? "" : "no-") << "peeling;"; + if (UnrollOpts.AllowRuntime != None) + OS << (UnrollOpts.AllowRuntime.getValue() ? "" : "no-") << "runtime;"; + if (UnrollOpts.AllowUpperBound != None) + OS << (UnrollOpts.AllowUpperBound.getValue() ? "" : "no-") << "upperbound;"; + if (UnrollOpts.AllowProfileBasedPeeling != None) + OS << (UnrollOpts.AllowProfileBasedPeeling.getValue() ? 
"" : "no-") + << "profile-peeling;"; + if (UnrollOpts.FullUnrollMaxCount != None) + OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ";"; + OS << "O" << UnrollOpts.OptLevel; + OS << ">"; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 9a854ff80246..76bb5497c2c2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -232,10 +232,8 @@ namespace { AU.addPreserved<LazyBranchProbabilityInfoPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - if (EnableMSSALoopDependency) { - AU.addRequired<MemorySSAWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); if (HasBranchDivergence) AU.addRequired<LegacyDivergenceAnalysis>(); getLoopAnalysisUsage(AU); @@ -539,11 +537,8 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) { LPM = &LPMRef; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - if (EnableMSSALoopDependency) { - MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); - assert(DT && "Cannot update MemorySSA without a valid DomTree."); - } + MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); CurrentLoop = L; Function *F = CurrentLoop->getHeader()->getParent(); @@ -551,19 +546,19 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) { if (SanitizeMemory) SafetyInfo.computeLoopSafetyInfo(L); - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); bool Changed = false; do { assert(CurrentLoop->isLCSSAForm(*DT)); - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); RedoLoop = false; Changed |= processCurrentLoop(); } while (RedoLoop); - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); return Changed; @@ -1312,8 +1307,7 @@ void LoopUnswitch::splitExitEdges( for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) { BasicBlock *ExitBlock = ExitBlocks[I]; - SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), - pred_end(ExitBlock)); + SmallVector<BasicBlock *, 4> Preds(predecessors(ExitBlock)); // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. 
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp index bd3001988369..186065db327e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp @@ -55,11 +55,17 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II, Value *NewValue, DomTreeUpdater *DTU) { bool HasDeadBlocks = false; - SmallSetVector<Instruction *, 8> Worklist; + SmallSetVector<Instruction *, 8> UnsimplifiedUsers; replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr, - &Worklist); - for (auto I : Worklist) { - BranchInst *BI = dyn_cast<BranchInst>(I); + &UnsimplifiedUsers); + // UnsimplifiedUsers can contain PHI nodes that may be removed when + // replacing the branch instructions, so use a value handle worklist + // to handle those possibly removed instructions. + SmallVector<WeakVH, 8> Worklist(UnsimplifiedUsers.begin(), + UnsimplifiedUsers.end()); + + for (auto &VH : Worklist) { + BranchInst *BI = dyn_cast_or_null<BranchInst>(VH); if (!BI) continue; if (BI->isUnconditional()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index ead8082f3036..1c186e9a0488 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -357,11 +357,10 @@ static bool lowerExpectIntrinsic(Function &F) { // Remove llvm.expect intrinsics. Iterate backwards in order // to process select instructions before the intrinsic gets // removed. - for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) { - Instruction *Inst = &*BI++; - CallInst *CI = dyn_cast<CallInst>(Inst); + for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(BB))) { + CallInst *CI = dyn_cast<CallInst>(&Inst); if (!CI) { - if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) { + if (SelectInst *SI = dyn_cast<SelectInst>(&Inst)) { if (handleBrSelExpect(*SI)) ExpectIntrinsicsHandled++; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 42c183a6408e..4e4097e13271 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -900,8 +900,7 @@ public: // UndefedInsts and then check that we in fact remove them. 
SmallSet<Instruction *, 16> UndefedInsts; for (auto *Inst : reverse(ToRemove)) { - for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) { - Use &U = *I++; + for (Use &U : llvm::make_early_inc_range(Inst->uses())) { if (auto *Undefed = dyn_cast<Instruction>(U.getUser())) UndefedInsts.insert(Undefed); U.set(UndefValue::get(Inst->getType())); @@ -981,8 +980,9 @@ public: Value *EltPtr = createElementPtr(Ptr, EltTy, Builder); MatrixTy Result; for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride, - Shape.getStride(), EltTy, Builder); + Value *GEP = computeVectorAddr( + EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I), + Stride, Shape.getStride(), EltTy, Builder); Value *Vector = Builder.CreateAlignedLoad( VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign), IsVolatile, "col.load"); @@ -1071,9 +1071,11 @@ public: auto VType = cast<VectorType>(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); for (auto Vec : enumerate(StoreVal.vectors())) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()), - Stride, StoreVal.getStride(), - VType->getElementType(), Builder); + Value *GEP = computeVectorAddr( + EltPtr, + Builder.getIntN(Stride->getType()->getScalarSizeInBits(), + Vec.index()), + Stride, StoreVal.getStride(), VType->getElementType(), Builder); Builder.CreateAlignedStore(Vec.value(), GEP, getAlignForIndex(Vec.index(), Stride, VType->getElementType(), @@ -2261,6 +2263,16 @@ PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F, return PreservedAnalyses::all(); } +void LowerMatrixIntrinsicsPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<LowerMatrixIntrinsicsPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << "<"; + if (Minimal) + OS << "minimal"; + OS << ">"; +} + namespace { class LowerMatrixIntrinsicsLegacyPass : public FunctionPass { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9afbe0e9a2a5..67335a45fb58 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" @@ -67,9 +66,10 @@ using namespace llvm; #define DEBUG_TYPE "memcpyopt" -static cl::opt<bool> - EnableMemorySSA("enable-memcpyopt-memoryssa", cl::init(true), cl::Hidden, - cl::desc("Use MemorySSA-backed MemCpyOpt.")); +static cl::opt<bool> EnableMemCpyOptWithoutLibcalls( + "enable-memcpyopt-without-libcalls", cl::init(false), cl::Hidden, + cl::ZeroOrMore, + cl::desc("Enable memcpyopt even when libcalls are disabled")); STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); STATISTIC(NumMemSetInfer, "Number of memsets inferred"); @@ -282,13 +282,9 @@ private: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - if (!EnableMemorySSA) - AU.addRequired<MemoryDependenceWrapperPass>(); - AU.addPreserved<MemoryDependenceWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); 
AU.addPreserved<AAResultsWrapperPass>(); - if (EnableMemorySSA) - AU.addRequired<MemorySSAWrapperPass>(); + AU.addRequired<MemorySSAWrapperPass>(); AU.addPreserved<MemorySSAWrapperPass>(); } }; @@ -304,7 +300,6 @@ INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) @@ -329,10 +324,7 @@ static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start, } void MemCpyOptPass::eraseInstruction(Instruction *I) { - if (MSSAU) - MSSAU->removeMemoryAccess(I); - if (MD) - MD->removeInstruction(I); + MSSAU->removeMemoryAccess(I); I->eraseFromParent(); } @@ -394,14 +386,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, // memsets. MemoryDef *LastMemDef = nullptr; for (++BI; !BI->isTerminator(); ++BI) { - if (MSSAU) { - auto *CurrentAcc = cast_or_null<MemoryUseOrDef>( - MSSAU->getMemorySSA()->getMemoryAccess(&*BI)); - if (CurrentAcc) { - MemInsertPoint = CurrentAcc; - if (auto *CurrentDef = dyn_cast<MemoryDef>(CurrentAcc)) - LastMemDef = CurrentDef; - } + auto *CurrentAcc = cast_or_null<MemoryUseOrDef>( + MSSAU->getMemorySSA()->getMemoryAccess(&*BI)); + if (CurrentAcc) { + MemInsertPoint = CurrentAcc; + if (auto *CurrentDef = dyn_cast<MemoryDef>(CurrentAcc)) + LastMemDef = CurrentDef; } // Calls that only access inaccessible memory do not block merging @@ -503,19 +493,17 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, if (!Range.TheStores.empty()) AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); - if (MSSAU) { - assert(LastMemDef && MemInsertPoint && - "Both LastMemDef and MemInsertPoint need to be set"); - auto *NewDef = - cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI - ? MSSAU->createMemoryAccessBefore( - AMemSet, LastMemDef, MemInsertPoint) - : MSSAU->createMemoryAccessAfter( - AMemSet, LastMemDef, MemInsertPoint)); - MSSAU->insertDef(NewDef, /*RenameUses=*/true); - LastMemDef = NewDef; - MemInsertPoint = NewDef; - } + assert(LastMemDef && MemInsertPoint && + "Both LastMemDef and MemInsertPoint need to be set"); + auto *NewDef = + cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI + ? MSSAU->createMemoryAccessBefore( + AMemSet, LastMemDef, MemInsertPoint) + : MSSAU->createMemoryAccessAfter( + AMemSet, LastMemDef, MemInsertPoint)); + MSSAU->insertDef(NewDef, /*RenameUses=*/true); + LastMemDef = NewDef; + MemInsertPoint = NewDef; // Zap all the stores. for (Instruction *SI : Range.TheStores) @@ -624,17 +612,15 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) { // TODO: Simplify this once P will be determined by MSSA, in which case the // discrepancy can no longer occur. 
MemoryUseOrDef *MemInsertPoint = nullptr; - if (MSSAU) { - if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(P)) { - MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator()); - } else { - const Instruction *ConstP = P; - for (const Instruction &I : make_range(++ConstP->getReverseIterator(), - ++LI->getReverseIterator())) { - if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(&I)) { - MemInsertPoint = MA; - break; - } + if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(P)) { + MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator()); + } else { + const Instruction *ConstP = P; + for (const Instruction &I : make_range(++ConstP->getReverseIterator(), + ++LI->getReverseIterator())) { + if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(&I)) { + MemInsertPoint = MA; + break; } } } @@ -643,12 +629,10 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) { for (auto *I : llvm::reverse(ToLift)) { LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n"); I->moveBefore(P); - if (MSSAU) { - assert(MemInsertPoint && "Must have found insert point"); - if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) { - MSSAU->moveAfter(MA, MemInsertPoint); - MemInsertPoint = MA; - } + assert(MemInsertPoint && "Must have found insert point"); + if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) { + MSSAU->moveAfter(MA, MemInsertPoint); + MemInsertPoint = MA; } } @@ -682,7 +666,13 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LI->getParent() == SI->getParent()) { auto *T = LI->getType(); - if (T->isAggregateType()) { + // Don't introduce calls to memcpy/memmove intrinsics out of thin air if + // the corresponding libcalls are not available. + // TODO: We should really distinguish between libcall availability and + // our ability to introduce intrinsics. + if (T->isAggregateType() && + (EnableMemCpyOptWithoutLibcalls || + (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) { MemoryLocation LoadLoc = MemoryLocation::get(LI); // We use alias analysis to check if an instruction may store to @@ -712,9 +702,10 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (P) { // If we load from memory that may alias the memory we store to, // memmove must be used to preserve semantic. If not, memcpy can - // be used. + // be used. Also, if we load from constant memory, memcpy can be used + // as the constant memory won't be modified. 
bool UseMemMove = false; - if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc)) + if (isModSet(AA->getModRefInfo(SI, LoadLoc))) UseMemMove = true; uint64_t Size = DL.getTypeStoreSize(T); @@ -733,13 +724,10 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M << "\n"); - if (MSSAU) { - auto *LastDef = - cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)); - auto *NewAccess = - MSSAU->createMemoryAccessAfter(M, LastDef, LastDef); - MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); - } + auto *LastDef = + cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)); + auto *NewAccess = MSSAU->createMemoryAccessAfter(M, LastDef, LastDef); + MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); eraseInstruction(SI); eraseInstruction(LI); @@ -755,38 +743,21 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // happen to be using a load-store pair to implement it, rather than // a memcpy. CallInst *C = nullptr; - if (EnableMemorySSA) { - if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>( - MSSA->getWalker()->getClobberingMemoryAccess(LI))) { - // The load most post-dom the call. Limit to the same block for now. - // TODO: Support non-local call-slot optimization? - if (LoadClobber->getBlock() == SI->getParent()) - C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst()); - } - } else { - MemDepResult ldep = MD->getDependency(LI); - if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) - C = dyn_cast<CallInst>(ldep.getInst()); + if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>( + MSSA->getWalker()->getClobberingMemoryAccess(LI))) { + // The load most post-dom the call. Limit to the same block for now. + // TODO: Support non-local call-slot optimization? + if (LoadClobber->getBlock() == SI->getParent()) + C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst()); } if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. MemoryLocation StoreLoc = MemoryLocation::get(SI); - if (EnableMemorySSA) { - if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C), - MSSA->getMemoryAccess(SI))) - C = nullptr; - } else { - for (BasicBlock::iterator I = --SI->getIterator(), - E = C->getIterator(); - I != E; --I) { - if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) { - C = nullptr; - break; - } - } - } + if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C), + MSSA->getMemoryAccess(SI))) + C = nullptr; } if (C) { @@ -805,6 +776,13 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } } + // The following code creates memset intrinsics out of thin air. Don't do + // this if the corresponding libfunc is not available. + // TODO: We should really distinguish between libcall availability and + // our ability to introduce intrinsics. + if (!(TLI->has(LibFunc_memset) || EnableMemCpyOptWithoutLibcalls)) + return false; + // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. 
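The memset case mentioned above applies to stores whose value is a single byte repeated across the whole store width; LLVM answers that question on IR with isBytewiseValue(). A standalone illustration of the property for plain integers:

#include <cstdint>
#include <iostream>
#include <optional>

// Returns the repeated byte if every byte of V's low `Bytes` bytes is equal,
// i.e. the stored value could instead be produced by a memset of that byte.
static std::optional<std::uint8_t> repeatedByte(std::uint64_t V,
                                                unsigned Bytes) {
  std::uint8_t B = V & 0xff;
  for (unsigned I = 1; I < Bytes; ++I)
    if (((V >> (8 * I)) & 0xff) != B)
      return std::nullopt;
  return B;
}

int main() {
  std::cout << repeatedByte(0x0000000000000000ULL, 8).has_value() << '\n'; // 1
  std::cout << repeatedByte(0xABABABABABABABABULL, 8).has_value() << '\n'; // 1
  std::cout << repeatedByte(0x12345678ULL, 4).has_value() << '\n';         // 0
}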
@@ -831,13 +809,12 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); - if (MSSAU) { - assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI))); - auto *LastDef = - cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)); - auto *NewAccess = MSSAU->createMemoryAccessAfter(M, LastDef, LastDef); - MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); - } + // The newly inserted memset is immediately overwritten by the original + // store, so we do not need to rename uses. + auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI)); + auto *NewAccess = MSSAU->createMemoryAccessBefore( + M, StoreDef->getDefiningAccess(), StoreDef); + MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false); eraseInstruction(SI); NumMemSetInfer++; @@ -1033,11 +1010,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, cast<AllocaInst>(cpyDest)->setAlignment(srcAlign); } - // Drop any cached information about the call, because we may have changed - // its dependence information by changing its parameter. - if (MD) - MD->removeInstruction(C); - // Update AA metadata // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet @@ -1086,28 +1058,19 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // // TODO: If the code between M and MDep is transparent to the destination "c", // then we could still perform the xform by moving M up to the first memcpy. - if (EnableMemorySSA) { - // TODO: It would be sufficient to check the MDep source up to the memcpy - // size of M, rather than MDep. - if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), - MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M))) - return false; - } else { - // NOTE: This is conservative, it will stop on any read from the source loc, - // not just the defining memcpy. - MemDepResult SourceDep = - MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, - M->getIterator(), M->getParent()); - if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) - return false; - } + // TODO: It would be sufficient to check the MDep source up to the memcpy + // size of M, rather than MDep. + if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M))) + return false; // If the dest of the second might alias the source of the first, then the - // source and dest might overlap. We still want to eliminate the intermediate - // value, but we have to generate a memmove instead of memcpy. + // source and dest might overlap. In addition, if the source of the first + // points to constant memory, they won't overlap by definition. Otherwise, we + // still want to eliminate the intermediate value, but we have to generate a + // memmove instead of memcpy. bool UseMemMove = false; - if (!AA->isNoAlias(MemoryLocation::getForDest(M), - MemoryLocation::getForSource(MDep))) + if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(MDep)))) UseMemMove = true; // If all checks passed, then we can transform M. 
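At the source level, the memcpy-memcpy forwarding performed by processMemCpyMemCpyDependence() (the rewrite itself appears in the next hunk) looks roughly like this; the names are illustrative:

#include <cstddef>
#include <cstdio>
#include <cstring>

// Before: the second copy reads the bytes the first one just wrote.
void before(char *dst, char *tmp, const char *src, std::size_t n) {
  std::memcpy(tmp, src, n); // MDep
  std::memcpy(dst, tmp, n); // M depends on MDep
}

// After forwarding: M reads from the original source, which often makes the
// first copy dead. A memmove is used instead if dst may overlap src; if src
// is constant memory, it cannot be modified, so memcpy is kept.
void after(char *dst, char *tmp, const char *src, std::size_t n) {
  std::memcpy(tmp, src, n); // now a candidate for dead store elimination
  std::memcpy(dst, src, n);
}

int main() {
  char src[8] = "pattern", tmp[8], dst[8];
  before(dst, tmp, src, sizeof(src));
  std::puts(dst);
  after(dst, tmp, src, sizeof(src));
  std::puts(dst);
}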
@@ -1134,12 +1097,10 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); - if (MSSAU) { - assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M))); - auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)); - auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef); - MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); - } + assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M))); + auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)); + auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef); + MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); // Remove the instruction we're replacing. eraseInstruction(M); @@ -1169,30 +1130,16 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, // Check that src and dst of the memcpy aren't the same. While memcpy // operands cannot partially overlap, exact equality is allowed. - if (!AA->isNoAlias(MemoryLocation(MemCpy->getSource(), - LocationSize::precise(1)), - MemoryLocation(MemCpy->getDest(), - LocationSize::precise(1)))) + if (isModSet(AA->getModRefInfo(MemCpy, MemoryLocation::getForSource(MemCpy)))) return false; - if (EnableMemorySSA) { - // We know that dst up to src_size is not written. We now need to make sure - // that dst up to dst_size is not accessed. (If we did not move the memset, - // checking for reads would be sufficient.) - if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet), - MSSA->getMemoryAccess(MemSet), - MSSA->getMemoryAccess(MemCpy))) { - return false; - } - } else { - // We have already checked that dst up to src_size is not accessed. We - // need to make sure that there are no accesses up to dst_size either. - MemDepResult DstDepInfo = MD->getPointerDependencyFrom( - MemoryLocation::getForDest(MemSet), false, MemCpy->getIterator(), - MemCpy->getParent()); - if (DstDepInfo.getInst() != MemSet) - return false; - } + // We know that dst up to src_size is not written. We now need to make sure + // that dst up to dst_size is not accessed. (If we did not move the memset, + // checking for reads would be sufficient.) + if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet), + MSSA->getMemoryAccess(MemSet), + MSSA->getMemoryAccess(MemCpy))) + return false; // Use the same i8* dest as the memcpy, killing the memset dest if different. Value *Dest = MemCpy->getRawDest(); @@ -1242,18 +1189,16 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, SrcSize), MemSet->getOperand(1), MemsetLen, MaybeAlign(Align)); - if (MSSAU) { - assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) && - "MemCpy must be a MemoryDef"); - // The new memset is inserted after the memcpy, but it is known that its - // defining access is the memset about to be removed which immediately - // precedes the memcpy. - auto *LastDef = - cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)); - auto *NewAccess = MSSAU->createMemoryAccessBefore( - NewMemSet, LastDef->getDefiningAccess(), LastDef); - MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); - } + assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) && + "MemCpy must be a MemoryDef"); + // The new memset is inserted after the memcpy, but it is known that its + // defining access is the memset about to be removed which immediately + // precedes the memcpy. 
+ auto *LastDef = + cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)); + auto *NewAccess = MSSAU->createMemoryAccessBefore( + NewMemSet, LastDef->getDefiningAccess(), LastDef); + MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); eraseInstruction(MemSet); return true; @@ -1261,23 +1206,8 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, /// Determine whether the instruction has undefined content for the given Size, /// either because it was freshly alloca'd or started its lifetime. -static bool hasUndefContents(Instruction *I, Value *Size) { - if (isa<AllocaInst>(I)) - return true; - - if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) { - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) - if (II->getIntrinsicID() == Intrinsic::lifetime_start) - if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0))) - if (LTSize->getZExtValue() >= CSize->getZExtValue()) - return true; - } - - return false; -} - -static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V, - MemoryDef *Def, Value *Size) { +static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V, + MemoryDef *Def, Value *Size) { if (MSSA->isLiveOnEntryDef(Def)) return isa<AllocaInst>(getUnderlyingObject(V)); @@ -1351,19 +1281,12 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, // easily represent this location, we use the full 0..CopySize range. MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy); bool CanReduceSize = false; - if (EnableMemorySSA) { - MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet); - MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( - MemSetAccess->getDefiningAccess(), MemCpyLoc); - if (auto *MD = dyn_cast<MemoryDef>(Clobber)) - if (hasUndefContentsMSSA(MSSA, AA, MemCpy->getSource(), MD, CopySize)) - CanReduceSize = true; - } else { - MemDepResult DepInfo = MD->getPointerDependencyFrom( - MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent()); - if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize)) + MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet); + MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + MemSetAccess->getDefiningAccess(), MemCpyLoc); + if (auto *MD = dyn_cast<MemoryDef>(Clobber)) + if (hasUndefContents(MSSA, AA, MemCpy->getSource(), MD, CopySize)) CanReduceSize = true; - } if (!CanReduceSize) return false; @@ -1375,12 +1298,10 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, Instruction *NewM = Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), CopySize, MaybeAlign(MemCpy->getDestAlignment())); - if (MSSAU) { - auto *LastDef = - cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)); - auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef); - MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); - } + auto *LastDef = + cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)); + auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef); + MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); return true; } @@ -1410,151 +1331,90 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { Instruction *NewM = Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), MaybeAlign(M->getDestAlignment()), false); - if (MSSAU) { - auto *LastDef = - cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)); - auto *NewAccess = - 
MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef); - MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); - } + auto *LastDef = + cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)); + auto *NewAccess = + MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef); + MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); eraseInstruction(M); ++NumCpyToSet; return true; } - if (EnableMemorySSA) { - MemoryUseOrDef *MA = MSSA->getMemoryAccess(M); - MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA); - MemoryLocation DestLoc = MemoryLocation::getForDest(M); - const MemoryAccess *DestClobber = - MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc); - - // Try to turn a partially redundant memset + memcpy into - // memcpy + smaller memset. We don't need the memcpy size for this. - // The memcpy most post-dom the memset, so limit this to the same basic - // block. A non-local generalization is likely not worthwhile. - if (auto *MD = dyn_cast<MemoryDef>(DestClobber)) - if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst())) - if (DestClobber->getBlock() == M->getParent()) - if (processMemSetMemCpyDependence(M, MDep)) - return true; - - MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess( - AnyClobber, MemoryLocation::getForSource(M)); - - // There are four possible optimizations we can do for memcpy: - // a) memcpy-memcpy xform which exposes redundance for DSE. - // b) call-memcpy xform for return slot optimization. - // c) memcpy from freshly alloca'd space or space that has just started - // its lifetime copies undefined data, and we can therefore eliminate - // the memcpy in favor of the data that was already at the destination. - // d) memcpy from a just-memset'd source can be turned into memset. - if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) { - if (Instruction *MI = MD->getMemoryInst()) { - if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) { - if (auto *C = dyn_cast<CallInst>(MI)) { - // The memcpy must post-dom the call. Limit to the same block for - // now. Additionally, we need to ensure that there are no accesses - // to dest between the call and the memcpy. Accesses to src will be - // checked by performCallSlotOptzn(). - // TODO: Support non-local call-slot optimization? - if (C->getParent() == M->getParent() && - !accessedBetween(*AA, DestLoc, MD, MA)) { - // FIXME: Can we pass in either of dest/src alignment here instead - // of conservatively taking the minimum? 
- Align Alignment = std::min(M->getDestAlign().valueOrOne(), - M->getSourceAlign().valueOrOne()); - if (performCallSlotOptzn( - M, M, M->getDest(), M->getSource(), - TypeSize::getFixed(CopySize->getZExtValue()), Alignment, - C)) { - LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n" - << " call: " << *C << "\n" - << " memcpy: " << *M << "\n"); - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } - } - } - } - if (auto *MDep = dyn_cast<MemCpyInst>(MI)) - return processMemCpyMemCpyDependence(M, MDep); - if (auto *MDep = dyn_cast<MemSetInst>(MI)) { - if (performMemCpyToMemSetOptzn(M, MDep)) { - LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n"); - eraseInstruction(M); - ++NumCpyToSet; - return true; - } - } - } - - if (hasUndefContentsMSSA(MSSA, AA, M->getSource(), MD, M->getLength())) { - LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n"); - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } - } - } else { - MemDepResult DepInfo = MD->getDependency(M); - - // Try to turn a partially redundant memset + memcpy into - // memcpy + smaller memset. We don't need the memcpy size for this. - if (DepInfo.isClobber()) - if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst())) + MemoryUseOrDef *MA = MSSA->getMemoryAccess(M); + MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA); + MemoryLocation DestLoc = MemoryLocation::getForDest(M); + const MemoryAccess *DestClobber = + MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc); + + // Try to turn a partially redundant memset + memcpy into + // memcpy + smaller memset. We don't need the memcpy size for this. + // The memcpy most post-dom the memset, so limit this to the same basic + // block. A non-local generalization is likely not worthwhile. + if (auto *MD = dyn_cast<MemoryDef>(DestClobber)) + if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst())) + if (DestClobber->getBlock() == M->getParent()) if (processMemSetMemCpyDependence(M, MDep)) return true; - // There are four possible optimizations we can do for memcpy: - // a) memcpy-memcpy xform which exposes redundance for DSE. - // b) call-memcpy xform for return slot optimization. - // c) memcpy from freshly alloca'd space or space that has just started - // its lifetime copies undefined data, and we can therefore eliminate - // the memcpy in favor of the data that was already at the destination. - // d) memcpy from a just-memset'd source can be turned into memset. - if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) { - if (DepInfo.isClobber()) { - if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { - // FIXME: Can we pass in either of dest/src alignment here instead - // of conservatively taking the minimum? - Align Alignment = std::min(M->getDestAlign().valueOrOne(), - M->getSourceAlign().valueOrOne()); - if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(), - TypeSize::getFixed(CopySize->getZExtValue()), - Alignment, C)) { - eraseInstruction(M); - ++NumMemCpyInstr; - return true; + MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess( + AnyClobber, MemoryLocation::getForSource(M)); + + // There are four possible optimizations we can do for memcpy: + // a) memcpy-memcpy xform which exposes redundance for DSE. + // b) call-memcpy xform for return slot optimization. 
+ // c) memcpy from freshly alloca'd space or space that has just started + // its lifetime copies undefined data, and we can therefore eliminate + // the memcpy in favor of the data that was already at the destination. + // d) memcpy from a just-memset'd source can be turned into memset. + if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) { + if (Instruction *MI = MD->getMemoryInst()) { + if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) { + if (auto *C = dyn_cast<CallInst>(MI)) { + // The memcpy must post-dom the call. Limit to the same block for + // now. Additionally, we need to ensure that there are no accesses + // to dest between the call and the memcpy. Accesses to src will be + // checked by performCallSlotOptzn(). + // TODO: Support non-local call-slot optimization? + if (C->getParent() == M->getParent() && + !accessedBetween(*AA, DestLoc, MD, MA)) { + // FIXME: Can we pass in either of dest/src alignment here instead + // of conservatively taking the minimum? + Align Alignment = std::min(M->getDestAlign().valueOrOne(), + M->getSourceAlign().valueOrOne()); + if (performCallSlotOptzn( + M, M, M->getDest(), M->getSource(), + TypeSize::getFixed(CopySize->getZExtValue()), Alignment, + C)) { + LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n" + << " call: " << *C << "\n" + << " memcpy: " << *M << "\n"); + eraseInstruction(M); + ++NumMemCpyInstr; + return true; + } } } } - } - - MemoryLocation SrcLoc = MemoryLocation::getForSource(M); - MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( - SrcLoc, true, M->getIterator(), M->getParent()); - - if (SrcDepInfo.isClobber()) { - if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) + if (auto *MDep = dyn_cast<MemCpyInst>(MI)) return processMemCpyMemCpyDependence(M, MDep); - } else if (SrcDepInfo.isDef()) { - if (hasUndefContents(SrcDepInfo.getInst(), M->getLength())) { - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } - } - - if (SrcDepInfo.isClobber()) - if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst())) + if (auto *MDep = dyn_cast<MemSetInst>(MI)) { if (performMemCpyToMemSetOptzn(M, MDep)) { + LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n"); eraseInstruction(M); ++NumCpyToSet; return true; } + } + } + + if (hasUndefContents(MSSA, AA, M->getSource(), MD, M->getLength())) { + LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n"); + eraseInstruction(M); + ++NumMemCpyInstr; + return true; + } } return false; @@ -1563,12 +1423,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed /// not to alias. bool MemCpyOptPass::processMemMove(MemMoveInst *M) { - if (!TLI->has(LibFunc_memmove)) - return false; - - // See if the pointers alias. - if (!AA->isNoAlias(MemoryLocation::getForDest(M), - MemoryLocation::getForSource(M))) + // See if the source could be modified by this memmove potentially. + if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M)))) return false; LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M @@ -1584,11 +1440,6 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) { // For MemorySSA nothing really changes (except that memcpy may imply stricter // aliasing guarantees). - // MemDep may have over conservative information about this instruction, just - // conservatively flush it from the cache. 
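processMemMove() now keys on whether the memmove could modify its own source; when it cannot, for instance because the source is constant memory, there is no overlap hazard and the call can be rewritten to the cheaper memcpy. A source-level illustration with made-up names:

#include <cstdio>
#include <cstring>

// The source buffer is constant memory: the memmove can never write to it,
// so source and destination cannot overlap in any harmful way.
static const char Greeting[16] = "hello";

void intoBuffer(char *dst) {
  std::memmove(dst, Greeting, sizeof(Greeting)); // candidate for memcpy
}

int main() {
  char buf[16];
  intoBuffer(buf);
  std::puts(buf);
}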
- if (MD) - MD->removeInstruction(M); - ++NumMoveToCpy; return true; } @@ -1601,22 +1452,14 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { Type *ByValTy = CB.getParamByValType(ArgNo); TypeSize ByValSize = DL.getTypeAllocSize(ByValTy); MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize)); + MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB); + if (!CallAccess) + return false; MemCpyInst *MDep = nullptr; - if (EnableMemorySSA) { - MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB); - if (!CallAccess) - return false; - MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( - CallAccess->getDefiningAccess(), Loc); - if (auto *MD = dyn_cast<MemoryDef>(Clobber)) - MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst()); - } else { - MemDepResult DepInfo = MD->getPointerDependencyFrom( - Loc, true, CB.getIterator(), CB.getParent()); - if (!DepInfo.isClobber()) - return false; - MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); - } + MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + CallAccess->getDefiningAccess(), Loc); + if (auto *MD = dyn_cast<MemoryDef>(Clobber)) + MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst()); // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by // a memcpy, see if we can byval from the source of the memcpy instead of the @@ -1655,19 +1498,9 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { // *b = 42; // foo(*a) // It would be invalid to transform the second memcpy into foo(*b). - if (EnableMemorySSA) { - if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), - MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB))) - return false; - } else { - // NOTE: This is conservative, it will stop on any read from the source loc, - // not just the defining memcpy. - MemDepResult SourceDep = MD->getPointerDependencyFrom( - MemoryLocation::getForSource(MDep), false, - CB.getIterator(), MDep->getParent()); - if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) - return false; - } + if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB))) + return false; Value *TmpCast = MDep->getSource(); if (MDep->getSource()->getType() != ByValArg->getType()) { @@ -1734,47 +1567,33 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) { } PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) { - auto *MD = !EnableMemorySSA ? &AM.getResult<MemoryDependenceAnalysis>(F) - : AM.getCachedResult<MemoryDependenceAnalysis>(F); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto *AA = &AM.getResult<AAManager>(F); auto *AC = &AM.getResult<AssumptionAnalysis>(F); auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); - auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F) - : AM.getCachedResult<MemorySSAAnalysis>(F); + auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F); - bool MadeChange = - runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? 
&MSSA->getMSSA() : nullptr); + bool MadeChange = runImpl(F, &TLI, AA, AC, DT, &MSSA->getMSSA()); if (!MadeChange) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); - if (MD) - PA.preserve<MemoryDependenceAnalysis>(); - if (MSSA) - PA.preserve<MemorySSAAnalysis>(); + PA.preserve<MemorySSAAnalysis>(); return PA; } -bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_, - TargetLibraryInfo *TLI_, AliasAnalysis *AA_, - AssumptionCache *AC_, DominatorTree *DT_, - MemorySSA *MSSA_) { +bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_, + AliasAnalysis *AA_, AssumptionCache *AC_, + DominatorTree *DT_, MemorySSA *MSSA_) { bool MadeChange = false; - MD = MD_; TLI = TLI_; AA = AA_; AC = AC_; DT = DT_; MSSA = MSSA_; MemorySSAUpdater MSSAU_(MSSA_); - MSSAU = MSSA_ ? &MSSAU_ : nullptr; - // If we don't have at least memset and memcpy, there is little point of doing - // anything here. These are required by a freestanding implementation, so if - // even they are disabled, there is no point in trying hard. - if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy)) - return false; + MSSAU = &MSSAU_; while (true) { if (!iterateOnFunction(F)) @@ -1782,10 +1601,9 @@ bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_, MadeChange = true; } - if (MSSA_ && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA_->verifyMemorySSA(); - MD = nullptr; return MadeChange; } @@ -1794,17 +1612,11 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; - auto *MDWP = !EnableMemorySSA - ? &getAnalysis<MemoryDependenceWrapperPass>() - : getAnalysisIfAvailable<MemoryDependenceWrapperPass>(); auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto *MSSAWP = EnableMemorySSA - ? &getAnalysis<MemorySSAWrapperPass>() - : getAnalysisIfAvailable<MemorySSAWrapperPass>(); + auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - return Impl.runImpl(F, MDWP ? & MDWP->getMemDep() : nullptr, TLI, AA, AC, DT, - MSSAWP ? &MSSAWP->getMSSA() : nullptr); + return Impl.runImpl(F, TLI, AA, AC, DT, MSSA); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp index f13f24ad2027..aac0deea5be3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -66,15 +66,6 @@ namespace { #define DEBUG_TYPE "mergeicmps" -// Returns true if the instruction is a simple load or a simple store -static bool isSimpleLoadOrStore(const Instruction *I) { - if (const LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->isSimple(); - if (const StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->isSimple(); - return false; -} - // A BCE atom "Binary Compare Expression Atom" represents an integer load // that is a constant offset from a base value, e.g. `a` or `o.c` in the example // at the top. 
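The BCE atoms described above model exactly this kind of source pattern: a chain of equality tests on fields at constant offsets from two bases, which can be merged into one wide memcmp-style comparison when the compared byte ranges are contiguous. An illustrative C++ input (whether the merge actually fires also depends on the target's memcmp expansion support):

#include <iostream>

struct S {
  int a;    // offset 0, 4 bytes
  char b;   // offset 4
  char c;   // offset 5
  short d;  // offset 6, 2 bytes
};

bool chainedCompare(const S &lhs, const S &rhs) {
  // Four compare blocks over contiguous offsets 0..7; mergeicmps can turn
  // this chain into a single 8-byte comparison.
  return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d;
}

int main() {
  S x{1, 'x', 'y', 7}, y{1, 'x', 'y', 7};
  std::cout << chainedCompare(x, y) << '\n'; // 1
}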
@@ -154,6 +145,10 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { return {}; } Value *const Addr = LoadI->getOperand(0); + if (Addr->getType()->getPointerAddressSpace() != 0) { + LLVM_DEBUG(dbgs() << "from non-zero AddressSpace\n"); + return {}; + } auto *const GEP = dyn_cast<GetElementPtrInst>(Addr); if (!GEP) return {}; @@ -234,6 +229,8 @@ class BCECmpBlock { InstructionSet BlockInsts; // The block requires splitting. bool RequireSplit = false; + // Original order of this block in the chain. + unsigned OrigOrder = 0; private: BCECmp Cmp; @@ -244,14 +241,13 @@ bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, // If this instruction may clobber the loads and is in middle of the BCE cmp // block instructions, then bail for now. if (Inst->mayWriteToMemory()) { - // Bail if this is not a simple load or store - if (!isSimpleLoadOrStore(Inst)) - return false; - // Disallow stores that might alias the BCE operands - MemoryLocation LLoc = MemoryLocation::get(Cmp.Lhs.LoadI); - MemoryLocation RLoc = MemoryLocation::get(Cmp.Rhs.LoadI); - if (isModSet(AA.getModRefInfo(Inst, LLoc)) || - isModSet(AA.getModRefInfo(Inst, RLoc))) + auto MayClobber = [&](LoadInst *LI) { + // If a potentially clobbering instruction comes before the load, + // we can still safely sink the load. + return !Inst->comesBefore(LI) && + isModSet(AA.getModRefInfo(Inst, MemoryLocation::get(LI))); + }; + if (MayClobber(Cmp.Lhs.LoadI) || MayClobber(Cmp.Rhs.LoadI)) return false; } // Make sure this instruction does not use any of the BCE cmp block @@ -386,39 +382,83 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons, << Comparison.Rhs().BaseId << " + " << Comparison.Rhs().Offset << "\n"); LLVM_DEBUG(dbgs() << "\n"); + Comparison.OrigOrder = Comparisons.size(); Comparisons.push_back(std::move(Comparison)); } // A chain of comparisons. class BCECmpChain { - public: - BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi, - AliasAnalysis &AA); - - int size() const { return Comparisons_.size(); } +public: + using ContiguousBlocks = std::vector<BCECmpBlock>; -#ifdef MERGEICMPS_DOT_ON - void dump() const; -#endif // MERGEICMPS_DOT_ON + BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi, + AliasAnalysis &AA); bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, DomTreeUpdater &DTU); -private: - static bool IsContiguous(const BCECmpBlock &First, - const BCECmpBlock &Second) { - return First.Lhs().BaseId == Second.Lhs().BaseId && - First.Rhs().BaseId == Second.Rhs().BaseId && - First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset && - First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset; + bool atLeastOneMerged() const { + return any_of(MergedBlocks_, + [](const auto &Blocks) { return Blocks.size() > 1; }); } +private: PHINode &Phi_; - std::vector<BCECmpBlock> Comparisons_; + // The list of all blocks in the chain, grouped by contiguity. 
+ std::vector<ContiguousBlocks> MergedBlocks_; // The original entry block (before sorting); BasicBlock *EntryBlock_; }; +static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) { + return First.Lhs().BaseId == Second.Lhs().BaseId && + First.Rhs().BaseId == Second.Rhs().BaseId && + First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset && + First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset; +} + +static unsigned getMinOrigOrder(const BCECmpChain::ContiguousBlocks &Blocks) { + unsigned MinOrigOrder = std::numeric_limits<unsigned>::max(); + for (const BCECmpBlock &Block : Blocks) + MinOrigOrder = std::min(MinOrigOrder, Block.OrigOrder); + return MinOrigOrder; +} + +/// Given a chain of comparison blocks, groups the blocks into contiguous +/// ranges that can be merged together into a single comparison. +static std::vector<BCECmpChain::ContiguousBlocks> +mergeBlocks(std::vector<BCECmpBlock> &&Blocks) { + std::vector<BCECmpChain::ContiguousBlocks> MergedBlocks; + + // Sort to detect continuous offsets. + llvm::sort(Blocks, + [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) { + return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) < + std::tie(RhsBlock.Lhs(), RhsBlock.Rhs()); + }); + + BCECmpChain::ContiguousBlocks *LastMergedBlock = nullptr; + for (BCECmpBlock &Block : Blocks) { + if (!LastMergedBlock || !areContiguous(LastMergedBlock->back(), Block)) { + MergedBlocks.emplace_back(); + LastMergedBlock = &MergedBlocks.back(); + } else { + LLVM_DEBUG(dbgs() << "Merging block " << Block.BB->getName() << " into " + << LastMergedBlock->back().BB->getName() << "\n"); + } + LastMergedBlock->push_back(std::move(Block)); + } + + // While we allow reordering for merging, do not reorder unmerged comparisons. + // Doing so may introduce branch on poison. + llvm::sort(MergedBlocks, [](const BCECmpChain::ContiguousBlocks &LhsBlocks, + const BCECmpChain::ContiguousBlocks &RhsBlocks) { + return getMinOrigOrder(LhsBlocks) < getMinOrigOrder(RhsBlocks); + }); + + return MergedBlocks; +} + BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi, AliasAnalysis &AA) : Phi_(Phi) { @@ -498,47 +538,9 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi, return; } EntryBlock_ = Comparisons[0].BB; - Comparisons_ = std::move(Comparisons); -#ifdef MERGEICMPS_DOT_ON - errs() << "BEFORE REORDERING:\n\n"; - dump(); -#endif // MERGEICMPS_DOT_ON - // Reorder blocks by LHS. We can do that without changing the - // semantics because we are only accessing dereferencable memory. 
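As a rough model of the new mergeBlocks grouping defined above (a sketch of the idea only, using simplified stand-in types rather than the pass's BCECmpBlock): blocks are sorted so contiguous ranges become adjacent, adjacent ranges are folded into groups, and the groups are then ordered by the smallest original position they contain, so comparisons that do not get merged keep their original evaluation order.

#include <algorithm>
#include <limits>
#include <tuple>
#include <vector>

// Simplified stand-in for a comparison block; the real pass tracks both the
// Lhs and Rhs sides, only one (Base, Offset, Size) triple is modelled here.
struct CmpRange {
  unsigned Base;      // identifier of the base pointer
  unsigned Offset;    // byte offset from the base
  unsigned Size;      // number of bytes compared
  unsigned OrigOrder; // position of the block in the original chain
};

static std::vector<std::vector<CmpRange>>
groupContiguous(std::vector<CmpRange> Blocks) {
  // Sort so that ranges over the same base appear in offset order.
  std::sort(Blocks.begin(), Blocks.end(),
            [](const CmpRange &L, const CmpRange &R) {
              return std::tie(L.Base, L.Offset) < std::tie(R.Base, R.Offset);
            });

  // Fold adjacent, byte-contiguous ranges into one group.
  std::vector<std::vector<CmpRange>> Groups;
  for (const CmpRange &B : Blocks) {
    bool Contiguous =
        !Groups.empty() && Groups.back().back().Base == B.Base &&
        Groups.back().back().Offset + Groups.back().back().Size == B.Offset;
    if (!Contiguous)
      Groups.emplace_back();
    Groups.back().push_back(B);
  }

  // Reordering is only justified for blocks that actually get merged; order
  // the groups by the smallest original position they contain so singleton
  // (unmerged) comparisons keep their original order.
  auto MinOrder = [](const std::vector<CmpRange> &G) {
    unsigned M = std::numeric_limits<unsigned>::max();
    for (const CmpRange &B : G)
      M = std::min(M, B.OrigOrder);
    return M;
  };
  std::sort(Groups.begin(), Groups.end(),
            [&](const std::vector<CmpRange> &L,
                const std::vector<CmpRange> &R) {
              return MinOrder(L) < MinOrder(R);
            });
  return Groups;
}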
- llvm::sort(Comparisons_, - [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) { - return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) < - std::tie(RhsBlock.Lhs(), RhsBlock.Rhs()); - }); -#ifdef MERGEICMPS_DOT_ON - errs() << "AFTER REORDERING:\n\n"; - dump(); -#endif // MERGEICMPS_DOT_ON + MergedBlocks_ = mergeBlocks(std::move(Comparisons)); } -#ifdef MERGEICMPS_DOT_ON -void BCECmpChain::dump() const { - errs() << "digraph dag {\n"; - errs() << " graph [bgcolor=transparent];\n"; - errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n"; - errs() << " edge [color=black];\n"; - for (size_t I = 0; I < Comparisons_.size(); ++I) { - const auto &Comparison = Comparisons_[I]; - errs() << " \"" << I << "\" [label=\"%" - << Comparison.Lhs().Base()->getName() << " + " - << Comparison.Lhs().Offset << " == %" - << Comparison.Rhs().Base()->getName() << " + " - << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8) - << " bytes)\"];\n"; - const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB); - if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n"; - errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n"; - } - errs() << " \"Phi\" [label=\"Phi\"];\n"; - errs() << "}\n\n"; -} -#endif // MERGEICMPS_DOT_ON - namespace { // A class to compute the name of a set of merged basic blocks. @@ -661,47 +663,18 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, DomTreeUpdater &DTU) { - assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain"); - // First pass to check if there is at least one merge. If not, we don't do - // anything and we keep analysis passes intact. - const auto AtLeastOneMerged = [this]() { - for (size_t I = 1; I < Comparisons_.size(); ++I) { - if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) - return true; - } - return false; - }; - if (!AtLeastOneMerged()) - return false; - + assert(atLeastOneMerged() && "simplifying trivial BCECmpChain"); LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block " << EntryBlock_->getName() << "\n"); // Effectively merge blocks. We go in the reverse direction from the phi block // so that the next block is always available to branch to. - const auto mergeRange = [this, &TLI, &AA, &DTU](int I, int Num, - BasicBlock *InsertBefore, - BasicBlock *Next) { - return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num), - InsertBefore, Next, Phi_, TLI, AA, DTU); - }; - int NumMerged = 1; + BasicBlock *InsertBefore = EntryBlock_; BasicBlock *NextCmpBlock = Phi_.getParent(); - for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; --I) { - if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) { - LLVM_DEBUG(dbgs() << "Merging block " << Comparisons_[I].BB->getName() - << " into " << Comparisons_[I + 1].BB->getName() - << "\n"); - ++NumMerged; - } else { - NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock, NextCmpBlock); - NumMerged = 1; - } + for (const auto &Blocks : reverse(MergedBlocks_)) { + InsertBefore = NextCmpBlock = mergeComparisons( + Blocks, InsertBefore, NextCmpBlock, Phi_, TLI, AA, DTU); } - // Insert the entry block for the new chain before the old entry block. - // If the old entry block was the function entry, this ensures that the new - // entry can become the function entry. 
- NextCmpBlock = mergeRange(0, NumMerged, EntryBlock_, NextCmpBlock); // Replace the original cmp chain with the new cmp chain by pointing all // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp @@ -729,13 +702,16 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, // Delete merged blocks. This also removes incoming values in phi. SmallVector<BasicBlock *, 16> DeadBlocks; - for (auto &Cmp : Comparisons_) { - LLVM_DEBUG(dbgs() << "Deleting merged block " << Cmp.BB->getName() << "\n"); - DeadBlocks.push_back(Cmp.BB); + for (const auto &Blocks : MergedBlocks_) { + for (const BCECmpBlock &Block : Blocks) { + LLVM_DEBUG(dbgs() << "Deleting merged block " << Block.BB->getName() + << "\n"); + DeadBlocks.push_back(Block.BB); + } } DeleteDeadBlocks(DeadBlocks, &DTU); - Comparisons_.clear(); + MergedBlocks_.clear(); return true; } @@ -835,8 +811,8 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA, if (Blocks.empty()) return false; BCECmpChain CmpChain(Blocks, Phi, AA); - if (CmpChain.size() < 2) { - LLVM_DEBUG(dbgs() << "skip: only one compare block\n"); + if (!CmpChain.atLeastOneMerged()) { + LLVM_DEBUG(dbgs() << "skip: nothing merged\n"); return false; } @@ -862,9 +838,9 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI, bool MadeChange = false; - for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) { + for (BasicBlock &BB : llvm::drop_begin(F)) { // A Phi operation is always first in a basic block. - if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin())) + if (auto *const Phi = dyn_cast<PHINode>(&*BB.begin())) MadeChange |= processPhi(*Phi, TLI, AA, DTU); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 033fc168a67f..734532a6670c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -420,3 +420,12 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { PA.preserveSet<CFGAnalyses>(); return PA; } + +void MergedLoadStoreMotionPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<MergedLoadStoreMotionPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << "<"; + OS << (Options.SplitFooterBB ? 
"" : "no-") << "split-footer-bb"; + OS << ">"; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index ded5caf53b5a..6dca30d9876e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -282,8 +282,12 @@ NaryReassociatePass::matchAndReassociateMinOrMax(Instruction *I, m_Value(LHS), m_Value(RHS)); if (match(I, MinMaxMatcher)) { OrigSCEV = SE->getSCEV(I); - return dyn_cast_or_null<Instruction>( - tryReassociateMinOrMax(I, MinMaxMatcher, LHS, RHS)); + if (auto *NewMinMax = dyn_cast_or_null<Instruction>( + tryReassociateMinOrMax(I, MinMaxMatcher, LHS, RHS))) + return NewMinMax; + if (auto *NewMinMax = dyn_cast_or_null<Instruction>( + tryReassociateMinOrMax(I, MinMaxMatcher, RHS, LHS))) + return NewMinMax; } return nullptr; } @@ -596,58 +600,60 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I, Value *LHS, Value *RHS) { Value *A = nullptr, *B = nullptr; MaxMinT m_MaxMin(m_Value(A), m_Value(B)); - for (unsigned int i = 0; i < 2; ++i) { - if (!LHS->hasNUsesOrMore(3) && match(LHS, m_MaxMin)) { - const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B); - const SCEV *RHSExpr = SE->getSCEV(RHS); - for (unsigned int j = 0; j < 2; ++j) { - if (j == 0) { - if (BExpr == RHSExpr) - continue; - // Transform 'I = (A op B) op RHS' to 'I = (A op RHS) op B' on the - // first iteration. - std::swap(BExpr, RHSExpr); - } else { - if (AExpr == RHSExpr) - continue; - // Transform 'I = (A op RHS) op B' 'I = (B op RHS) op A' on the second - // iteration. - std::swap(AExpr, RHSExpr); - } - - // The optimization is profitable only if LHS can be removed in the end. - // In other words LHS should be used (directly or indirectly) by I only. - if (llvm::any_of(LHS->users(), [&](auto *U) { - return U != I && !(U->hasOneUser() && *U->users().begin() == I); - })) - continue; - - SCEVExpander Expander(*SE, *DL, "nary-reassociate"); - SmallVector<const SCEV *, 2> Ops1{ BExpr, AExpr }; - const SCEVTypes SCEVType = convertToSCEVype(m_MaxMin); - const SCEV *R1Expr = SE->getMinMaxExpr(SCEVType, Ops1); - - Instruction *R1MinMax = findClosestMatchingDominator(R1Expr, I); - - if (!R1MinMax) - continue; - - LLVM_DEBUG(dbgs() << "NARY: Found common sub-expr: " << *R1MinMax - << "\n"); - - R1Expr = SE->getUnknown(R1MinMax); - SmallVector<const SCEV *, 2> Ops2{ RHSExpr, R1Expr }; - const SCEV *R2Expr = SE->getMinMaxExpr(SCEVType, Ops2); - - Value *NewMinMax = Expander.expandCodeFor(R2Expr, I->getType(), I); - NewMinMax->setName(Twine(I->getName()).concat(".nary")); - - LLVM_DEBUG(dbgs() << "NARY: Deleting: " << *I << "\n" - << "NARY: Inserting: " << *NewMinMax << "\n"); - return NewMinMax; - } - } - std::swap(LHS, RHS); + + if (LHS->hasNUsesOrMore(3) || + // The optimization is profitable only if LHS can be removed in the end. + // In other words LHS should be used (directly or indirectly) by I only. 
+ llvm::any_of(LHS->users(), + [&](auto *U) { + return U != I && + !(U->hasOneUser() && *U->users().begin() == I); + }) || + !match(LHS, m_MaxMin)) + return nullptr; + + auto tryCombination = [&](Value *A, const SCEV *AExpr, Value *B, + const SCEV *BExpr, Value *C, + const SCEV *CExpr) -> Value * { + SmallVector<const SCEV *, 2> Ops1{BExpr, AExpr}; + const SCEVTypes SCEVType = convertToSCEVype(m_MaxMin); + const SCEV *R1Expr = SE->getMinMaxExpr(SCEVType, Ops1); + + Instruction *R1MinMax = findClosestMatchingDominator(R1Expr, I); + + if (!R1MinMax) + return nullptr; + + LLVM_DEBUG(dbgs() << "NARY: Found common sub-expr: " << *R1MinMax << "\n"); + + SmallVector<const SCEV *, 2> Ops2{SE->getUnknown(C), + SE->getUnknown(R1MinMax)}; + const SCEV *R2Expr = SE->getMinMaxExpr(SCEVType, Ops2); + + SCEVExpander Expander(*SE, *DL, "nary-reassociate"); + Value *NewMinMax = Expander.expandCodeFor(R2Expr, I->getType(), I); + NewMinMax->setName(Twine(I->getName()).concat(".nary")); + + LLVM_DEBUG(dbgs() << "NARY: Deleting: " << *I << "\n" + << "NARY: Inserting: " << *NewMinMax << "\n"); + return NewMinMax; + }; + + const SCEV *AExpr = SE->getSCEV(A); + const SCEV *BExpr = SE->getSCEV(B); + const SCEV *RHSExpr = SE->getSCEV(RHS); + + if (BExpr != RHSExpr) { + // Try (A op RHS) op B + if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr)) + return NewMinMax; + } + + if (AExpr != RHSExpr) { + // Try (RHS op B) op A + if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr)) + return NewMinMax; } + return nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp index a137d13c6ea0..91215cd19e2b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1194,9 +1194,10 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; - } else if (isa<GetElementPtrInst>(I)) { - Value *V = SimplifyGEPInst( - E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ); + } else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) { + Value *V = SimplifyGEPInst(GEPI->getSourceElementType(), + ArrayRef<Value *>(E->op_begin(), E->op_end()), + GEPI->isInBounds(), SQ); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (AllConstant) { @@ -1818,7 +1819,7 @@ NewGVN::ExprResult NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { // See if we know something about the comparison itself, like it is the target // of an assume. auto *CmpPI = PredInfo->getPredicateInfoFor(I); - if (dyn_cast_or_null<PredicateAssume>(CmpPI)) + if (isa_and_nonnull<PredicateAssume>(CmpPI)) return ExprResult::some( createConstantExpression(ConstantInt::getTrue(CI->getType()))); @@ -3606,7 +3607,7 @@ void NewGVN::convertClassToDFSOrdered( // Skip uses in unreachable blocks, as we're going // to delete them. 
- if (ReachableBlocks.count(IBlock) == 0) + if (!ReachableBlocks.contains(IBlock)) continue; DomTreeNode *DomNode = DT->getNode(IBlock); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 7872c553b412..44027ccd92ca 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -82,7 +82,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // Add attribute "readnone" so that backend can use a native sqrt instruction // for this call. - Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone); + Call->addFnAttr(Attribute::ReadNone); // Insert a FP compare instruction and use it as the CurrBB branch condition. Builder.SetInsertPoint(CurrBBTerm); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index 888edc4d69a8..b0fb8daaba8f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -140,7 +140,7 @@ XorOpnd::XorOpnd(Value *V) { // view the operand as "V | 0" SymbolicPart = V; - ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits()); + ConstPart = APInt::getZero(V->getType()->getScalarSizeInBits()); isOr = true; } @@ -1279,10 +1279,10 @@ static Value *OptimizeAndOrXor(unsigned Opcode, /// be returned. static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, const APInt &ConstOpnd) { - if (ConstOpnd.isNullValue()) + if (ConstOpnd.isZero()) return nullptr; - if (ConstOpnd.isAllOnesValue()) + if (ConstOpnd.isAllOnes()) return Opnd; Instruction *I = BinaryOperator::CreateAnd( @@ -1304,7 +1304,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, // = ((x | c1) ^ c1) ^ (c1 ^ c2) // = (x & ~c1) ^ (c1 ^ c2) // It is useful only when c1 == c2. - if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue()) + if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isZero()) return false; if (!Opnd1->getValue()->hasOneUse()) @@ -1361,7 +1361,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3((~C1) ^ C2); // Do not increase code size! - if (!C3.isNullValue() && !C3.isAllOnesValue()) { + if (!C3.isZero() && !C3.isAllOnes()) { int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2; if (NewInstNum > DeadInstNum) return false; @@ -1377,7 +1377,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3 = C1 ^ C2; // Do not increase code size - if (!C3.isNullValue() && !C3.isAllOnesValue()) { + if (!C3.isZero() && !C3.isAllOnes()) { int NewInstNum = ConstOpnd.getBoolValue() ? 
1 : 2; if (NewInstNum > DeadInstNum) return false; @@ -1468,8 +1468,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, Value *CV; // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" - if (!ConstOpnd.isNullValue() && - CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { + if (!ConstOpnd.isZero() && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { Changed = true; if (CV) *CurrOpnd = XorOpnd(CV); @@ -1510,7 +1509,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, ValueEntry VE(getRank(O.getValue()), O.getValue()); Ops.push_back(VE); } - if (!ConstOpnd.isNullValue()) { + if (!ConstOpnd.isZero()) { Value *C = ConstantInt::get(Ty, ConstOpnd); ValueEntry VE(getRank(C), C); Ops.push_back(VE); @@ -1519,7 +1518,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, if (Sz == 1) return Ops.back().Op; if (Sz == 0) { - assert(ConstOpnd.isNullValue()); + assert(ConstOpnd.isZero()); return ConstantInt::get(Ty, ConstOpnd); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index bc0fecc972fc..2d3490b2d29e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -755,7 +755,7 @@ public: } bool operator==(const BDVState &Other) const { - return OriginalValue == OriginalValue && BaseValue == Other.BaseValue && + return OriginalValue == Other.OriginalValue && BaseValue == Other.BaseValue && Status == Other.Status; } @@ -910,7 +910,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { #ifndef NDEBUG VerifyStates(); LLVM_DEBUG(dbgs() << "States after initialization:\n"); - for (auto Pair : States) { + for (const auto &Pair : States) { LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } #endif @@ -1002,7 +1002,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { #ifndef NDEBUG VerifyStates(); LLVM_DEBUG(dbgs() << "States after meet iteration:\n"); - for (auto Pair : States) { + for (const auto &Pair : States) { LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } #endif @@ -1163,7 +1163,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // llvm::Value of the correct type (and still remain pure). // This will remove the need to add bitcasts. assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() && - "Sanity -- findBaseOrBDV should be pure!"); + "findBaseOrBDV should be pure!"); #endif } Value *Base = BlockToValue[InBB]; @@ -1377,11 +1377,11 @@ static AttributeList legalizeCallAttributes(LLVMContext &Ctx, return AL; // Remove the readonly, readnone, and statepoint function attributes. - AttrBuilder FnAttrs = AL.getFnAttributes(); + AttrBuilder FnAttrs = AL.getFnAttrs(); for (auto Attr : FnAttrsToStrip) FnAttrs.removeAttribute(Attr); - for (Attribute A : AL.getFnAttributes()) { + for (Attribute A : AL.getFnAttrs()) { if (isStatepointDirectiveAttr(A)) FnAttrs.remove(A); } @@ -1533,9 +1533,8 @@ static StringRef getDeoptLowering(CallBase *Call) { // FIXME: Calls have a *really* confusing interface around attributes // with values. 
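The attribute queries rewritten just below follow the newer AttributeList accessors; a minimal sketch of the read-side pattern, where getStringFnAttr is an invented helper and only the hasFnAttr/getFnAttr/getValueAsString calls mirror the diff:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Invented helper, for illustration only: fetch a string-valued function
// attribute (e.g. "deopt-lowering") from the call site, falling back to the
// callee's definition when the call site does not carry it.
static StringRef getStringFnAttr(const CallBase &Call, StringRef Kind,
                                 StringRef Default) {
  const AttributeList &AL = Call.getAttributes();
  if (AL.hasFnAttr(Kind))
    return AL.getFnAttr(Kind).getValueAsString();
  if (const Function *F = Call.getCalledFunction())
    if (F->hasFnAttribute(Kind))
      return F->getFnAttribute(Kind).getValueAsString();
  return Default;
}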
const AttributeList &CSAS = Call->getAttributes(); - if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering)) - return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering) - .getValueAsString(); + if (CSAS.hasFnAttr(DeoptLowering)) + return CSAS.getFnAttr(DeoptLowering).getValueAsString(); Function *F = Call->getCalledFunction(); assert(F && F->hasFnAttribute(DeoptLowering)); return F->getFnAttribute(DeoptLowering).getValueAsString(); @@ -1801,7 +1800,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ CallInst *GCResult = Builder.CreateGCResult(Token, Call->getType(), Name); GCResult->setAttributes( AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex, - Call->getAttributes().getRetAttributes())); + Call->getAttributes().getRetAttrs())); // We cannot RAUW or delete CS.getInstruction() because it could be in the // live set of some other safepoint, in which case that safepoint's @@ -1855,7 +1854,7 @@ makeStatepointExplicit(DominatorTree &DT, CallBase *Call, // It receives iterator to the statepoint gc relocates and emits a store to the // assigned location (via allocaMap) for the each one of them. It adds the // visited values into the visitedLiveValues set, which we will later use them -// for sanity checking. +// for validation checking. static void insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, DenseMap<Value *, AllocaInst *> &AllocaMap, @@ -2454,7 +2453,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, SmallVectorImpl<CallBase *> &ToUpdate, DefiningValueMapTy &DVCache) { #ifndef NDEBUG - // sanity check the input + // Validate the input std::set<CallBase *> Uniqued; Uniqued.insert(ToUpdate.begin(), ToUpdate.end()); assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!"); @@ -2620,9 +2619,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // we just grab that. llvm::append_range(Live, Info.StatepointToken->gc_args()); #ifndef NDEBUG - // Do some basic sanity checks on our liveness results before performing - // relocation. Relocation can and will turn mistakes in liveness results - // into non-sensical code which is must harder to debug. + // Do some basic validation checking on our liveness results before + // performing relocation. Relocation can and will turn mistakes in liveness + // results into non-sensical code which is must harder to debug. 
// TODO: It would be nice to test consistency as well assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) && "statepoint must be reachable or liveness is meaningless"); @@ -2641,7 +2640,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, unique_unsorted(Live); #ifndef NDEBUG - // sanity check + // Validation check for (auto *Ptr : Live) assert(isHandledGCPointerType(Ptr->getType()) && "must be a gc pointer type"); @@ -2656,18 +2655,19 @@ template <typename AttrHolder> static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, unsigned Index) { AttrBuilder R; - if (AH.getDereferenceableBytes(Index)) + AttributeSet AS = AH.getAttributes().getAttributes(Index); + if (AS.getDereferenceableBytes()) R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, - AH.getDereferenceableBytes(Index))); - if (AH.getDereferenceableOrNullBytes(Index)) + AS.getDereferenceableBytes())); + if (AS.getDereferenceableOrNullBytes()) R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, - AH.getDereferenceableOrNullBytes(Index))); + AS.getDereferenceableOrNullBytes())); for (auto Attr : ParamAttrsToStrip) - if (AH.getAttributes().hasAttribute(Index, Attr)) + if (AS.hasAttribute(Attr)) R.addAttribute(Attr); if (!R.empty()) - AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R)); + AH.setAttributes(AH.getAttributes().removeAttributesAtIndex(Ctx, Index, R)); } static void stripNonValidAttributesFromPrototype(Function &F) { @@ -3016,7 +3016,7 @@ static SetVector<Value *> computeKillSet(BasicBlock *BB) { #ifndef NDEBUG /// Check that the items in 'Live' dominate 'TI'. This is used as a basic -/// sanity check for the liveness computation. +/// validation check for the liveness computation. static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live, Instruction *TI, bool TermOkay = false) { for (Value *V : Live) { @@ -3103,7 +3103,7 @@ static void computeLiveInValues(DominatorTree &DT, Function &F, } // while (!Worklist.empty()) #ifndef NDEBUG - // Sanity check our output against SSA properties. This helps catch any + // Verify our output against SSA properties. This helps catch any // missing kills during the above iteration. 
for (BasicBlock &BB : F) checkBasicSSA(DT, Data, BB); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp index b09f896d0157..28e00c873361 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -490,17 +490,17 @@ bool llvm::runIPSCCP( AttrBuilder AttributesToRemove; AttributesToRemove.addAttribute(Attribute::ArgMemOnly); AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); - F.removeAttributes(AttributeList::FunctionIndex, AttributesToRemove); + F.removeFnAttrs(AttributesToRemove); for (User *U : F.users()) { auto *CB = dyn_cast<CallBase>(U); if (!CB || CB->getCalledFunction() != &F) continue; - CB->removeAttributes(AttributeList::FunctionIndex, - AttributesToRemove); + CB->removeFnAttrs(AttributesToRemove); } } + MadeChanges |= ReplacedPointerArg; } SmallPtrSet<Value *, 32> InsertedValues; @@ -540,14 +540,13 @@ bool llvm::runIPSCCP( DTU.deleteBB(DeadBB); for (BasicBlock &BB : F) { - for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { - Instruction *Inst = &*BI++; - if (Solver.getPredicateInfoFor(Inst)) { - if (auto *II = dyn_cast<IntrinsicInst>(Inst)) { + for (Instruction &Inst : llvm::make_early_inc_range(BB)) { + if (Solver.getPredicateInfoFor(&Inst)) { + if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { if (II->getIntrinsicID() == Intrinsic::ssa_copy) { Value *Op = II->getOperand(0); - Inst->replaceAllUsesWith(Op); - Inst->eraseFromParent(); + Inst.replaceAllUsesWith(Op); + Inst.eraseFromParent(); } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp index fe160d5415bd..31c8999c3724 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp @@ -122,7 +122,7 @@ namespace { class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter { std::string Prefix; - const Twine getNameWithPrefix(const Twine &Name) const { + Twine getNameWithPrefix(const Twine &Name) const { return Name.isTriviallyEmpty() ? Name : Prefix + Name; } @@ -1275,8 +1275,7 @@ static void speculatePHINodeLoads(PHINode &PN) { // Get the AA tags and alignment to use from one of the loads. It does not // matter which one we get and if any differ. - AAMDNodes AATags; - SomeLoad->getAAMetadata(AATags); + AAMDNodes AATags = SomeLoad->getAAMetadata(); Align Alignment = SomeLoad->getAlign(); // Rewrite all loads of the PN to use the new PHI. @@ -1330,14 +1329,21 @@ static void speculatePHINodeLoads(PHINode &PN) { /// %V = select i1 %cond, i32 %V1, i32 %V2 /// /// We can do this to a select if its only uses are loads and if the operand -/// to the select can be loaded unconditionally. +/// to the select can be loaded unconditionally. If found an intervening bitcast +/// with a single use of the load, allow the promotion. 
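At the source level, the speculation described in the comment above corresponds to the rewrite sketched below; the two functions are illustrative and assume both pointers are known to be unconditionally loadable, which is what the safety check that follows verifies for the select's operands.

// Illustration only: a simple load through a select of pointers ...
int loadThroughSelect(bool Cond, int *A, int *B) {
  int *P = Cond ? A : B; // select of the two addresses
  return *P;             // one conditional load
}

// ... can be speculated into two unconditional loads plus a select of the
// loaded values, provided both A and B can be loaded unconditionally.
int speculatedLoads(bool Cond, int *A, int *B) {
  int VA = *A;           // executes even when Cond is false
  int VB = *B;           // executes even when Cond is true
  return Cond ? VA : VB; // select of values instead of addresses
}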
static bool isSafeSelectToSpeculate(SelectInst &SI) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); const DataLayout &DL = SI.getModule()->getDataLayout(); for (User *U : SI.users()) { - LoadInst *LI = dyn_cast<LoadInst>(U); + LoadInst *LI; + BitCastInst *BC = dyn_cast<BitCastInst>(U); + if (BC && BC->hasOneUse()) + LI = dyn_cast<LoadInst>(*BC->user_begin()); + else + LI = dyn_cast<LoadInst>(U); + if (!LI || !LI->isSimple()) return false; @@ -1363,13 +1369,27 @@ static void speculateSelectInstLoads(SelectInst &SI) { Value *FV = SI.getFalseValue(); // Replace the loads of the select with a select of two loads. while (!SI.use_empty()) { - LoadInst *LI = cast<LoadInst>(SI.user_back()); + LoadInst *LI; + BitCastInst *BC = dyn_cast<BitCastInst>(SI.user_back()); + if (BC) { + assert(BC->hasOneUse() && "Bitcast should have a single use."); + LI = cast<LoadInst>(BC->user_back()); + } else { + LI = cast<LoadInst>(SI.user_back()); + } + assert(LI->isSimple() && "We only speculate simple loads"); IRB.SetInsertPoint(LI); - LoadInst *TL = IRB.CreateLoad(LI->getType(), TV, + Value *NewTV = + BC ? IRB.CreateBitCast(TV, BC->getType(), TV->getName() + ".sroa.cast") + : TV; + Value *NewFV = + BC ? IRB.CreateBitCast(FV, BC->getType(), FV->getName() + ".sroa.cast") + : FV; + LoadInst *TL = IRB.CreateLoad(LI->getType(), NewTV, LI->getName() + ".sroa.speculate.load.true"); - LoadInst *FL = IRB.CreateLoad(LI->getType(), FV, + LoadInst *FL = IRB.CreateLoad(LI->getType(), NewFV, LI->getName() + ".sroa.speculate.load.false"); NumLoadsSpeculated += 2; @@ -1377,8 +1397,7 @@ static void speculateSelectInstLoads(SelectInst &SI) { TL->setAlignment(LI->getAlign()); FL->setAlignment(LI->getAlign()); - AAMDNodes Tags; - LI->getAAMetadata(Tags); + AAMDNodes Tags = LI->getAAMetadata(); if (Tags) { TL->setAAMetadata(Tags); FL->setAAMetadata(Tags); @@ -1390,6 +1409,8 @@ static void speculateSelectInstLoads(SelectInst &SI) { LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n"); LI->replaceAllUsesWith(V); LI->eraseFromParent(); + if (BC) + BC->eraseFromParent(); } SI.eraseFromParent(); } @@ -1462,76 +1483,6 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, return buildGEP(IRB, BasePtr, Indices, NamePrefix); } -/// Recursively compute indices for a natural GEP. -/// -/// This is the recursive step for getNaturalGEPWithOffset that walks down the -/// element types adding appropriate indices for the GEP. -static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, - Value *Ptr, Type *Ty, APInt &Offset, - Type *TargetTy, - SmallVectorImpl<Value *> &Indices, - const Twine &NamePrefix) { - if (Offset == 0) - return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, - NamePrefix); - - // We can't recurse through pointer types. - if (Ty->isPointerTy()) - return nullptr; - - // We try to analyze GEPs over vectors here, but note that these GEPs are - // extremely poorly defined currently. The long-term goal is to remove GEPing - // over a vector from the IR completely. - if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { - unsigned ElementSizeInBits = - DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize(); - if (ElementSizeInBits % 8 != 0) { - // GEPs over non-multiple of 8 size vector elements are invalid. 
- return nullptr; - } - APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); - APInt NumSkippedElements = Offset.sdiv(ElementSize); - if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements())) - return nullptr; - Offset -= NumSkippedElements * ElementSize; - Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), - Offset, TargetTy, Indices, NamePrefix); - } - - if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { - Type *ElementTy = ArrTy->getElementType(); - APInt ElementSize(Offset.getBitWidth(), - DL.getTypeAllocSize(ElementTy).getFixedSize()); - APInt NumSkippedElements = Offset.sdiv(ElementSize); - if (NumSkippedElements.ugt(ArrTy->getNumElements())) - return nullptr; - - Offset -= NumSkippedElements * ElementSize; - Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices, NamePrefix); - } - - StructType *STy = dyn_cast<StructType>(Ty); - if (!STy) - return nullptr; - - const StructLayout *SL = DL.getStructLayout(STy); - uint64_t StructOffset = Offset.getZExtValue(); - if (StructOffset >= SL->getSizeInBytes()) - return nullptr; - unsigned Index = SL->getElementContainingOffset(StructOffset); - Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); - Type *ElementTy = STy->getElementType(Index); - if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize())) - return nullptr; // The offset points into alignment padding. - - Indices.push_back(IRB.getInt32(Index)); - return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices, NamePrefix); -} - /// Get a natural GEP from a base pointer to a particular offset and /// resulting in a particular type. /// @@ -1556,18 +1507,15 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) return nullptr; // We can't GEP through an unsized element. - if (isa<ScalableVectorType>(ElementTy)) + + SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(ElementTy, Offset); + if (Offset != 0) return nullptr; - APInt ElementSize(Offset.getBitWidth(), - DL.getTypeAllocSize(ElementTy).getFixedSize()); - if (ElementSize == 0) - return nullptr; // Zero-length arrays can't help us build a natural GEP. - APInt NumSkippedElements = Offset.sdiv(ElementSize); - - Offset -= NumSkippedElements * ElementSize; - Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices, NamePrefix); + + for (const APInt &Index : IntIndices) + Indices.push_back(IRB.getInt(Index)); + return getNaturalGEPWithType(IRB, DL, Ptr, ElementTy, TargetTy, Indices, + NamePrefix); } /// Compute an adjusted pointer from Ptr by Offset bytes where the @@ -1588,6 +1536,15 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *PointerTy, const Twine &NamePrefix) { + // Create i8 GEP for opaque pointers. + if (Ptr->getType()->isOpaquePointerTy()) { + if (Offset != 0) + Ptr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(Offset), + NamePrefix + "sroa_idx"); + return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy, + NamePrefix + "sroa_cast"); + } + // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. 
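The hand-rolled recursion removed above is replaced by DataLayout::getGEPIndicesForOffset, which converts a byte offset into structured GEP indices and leaves any unrepresentable remainder in Offset. A small usage sketch, where buildByteOffsetGEP is an invented name and not part of the pass:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Invented helper for illustration: try to address `Offset` bytes into the
// object `Ptr` points at (with pointee type `PointeeTy`) using a natural GEP.
// Returns nullptr when the offset does not land on an element/field boundary.
static Value *buildByteOffsetGEP(IRBuilder<> &IRB, const DataLayout &DL,
                                 Value *Ptr, Type *PointeeTy, APInt Offset) {
  Type *ReachedTy = PointeeTy;
  // getGEPIndicesForOffset updates ReachedTy to the type finally reached and
  // reduces Offset to whatever part could not be expressed as indices.
  SmallVector<APInt> Indices = DL.getGEPIndicesForOffset(ReachedTy, Offset);
  if (Offset != 0)
    return nullptr; // offset points into padding or mid-element

  SmallVector<Value *> IdxValues;
  for (const APInt &Idx : Indices)
    IdxValues.push_back(IRB.getInt(Idx));
  // The first returned index already covers whole-object strides, so the GEP
  // is built directly from the returned index list.
  return IRB.CreateInBoundsGEP(PointeeTy, Ptr, IdxValues);
}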
SmallPtrSet<Value *, 4> Visited; @@ -1851,13 +1808,13 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; - } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { - // Disable vector promotion when there are loads or stores of an FCA. - return false; } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { if (LI->isVolatile()) return false; Type *LTy = LI->getType(); + // Disable vector promotion when there are loads or stores of an FCA. + if (LTy->isStructTy()) + return false; if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(LTy->isIntegerTy()); LTy = SplitIntTy; @@ -1868,6 +1825,9 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, if (SI->isVolatile()) return false; Type *STy = SI->getValueOperand()->getType(); + // Disable vector promotion when there are loads or stores of an FCA. + if (STy->isStructTy()) + return false; if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(STy->isIntegerTy()); STy = SplitIntTy; @@ -2282,7 +2242,7 @@ class llvm::sroa::AllocaSliceRewriter const DataLayout &DL; AllocaSlices &AS; - SROA &Pass; + SROAPass &Pass; AllocaInst &OldAI, &NewAI; const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; Type *NewAllocaTy; @@ -2330,7 +2290,7 @@ class llvm::sroa::AllocaSliceRewriter IRBuilderTy IRB; public: - AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass, + AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROAPass &Pass, AllocaInst &OldAI, AllocaInst &NewAI, uint64_t NewAllocaBeginOffset, uint64_t NewAllocaEndOffset, bool IsIntegerPromotable, @@ -2510,8 +2470,7 @@ private: Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); - AAMDNodes AATags; - LI.getAAMetadata(AATags); + AAMDNodes AATags = LI.getAAMetadata(); unsigned AS = LI.getPointerAddressSpace(); @@ -2675,9 +2634,7 @@ private: Value *OldOp = SI.getOperand(1); assert(OldOp == OldPtr); - AAMDNodes AATags; - SI.getAAMetadata(AATags); - + AAMDNodes AATags = SI.getAAMetadata(); Value *V = SI.getValueOperand(); // Strip all inbounds GEPs and pointer casts to try to dig out any root @@ -2743,7 +2700,9 @@ private: deleteIfTriviallyDead(OldOp); LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n"); - return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); + return NewSI->getPointerOperand() == &NewAI && + NewSI->getValueOperand()->getType() == NewAllocaTy && + !SI.isVolatile(); } /// Compute an integer value from splatting an i8 across the given @@ -2784,8 +2743,7 @@ private: LLVM_DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getRawDest() == OldPtr); - AAMDNodes AATags; - II.getAAMetadata(AATags); + AAMDNodes AATags = II.getAAMetadata(); // If the memset has a variable size, it cannot be split, just adjust the // pointer to the new alloca. @@ -2913,8 +2871,7 @@ private: LLVM_DEBUG(dbgs() << " original: " << II << "\n"); - AAMDNodes AATags; - II.getAAMetadata(AATags); + AAMDNodes AATags = II.getAAMetadata(); bool IsDest = &II.getRawDestUse() == OldUse; assert((IsDest && II.getRawDest() == OldPtr) || @@ -3421,9 +3378,7 @@ private: // We have an aggregate being loaded, split it apart. 
LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); - AAMDNodes AATags; - LI.getAAMetadata(AATags); - LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags, + LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(), getAdjustedAlignment(&LI, 0), DL); Value *V = UndefValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); @@ -3474,9 +3429,7 @@ private: // We have an aggregate being stored, split it apart. LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); - AAMDNodes AATags; - SI.getAAMetadata(AATags); - StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags, + StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), getAdjustedAlignment(&SI, 0), DL); Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); Visited.erase(&SI); @@ -3802,7 +3755,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, /// there all along. /// /// \returns true if any changes are made. -bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { +bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n"); // Track the loads and stores which are candidates for pre-splitting here, in @@ -4282,8 +4235,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. -AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - Partition &P) { +AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS, + Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. @@ -4434,7 +4387,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, /// Walks the slices of an alloca and form partitions based on them, /// rewriting each of their uses. -bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { +bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { if (AS.begin() == AS.end()) return false; @@ -4605,7 +4558,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { } /// Clobber a use with undef, deleting the used value if it becomes dead. -void SROA::clobberUse(Use &U) { +void SROAPass::clobberUse(Use &U) { Value *OldV = U; // Replace the use with an undef value. U = UndefValue::get(OldV->getType()); @@ -4624,7 +4577,7 @@ void SROA::clobberUse(Use &U) { /// This analyzes the alloca to ensure we can reason about it, builds /// the slices of the alloca, and then hands it off to be split and /// rewritten as needed. -bool SROA::runOnAlloca(AllocaInst &AI) { +bool SROAPass::runOnAlloca(AllocaInst &AI) { LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); ++NumAllocasAnalyzed; @@ -4698,7 +4651,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) { /// /// We also record the alloca instructions deleted here so that they aren't /// subsequently handed to mem2reg to promote. -bool SROA::deleteDeadInstructions( +bool SROAPass::deleteDeadInstructions( SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { bool Changed = false; while (!DeadInsts.empty()) { @@ -4737,7 +4690,7 @@ bool SROA::deleteDeadInstructions( /// This attempts to promote whatever allocas have been identified as viable in /// the PromotableAllocas list. If that list is empty, there is nothing to do. 
/// This function returns whether any promotion occurred. -bool SROA::promoteAllocas(Function &F) { +bool SROAPass::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; @@ -4749,8 +4702,8 @@ bool SROA::promoteAllocas(Function &F) { return true; } -PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, - AssumptionCache &RunAC) { +PreservedAnalyses SROAPass::runImpl(Function &F, DominatorTree &RunDT, + AssumptionCache &RunAC) { LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); DT = &RunDT; @@ -4804,7 +4757,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, return PA; } -PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) { +PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) { return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F), AM.getResult<AssumptionAnalysis>(F)); } @@ -4815,7 +4768,7 @@ PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) { /// SROA pass. class llvm::sroa::SROALegacyPass : public FunctionPass { /// The SROA implementation. - SROA Impl; + SROAPass Impl; public: static char ID; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index ca288a533f46..1284bae820a4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -873,13 +873,11 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI, auto &DL = F.getParent()->getDataLayout(); while (MadeChange) { MadeChange = false; - for (Function::iterator I = F.begin(); I != F.end();) { - BasicBlock *BB = &*I++; + for (BasicBlock &BB : llvm::make_early_inc_range(F)) { bool ModifiedDTOnIteration = false; - MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration, TTI, DL, + MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration, TTI, DL, DTU.hasValue() ? DTU.getPointer() : nullptr); - // Restart BB iteration if the dominator tree of the Function was changed if (ModifiedDTOnIteration) break; @@ -933,7 +931,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, if (II) { // The scalarization code below does not work for scalable vectors. if (isa<ScalableVectorType>(II->getType()) || - any_of(II->arg_operands(), + any_of(II->args(), [](Value *V) { return isa<ScalableVectorType>(V->getType()); })) return false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 8ef6b69673be..6b7419abe1d1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -66,6 +66,15 @@ static cl::opt<bool> namespace { +BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) { + BasicBlock *BB = Itr->getParent(); + if (isa<PHINode>(Itr)) + Itr = BB->getFirstInsertionPt(); + if (Itr != BB->end()) + Itr = skipDebugIntrinsics(Itr); + return Itr; +} + // Used to store the scattered form of a vector. using ValueVector = SmallVector<Value *, 8>; @@ -371,10 +380,11 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { return Scatterer(Point->getParent(), Point->getIterator(), UndefValue::get(V->getType())); // Put the scattered form of an instruction directly after the - // instruction. + // instruction, skipping over PHI nodes and debug intrinsics. 
BasicBlock *BB = VOp->getParent(); - return Scatterer(BB, std::next(BasicBlock::iterator(VOp)), - V, &Scattered[V]); + return Scatterer( + BB, skipPastPhiNodesAndDbg(std::next(BasicBlock::iterator(VOp))), V, + &Scattered[V]); } // In the fallback case, just put the scattered before Point and // keep the result local to Point. @@ -530,7 +540,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { return false; unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements(); - unsigned NumArgs = CI.getNumArgOperands(); + unsigned NumArgs = CI.arg_size(); ValueVector ScalarOperands(NumArgs); SmallVector<Scatterer, 8> Scattered(NumArgs); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index f216956406b6..ffa2f9adb978 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1164,8 +1164,11 @@ bool SeparateConstOffsetFromGEP::run(Function &F) { DL = &F.getParent()->getDataLayout(); bool Changed = false; for (BasicBlock &B : F) { - for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;) - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) + if (!DT->isReachableFromEntry(&B)) + continue; + + for (Instruction &I : llvm::make_early_inc_range(B)) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) Changed |= splitGEP(GEP); // No need to split GEP ConstantExprs because all its indices are constant // already. @@ -1258,10 +1261,8 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { DominatingSubs.clear(); for (const auto Node : depth_first(DT)) { BasicBlock *BB = Node->getBlock(); - for (auto I = BB->begin(); I != BB->end(); ) { - Instruction *Cur = &*I++; - Changed |= reuniteExts(Cur); - } + for (Instruction &I : llvm::make_early_inc_range(*BB)) + Changed |= reuniteExts(&I); } return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index b1c105258027..a27da047bfd3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -49,7 +50,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -81,6 +81,7 @@ static cl::opt<bool> EnableNonTrivialUnswitch( static cl::opt<int> UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, + cl::ZeroOrMore, cl::desc("The cost threshold for unswitching a loop.")); static cl::opt<bool> EnableUnswitchCostMultiplier( @@ -108,6 +109,10 @@ static cl::opt<unsigned> cl::desc("Max number of memory uses to explore during " "partial unswitching analysis"), cl::init(100), cl::Hidden); +static cl::opt<bool> FreezeLoopUnswitchCond( + "freeze-loop-unswitch-cond", cl::init(false), cl::Hidden, + cl::desc("If enabled, the freeze instruction 
will be added to condition " + "of loop unswitch to prevent miscompilation.")); /// Collect all of the loop invariant input values transitively used by the /// homogeneous instruction graph from a given root. @@ -195,15 +200,15 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, /// Copy a set of loop invariant values \p ToDuplicate and insert them at the /// end of \p BB and conditionally branch on the copied condition. We only /// branch on a single value. -static void buildPartialUnswitchConditionalBranch(BasicBlock &BB, - ArrayRef<Value *> Invariants, - bool Direction, - BasicBlock &UnswitchedSucc, - BasicBlock &NormalSucc) { +static void buildPartialUnswitchConditionalBranch( + BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, + BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) { IRBuilder<> IRB(&BB); Value *Cond = Direction ? IRB.CreateOr(Invariants) : IRB.CreateAnd(Invariants); + if (InsertFreeze) + Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr"); IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? &NormalSucc : &UnswitchedSucc); } @@ -564,7 +569,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the" " condition!"); buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection, - *UnswitchedBB, *NewPH); + *UnswitchedBB, *NewPH, false); } // Update the dominator tree with the added edge. @@ -2123,6 +2128,13 @@ static void unswitchNontrivialInvariants( SE->forgetTopmostLoop(&L); } + bool InsertFreeze = false; + if (FreezeLoopUnswitchCond) { + ICFLoopSafetyInfo SafetyInfo; + SafetyInfo.computeLoopSafetyInfo(&L); + InsertFreeze = !SafetyInfo.isGuaranteedToExecute(TI, &DT, &L); + } + // If the edge from this terminator to a successor dominates that successor, // store a map from each block in its dominator subtree to it. This lets us // tell when cloning for a particular successor if a block is dominated by @@ -2197,6 +2209,11 @@ static void unswitchNontrivialInvariants( BasicBlock *ClonedPH = ClonedPHs.begin()->second; BI->setSuccessor(ClonedSucc, ClonedPH); BI->setSuccessor(1 - ClonedSucc, LoopPH); + if (InsertFreeze) { + auto Cond = BI->getCondition(); + if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, BI, &DT)) + BI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", BI)); + } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); } else { assert(SI && "Must either be a branch or switch!"); @@ -2211,6 +2228,11 @@ static void unswitchNontrivialInvariants( else Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); + if (InsertFreeze) { + auto Cond = SI->getCondition(); + if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, SI, &DT)) + SI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", SI)); + } // We need to use the set to populate domtree updates as even when there // are multiple cases pointing at the same successor we only want to // remove and insert one edge in the domtree. @@ -2291,7 +2313,7 @@ static void unswitchNontrivialInvariants( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); else buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, - *ClonedPH, *LoopPH); + *ClonedPH, *LoopPH, InsertFreeze); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); if (MSSAU) { @@ -2370,7 +2392,9 @@ static void unswitchNontrivialInvariants( ConstantInt *ContinueReplacement = Direction ? 
ConstantInt::getFalse(BI->getContext()) : ConstantInt::getTrue(BI->getContext()); - for (Value *Invariant : Invariants) + for (Value *Invariant : Invariants) { + assert(!isa<Constant>(Invariant) && + "Should not be replacing constant values!"); // Use make_early_inc_range here as set invalidates the iterator. for (Use &U : llvm::make_early_inc_range(Invariant->uses())) { Instruction *UserI = dyn_cast<Instruction>(U.getUser()); @@ -2385,6 +2409,7 @@ static void unswitchNontrivialInvariants( DT.dominates(ClonedPH, UserI->getParent())) U.set(UnswitchedReplacement); } + } } // We can change which blocks are exit blocks of all the cloned sibling @@ -2727,6 +2752,9 @@ static bool unswitchBestCondition( Cond = CondNext; BI->setCondition(Cond); + if (isa<Constant>(Cond)) + continue; + if (L.isLoopInvariant(BI->getCondition())) { UnswitchCandidates.push_back({BI, {BI->getCondition()}}); continue; @@ -3121,6 +3149,17 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, return PA; } +void SimpleLoopUnswitchPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + + OS << "<"; + OS << (NonTrivial ? "" : "no-") << "nontrivial;"; + OS << (Trivial ? "" : "no-") << "trivial"; + OS << ">"; +} + namespace { class SimpleLoopUnswitchLegacyPass : public LoopPass { @@ -3140,10 +3179,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - if (EnableMSSALoopDependency) { - AU.addRequired<MemorySSAWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); getLoopAnalysisUsage(AU); } }; @@ -3164,12 +3201,8 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - MemorySSA *MSSA = nullptr; - Optional<MemorySSAUpdater> MSSAU; - if (EnableMSSALoopDependency) { - MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); - } + MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); + MemorySSAUpdater MSSAU(MSSA); auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; @@ -3197,15 +3230,13 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { LPM.markLoopAsDeleted(L); }; - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); - bool Changed = - unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, UnswitchCB, SE, - MSSAU.hasValue() ? 
@@ -3197,15 +3230,13 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
     LPM.markLoopAsDeleted(L);
   };
 
-  if (MSSA && VerifyMemorySSA)
+  if (VerifyMemorySSA)
     MSSA->verifyMemorySSA();
 
-  bool Changed =
-      unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, UnswitchCB, SE,
-                   MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
-                   DestroyLoopCB);
+  bool Changed = unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial,
+                              UnswitchCB, SE, &MSSAU, DestroyLoopCB);
 
-  if (MSSA && VerifyMemorySSA)
+  if (VerifyMemorySSA)
     MSSA->verifyMemorySSA();
 
   // Historically this pass has had issues with the dominator tree so verify it
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 09d59b0e884a..86d3620c312e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -224,7 +224,11 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
   SmallVector<WeakVH, 16> LoopHeaders(UniqueLoopHeaders.begin(),
                                       UniqueLoopHeaders.end());
 
+  unsigned IterCnt = 0;
+  (void)IterCnt;
   while (LocalChange) {
+    assert(IterCnt++ < 1000 &&
+           "Sanity: iterative simplification didn't converge!");
     LocalChange = false;
 
     // Loop over all of the basic blocks and remove them if they are unneeded.
@@ -319,6 +323,21 @@ SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts)
   applyCommandLineOverridesToOptions(Options);
 }
 
+void SimplifyCFGPass::printPipeline(
+    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  static_cast<PassInfoMixin<SimplifyCFGPass> *>(this)->printPipeline(
+      OS, MapClassName2PassName);
+  OS << "<";
+  OS << "bonus-inst-threshold=" << Options.BonusInstThreshold << ";";
+  OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;";
+  OS << (Options.ConvertSwitchToLookupTable ? "" : "no-")
+     << "switch-to-lookup;";
+  OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
+  OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;";
+  OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts";
+  OS << ">";
+}
+
 PreservedAnalyses SimplifyCFGPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index dfa30418ea01..06169a7834f6 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -268,7 +268,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
     if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) {
       return all_of(DVI->location_ops(), [&NotHoisted](Value *V) {
         if (const auto *I = dyn_cast_or_null<Instruction>(V)) {
-          if (NotHoisted.count(I) == 0)
+          if (!NotHoisted.contains(I))
             return true;
         }
         return false;
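The StraightLineStrengthReduce hunks below switch to the shorter APInt predicate spellings. A small self-contained illustration of how the old and new names line up, assuming a current llvm/ADT/APInt.h (this snippet is an aside, not part of the diff):

// Aside, not part of the diff: the renamed APInt predicates used below.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  assert(APInt::getAllOnes(8).isAllOnes()); // formerly isAllOnesValue()
  APInt MinusFour(8, -4, /*isSigned=*/true);
  assert(MinusFour.isNegatedPowerOf2());    // formerly (-IndexOffset).isPowerOf2()
  return 0;
}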
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 20b8b982e14b..b47378808216 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -607,7 +607,7 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
   if (IndexOffset == 1)
     return C.Stride;
   // Common case 2: if (i' - i) is -1, Bump = -S.
-  if (IndexOffset.isAllOnesValue())
+  if (IndexOffset.isAllOnes())
     return Builder.CreateNeg(C.Stride);
 
   // Otherwise, Bump = (i' - i) * sext/trunc(S).  Note that (i' - i) and S may
@@ -620,7 +620,7 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
     ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
     return Builder.CreateShl(ExtendedStride, Exponent);
   }
-  if ((-IndexOffset).isPowerOf2()) {
+  if (IndexOffset.isNegatedPowerOf2()) {
     // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i' - i).
     ConstantInt *Exponent =
         ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 846a9321f53e..3bcf92e28a21 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -262,7 +262,7 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
       // Note that this runs whether we know an alloca has escaped or not. If
       // it has, then we can't trust Tracker.AllocaUsers to be accurate.
       bool SafeToTail = true;
-      for (auto &Arg : CI->arg_operands()) {
+      for (auto &Arg : CI->args()) {
        if (isa<Constant>(Arg.getUser()))
          continue;
        if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
@@ -584,8 +584,8 @@ void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
 // call instruction into the newly created temporarily variable.
 void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
                                                               int OpndIdx) {
-  PointerType *ArgTy = cast<PointerType>(CI->getArgOperand(OpndIdx)->getType());
-  Type *AggTy = ArgTy->getElementType();
+  Type *AggTy = CI->getParamByValType(OpndIdx);
+  assert(AggTy);
   const DataLayout &DL = F.getParent()->getDataLayout();
 
   // Get alignment of byVal operand.
@@ -611,8 +611,8 @@ void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
 // into the corresponding function argument location.
 void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments(
     CallInst *CI, int OpndIdx) {
-  PointerType *ArgTy = cast<PointerType>(CI->getArgOperand(OpndIdx)->getType());
-  Type *AggTy = ArgTy->getElementType();
+  Type *AggTy = CI->getParamByValType(OpndIdx);
+  assert(AggTy);
   const DataLayout &DL = F.getParent()->getDataLayout();
 
   // Get alignment of byVal operand.
@@ -667,7 +667,7 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
     createTailRecurseLoopHeader(CI);
 
   // Copy values of ByVal operands into local temporarily variables.
-  for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
+  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
     if (CI->isByValArgument(I))
       copyByValueOperandIntoLocalTemp(CI, I);
   }
@@ -675,7 +675,7 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
   // Ok, now that we know we have a pseudo-entry block WITH all of the
   // required PHI nodes, add entries into the PHI node for the actual
   // parameters passed into the tail-recursive call.
-  for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
+  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
     if (CI->isByValArgument(I)) {
       copyLocalTempOfByValueOperandIntoArguments(CI, I);
       ArgumentPHIs[I]->addIncoming(F.getArg(I), BB);
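The TailRecursionElimination hunks above stop deriving the byval aggregate type from the pointer operand's pointee type and instead read it from the call site's byval attribute, which keeps working under opaque pointers. A hedged sketch of that query pattern; the helper name is illustrative, not from the patch:

// Sketch: query a byval argument's aggregate type (and size) from the call
// site's attributes instead of PointerType::getElementType().
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

static Type *byValAggregateType(const CallInst *CI, unsigned OpndIdx,
                                const DataLayout &DL, uint64_t &AllocSize) {
  assert(CI->isByValArgument(OpndIdx) && "expected a byval argument");
  Type *AggTy = CI->getParamByValType(OpndIdx); // type carried by the attribute
  AllocSize = DL.getTypeAllocSize(AggTy);       // bytes the call must copy
  return AggTy;
}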