| author    | Dimitry Andric <dim@FreeBSD.org>                              | 2024-02-07 14:37:28 +0000 |
|-----------|---------------------------------------------------------------|---------------------------|
| committer | Dimitry Andric <dim@FreeBSD.org>                              | 2024-02-07 14:37:28 +0000 |
| commit    | 4fdf604ba667503ae582304cebdd3df426778a6b (patch)              |                           |
| tree      | aae65a5d1b0a1b1acd9389fc36a0baf5fc1918c7 /llvm/lib/Transforms |                           |
| parent    | 2d835ae8657273e3aa8b9ef3201fb8df5563af9d (diff)               |                           |
Vendor import of llvm-project branch release/18.x llvmorg-18.1.0-rc2-0-gc6c86965d967.
(tag: vendor/llvm-project/llvmorg-18.1.0-rc2-0-gc6c86965d967)
Diffstat (limited to 'llvm/lib/Transforms'; per-file counts derived from the diff below)

```
 llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp  |  3 ++-
 llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp |  3 +++
 llvm/lib/Transforms/Scalar/ConstraintElimination.cpp            |  7 ++++++-
 llvm/lib/Transforms/Scalar/JumpThreading.cpp                    | 13 ++++++++-----
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                 | 62 +++++++++++++++++++++---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp                  | 12 +++++++-----
 6 files changed, 82 insertions(+), 18 deletions(-)
```
```diff
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index bb2a77daa60a..1254a050027a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1032,7 +1032,8 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
   // where there are several consecutive memory accesses to the same location,
   // separated by a few arithmetic operations.
   bool IsLoadCSE = false;
-  if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) {
+  BatchAAResults BatchAA(*AA);
+  if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
     if (IsLoadCSE)
       combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a8a5f9831e15..79873a9b4cbb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -802,6 +802,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         return InsertNewInstWith(LShr, I->getIterator());
       } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
         Known.One |= HighBits;
+        // SignBits may be out-of-sync with Known.countMinSignBits(). Mask out
+        // high bits of Known.Zero to avoid conflicts.
+        Known.Zero &= ~HighBits;
       }
     } else {
       computeKnownBits(I, Known, Depth, CxtI);
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 8f09569d0d9c..7b672e89b67a 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1061,11 +1061,16 @@ void State::addInfoFor(BasicBlock &BB) {
           FactOrCheck::getCheck(DT.getNode(&BB), cast<CallInst>(&I)));
       break;
     // Enqueue the intrinsics to add extra info.
-    case Intrinsic::abs:
     case Intrinsic::umin:
     case Intrinsic::umax:
     case Intrinsic::smin:
     case Intrinsic::smax:
+      // TODO: Check if it is possible to instead only added the min/max facts
+      // when simplifying uses of the min/max intrinsics.
+      if (!isGuaranteedNotToBePoison(&I))
+        break;
+      [[fallthrough]];
+    case Intrinsic::abs:
       WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I));
       break;
     }
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 8603c5cf9c02..87c01ead634f 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1260,8 +1260,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
   // the entry to its block.
   BasicBlock::iterator BBIt(LoadI);
   bool IsLoadCSE;
+  BatchAAResults BatchAA(*AA);
+  // The dominator tree is updated lazily and may not be valid at this point.
+  BatchAA.disableDominatorTree();
   if (Value *AvailableVal = FindAvailableLoadedValue(
-          LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+          LoadI, LoadBB, BBIt, DefMaxInstsToScan, &BatchAA, &IsLoadCSE)) {
     // If the value of the load is locally available within the block, just use
     // it. This frequently occurs for reg2mem'd allocas.
 
@@ -1322,9 +1325,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB),
                        LocationSize::precise(DL.getTypeStoreSize(AccessTy)),
                        AATags);
-    PredAvailable = findAvailablePtrLoadStore(Loc, AccessTy, LoadI->isAtomic(),
-                                              PredBB, BBIt, DefMaxInstsToScan,
-                                              AA, &IsLoadCSE, &NumScanedInst);
+    PredAvailable = findAvailablePtrLoadStore(
+        Loc, AccessTy, LoadI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+        &BatchAA, &IsLoadCSE, &NumScanedInst);
 
     // If PredBB has a single predecessor, continue scanning through the
     // single predecessor.
@@ -1336,7 +1339,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
       BBIt = SinglePredBB->end();
       PredAvailable = findAvailablePtrLoadStore(
           Loc, AccessTy, LoadI->isAtomic(), SinglePredBB, BBIt,
-          (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+          (DefMaxInstsToScan - NumScanedInst), &BatchAA, &IsLoadCSE,
           &NumScanedInst);
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ca93e15719f..dd596c567cd4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
   bool CostTooHigh = false;
   const bool AddBranchWeights;
 
+  Loop *OuterLoop = nullptr;
+
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                     TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ public:
       DT->eraseNode(SCEVCheckBlock);
       LI->removeBlock(SCEVCheckBlock);
     }
+
+    // Outer loop is used as part of the later cost calculations.
+    OuterLoop = L->getParentLoop();
   }
 
   InstructionCost getCost() {
@@ -2076,16 +2081,61 @@ public:
       LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
       RTCheckCost += C;
     }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
       }
 
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will likely be hoisted out and so the effective cost will
+      // reduce according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        // TODO: If profitable, we could refine this further by analysing every
+        // individual memory check, since there could be a mixture of loop
+        // variant and invariant checks that mean the final condition is
+        // variant.
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          // It seems reasonable to assume that we can reduce the effective
+          // cost of the checks even when we know nothing about the trip
+          // count. Assume that the outer loop executes at least twice.
+          unsigned BestTripCount = 2;
+
+          // If exact trip count is known use that.
+          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+            BestTripCount = SmallTC;
+          else if (LoopVectorizeWithBlockFrequency) {
+            // Else use profile data if available.
+            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+              BestTripCount = *EstimatedTC;
+          }
+
+          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+          // Let's ensure the cost is always at least 1.
+          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
+                                     (InstructionCost::CostType)1);
+
+          LLVM_DEBUG(dbgs()
+                     << "We expect runtime memory checks to be hoisted "
+                     << "out of the outer loop. Cost reduced from "
+                     << MemCheckCost << " to " << NewMemCheckCost << '\n');
+
+          MemCheckCost = NewMemCheckCost;
+        }
+      }
+
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -2144,8 +2194,8 @@ public:
     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
 
     // Create new preheader for vector loop.
-    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
-      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
+    if (OuterLoop)
+      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
 
     SCEVCheckBlock->getTerminator()->eraseFromParent();
     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
@@ -2179,8 +2229,8 @@ public:
     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
     MemCheckBlock->moveBefore(LoopVectorPreHeader);
 
-    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
-      PL->addBasicBlockToLoop(MemCheckBlock, *LI);
+    if (OuterLoop)
+      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
 
     BranchInst &BI =
         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bbeb5da2cfec..ae2fc522ba40 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -597,13 +597,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
     for (const auto &I : enumerate(operands())) {
       // Some intrinsics have a scalar argument - don't replace it with a
       // vector.
-      // Some vectorized function variants may also take a scalar argument,
-      // e.g. linear parameters for pointers.
       Value *Arg;
-      if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
-          (UseIntrinsic &&
-           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
+      if (UseIntrinsic &&
+          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
         Arg = State.get(I.value(), VPIteration(0, 0));
+      // Some vectorized function variants may also take a scalar argument,
+      // e.g. linear parameters for pointers. This needs to be the scalar value
+      // from the start of the respective part when interleaving.
+      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
+        Arg = State.get(I.value(), VPIteration(Part, 0));
       else
         Arg = State.get(I.value(), Part);
       if (UseIntrinsic &&
```
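The LoopVectorize hunk above amortizes the cost of outer-loop-invariant runtime memory checks over the outer loop's expected trip count (defaulting to 2, refined by an exact or profile-estimated count) and clamps the result to at least 1. A minimal standalone sketch of that divide-and-clamp arithmetic, using plain integers instead of LLVM's InstructionCost type; the helper name and the example values are hypothetical, for illustration only:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the accounting in GeneratedRTChecks::getCost():
// if the memory checks are invariant in the outer loop, they are expected to
// be hoisted, so their cost is divided by the best-known trip count and
// clamped so it never reaches zero.
static int64_t amortizedMemCheckCost(int64_t MemCheckCost,
                                     unsigned BestTripCount) {
  return std::max<int64_t>(MemCheckCost / BestTripCount, 1);
}

int main() {
  // With no trip-count information the patch assumes two outer iterations.
  assert(amortizedMemCheckCost(9, 2) == 4);
  // An exact or profile-estimated trip count reduces the cost further.
  assert(amortizedMemCheckCost(9, 4) == 2);
  // The clamp keeps the checks from ever being accounted as free.
  assert(amortizedMemCheckCost(3, 100) == 1);
  return 0;
}
```

This also mirrors why the patch divides `MemCheckCost` rather than zeroing it: even a hoisted check still executes at least once.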