about | summary | refs | log | tree | commit | diff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
author: Dimitry Andric <dim@FreeBSD.org> 2024-02-07 14:37:28 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2024-02-07 14:37:28 +0000
commit: 4fdf604ba667503ae582304cebdd3df426778a6b (patch)
tree: aae65a5d1b0a1b1acd9389fc36a0baf5fc1918c7 /llvm/lib/Transforms
parent: 2d835ae8657273e3aa8b9ef3201fb8df5563af9d (diff)
Vendor import of llvm-project branch release/18.x llvmorg-18.1.0-rc2-0-gc6c86965d967.
tag: vendor/llvm-project/llvmorg-18.1.0-rc2-0-gc6c86965d967
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp 3
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp 3
-rw-r--r-- llvm/lib/Transforms/Scalar/ConstraintElimination.cpp 7
-rw-r--r-- llvm/lib/Transforms/Scalar/JumpThreading.cpp 13
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 62
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp 12
6 files changed, 82 insertions, 18 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index bb2a77daa60a..1254a050027a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1032,7 +1032,8 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
// where there are several consecutive memory accesses to the same location,
// separated by a few arithmetic operations.
bool IsLoadCSE = false;
- if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) {
+ BatchAAResults BatchAA(*AA);
+ if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
if (IsLoadCSE)
combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a8a5f9831e15..79873a9b4cbb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -802,6 +802,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return InsertNewInstWith(LShr, I->getIterator());
} else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
Known.One |= HighBits;
+ // SignBits may be out-of-sync with Known.countMinSignBits(). Mask out
+ // high bits of Known.Zero to avoid conflicts.
+ Known.Zero &= ~HighBits;
}
} else {
computeKnownBits(I, Known, Depth, CxtI);
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 8f09569d0d9c..7b672e89b67a 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1061,11 +1061,16 @@ void State::addInfoFor(BasicBlock &BB) {
FactOrCheck::getCheck(DT.getNode(&BB), cast<CallInst>(&I)));
break;
// Enqueue the intrinsics to add extra info.
- case Intrinsic::abs:
case Intrinsic::umin:
case Intrinsic::umax:
case Intrinsic::smin:
case Intrinsic::smax:
+ // TODO: Check if it is possible to instead only add the min/max facts
+ // when simplifying uses of the min/max intrinsics.
+ if (!isGuaranteedNotToBePoison(&I))
+ break;
+ [[fallthrough]];
+ case Intrinsic::abs:
WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I));
break;
}
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 8603c5cf9c02..87c01ead634f 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1260,8 +1260,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// the entry to its block.
BasicBlock::iterator BBIt(LoadI);
bool IsLoadCSE;
+ BatchAAResults BatchAA(*AA);
+ // The dominator tree is updated lazily and may not be valid at this point.
+ BatchAA.disableDominatorTree();
if (Value *AvailableVal = FindAvailableLoadedValue(
- LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ LoadI, LoadBB, BBIt, DefMaxInstsToScan, &BatchAA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
@@ -1322,9 +1325,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB),
LocationSize::precise(DL.getTypeStoreSize(AccessTy)),
AATags);
- PredAvailable = findAvailablePtrLoadStore(Loc, AccessTy, LoadI->isAtomic(),
- PredBB, BBIt, DefMaxInstsToScan,
- AA, &IsLoadCSE, &NumScanedInst);
+ PredAvailable = findAvailablePtrLoadStore(
+ Loc, AccessTy, LoadI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+ &BatchAA, &IsLoadCSE, &NumScanedInst);
// If PredBB has a single predecessor, continue scanning through the
// single predecessor.
@@ -1336,7 +1339,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
BBIt = SinglePredBB->end();
PredAvailable = findAvailablePtrLoadStore(
Loc, AccessTy, LoadI->isAtomic(), SinglePredBB, BBIt,
- (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ (DefMaxInstsToScan - NumScanedInst), &BatchAA, &IsLoadCSE,
&NumScanedInst);
}
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ca93e15719f..dd596c567cd4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
bool CostTooHigh = false;
const bool AddBranchWeights;
+ Loop *OuterLoop = nullptr;
+
public:
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ public:
DT->eraseNode(SCEVCheckBlock);
LI->removeBlock(SCEVCheckBlock);
}
+
+ // Outer loop is used as part of the later cost calculations.
+ OuterLoop = L->getParentLoop();
}
InstructionCost getCost() {
@@ -2076,16 +2081,61 @@ public:
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
- if (MemCheckBlock)
+ if (MemCheckBlock) {
+ InstructionCost MemCheckCost = 0;
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
InstructionCost C =
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
- RTCheckCost += C;
+ MemCheckCost += C;
}
+ // If the runtime memory checks are being created inside an outer loop
+ // we should find out if these checks are outer loop invariant. If so,
+ // the checks will likely be hoisted out and so the effective cost will
+ // reduce according to the outer loop trip count.
+ if (OuterLoop) {
+ ScalarEvolution *SE = MemCheckExp.getSE();
+ // TODO: If profitable, we could refine this further by analysing every
+ // individual memory check, since there could be a mixture of loop
+ // variant and invariant checks that mean the final condition is
+ // variant.
+ const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+ if (SE->isLoopInvariant(Cond, OuterLoop)) {
+ // It seems reasonable to assume that we can reduce the effective
+ // cost of the checks even when we know nothing about the trip
+ // count. Assume that the outer loop executes at least twice.
+ unsigned BestTripCount = 2;
+
+ // If exact trip count is known use that.
+ if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+ BestTripCount = SmallTC;
+ else if (LoopVectorizeWithBlockFrequency) {
+ // Else use profile data if available.
+ if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+ BestTripCount = *EstimatedTC;
+ }
+
+ InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+ // Let's ensure the cost is always at least 1.
+ NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
+ (InstructionCost::CostType)1);
+
+ LLVM_DEBUG(dbgs()
+ << "We expect runtime memory checks to be hoisted "
+ << "out of the outer loop. Cost reduced from "
+ << MemCheckCost << " to " << NewMemCheckCost << '\n');
+
+ MemCheckCost = NewMemCheckCost;
+ }
+ }
+
+ RTCheckCost += MemCheckCost;
+ }
+
if (SCEVCheckBlock || MemCheckBlock)
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
<< "\n");
@@ -2144,8 +2194,8 @@ public:
BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
// Create new preheader for vector loop.
- if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
- PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
+ if (OuterLoop)
+ OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
SCEVCheckBlock->getTerminator()->eraseFromParent();
SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
@@ -2179,8 +2229,8 @@ public:
DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
MemCheckBlock->moveBefore(LoopVectorPreHeader);
- if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
- PL->addBasicBlockToLoop(MemCheckBlock, *LI);
+ if (OuterLoop)
+ OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bbeb5da2cfec..ae2fc522ba40 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -597,13 +597,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
- // Some vectorized function variants may also take a scalar argument,
- // e.g. linear parameters for pointers.
Value *Arg;
- if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
- (UseIntrinsic &&
- isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
+ if (UseIntrinsic &&
+ isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
Arg = State.get(I.value(), VPIteration(0, 0));
+ // Some vectorized function variants may also take a scalar argument,
+ // e.g. linear parameters for pointers. This needs to be the scalar value
+ // from the start of the respective part when interleaving.
+ else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
+ Arg = State.get(I.value(), VPIteration(Part, 0));
else
Arg = State.get(I.value(), Part);
if (UseIntrinsic &&