| author    | Dimitry Andric <dim@FreeBSD.org>                              | 2024-02-07 14:37:28 +0000 |
|-----------|---------------------------------------------------------------|---------------------------|
| committer | Dimitry Andric <dim@FreeBSD.org>                              | 2024-02-07 14:37:28 +0000 |
| commit    | 4fdf604ba667503ae582304cebdd3df426778a6b (patch)              |                           |
| tree      | aae65a5d1b0a1b1acd9389fc36a0baf5fc1918c7 /llvm/lib/Transforms |                           |
| parent    | 2d835ae8657273e3aa8b9ef3201fb8df5563af9d (diff)               |                           |
Vendor import of llvm-project branch release/18.x llvmorg-18.1.0-rc2-0-gc6c86965d967.
(tag: vendor/llvm-project/llvmorg-18.1.0-rc2-0-gc6c86965d967)
Diffstat (limited to 'llvm/lib/Transforms'; per-file counts derived from the diff below)

```
 llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp  |  3 ++-
 llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp |  3 +++
 llvm/lib/Transforms/Scalar/ConstraintElimination.cpp            |  7 ++++++-
 llvm/lib/Transforms/Scalar/JumpThreading.cpp                    | 13 ++++++++-----
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                 | 62 +++++++++++++++++++++---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp                  | 12 +++++++-----
 6 files changed, 82 insertions(+), 18 deletions(-)
```
```diff
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index bb2a77daa60a..1254a050027a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1032,7 +1032,8 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
   // where there are several consecutive memory accesses to the same location,
   // separated by a few arithmetic operations.
   bool IsLoadCSE = false;
-  if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) {
+  BatchAAResults BatchAA(*AA);
+  if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
     if (IsLoadCSE)
       combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a8a5f9831e15..79873a9b4cbb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -802,6 +802,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         return InsertNewInstWith(LShr, I->getIterator());
       } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
         Known.One |= HighBits;
+        // SignBits may be out-of-sync with Known.countMinSignBits(). Mask out
+        // high bits of Known.Zero to avoid conflicts.
+        Known.Zero &= ~HighBits;
       }
     } else {
       computeKnownBits(I, Known, Depth, CxtI);
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 8f09569d0d9c..7b672e89b67a 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1061,11 +1061,16 @@ void State::addInfoFor(BasicBlock &BB) {
           FactOrCheck::getCheck(DT.getNode(&BB), cast<CallInst>(&I)));
       break;
     // Enqueue the intrinsics to add extra info.
-    case Intrinsic::abs:
     case Intrinsic::umin:
     case Intrinsic::umax:
     case Intrinsic::smin:
     case Intrinsic::smax:
+      // TODO: Check if it is possible to instead only added the min/max facts
+      // when simplifying uses of the min/max intrinsics.
+      if (!isGuaranteedNotToBePoison(&I))
+        break;
+      [[fallthrough]];
+    case Intrinsic::abs:
       WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I));
       break;
     }
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 8603c5cf9c02..87c01ead634f 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1260,8 +1260,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
   // the entry to its block.
   BasicBlock::iterator BBIt(LoadI);
   bool IsLoadCSE;
+  BatchAAResults BatchAA(*AA);
+  // The dominator tree is updated lazily and may not be valid at this point.
+  BatchAA.disableDominatorTree();
   if (Value *AvailableVal = FindAvailableLoadedValue(
-          LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+          LoadI, LoadBB, BBIt, DefMaxInstsToScan, &BatchAA, &IsLoadCSE)) {
     // If the value of the load is locally available within the block, just use
     // it. This frequently occurs for reg2mem'd allocas.
 
@@ -1322,9 +1325,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB),
                        LocationSize::precise(DL.getTypeStoreSize(AccessTy)),
                        AATags);
-    PredAvailable = findAvailablePtrLoadStore(Loc, AccessTy, LoadI->isAtomic(),
-                                              PredBB, BBIt, DefMaxInstsToScan,
-                                              AA, &IsLoadCSE, &NumScanedInst);
+    PredAvailable = findAvailablePtrLoadStore(
+        Loc, AccessTy, LoadI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+        &BatchAA, &IsLoadCSE, &NumScanedInst);
 
     // If PredBB has a single predecessor, continue scanning through the
     // single predecessor.
@@ -1336,7 +1339,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
       BBIt = SinglePredBB->end();
       PredAvailable = findAvailablePtrLoadStore(
           Loc, AccessTy, LoadI->isAtomic(), SinglePredBB, BBIt,
-          (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+          (DefMaxInstsToScan - NumScanedInst), &BatchAA, &IsLoadCSE,
           &NumScanedInst);
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ca93e15719f..dd596c567cd4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
   bool CostTooHigh = false;
   const bool AddBranchWeights;
 
+  Loop *OuterLoop = nullptr;
+
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                     TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ public:
       DT->eraseNode(SCEVCheckBlock);
       LI->removeBlock(SCEVCheckBlock);
     }
+
+    // Outer loop is used as part of the later cost calculations.
+    OuterLoop = L->getParentLoop();
   }
 
   InstructionCost getCost() {
@@ -2076,16 +2081,61 @@ public:
       LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
       RTCheckCost += C;
     }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
       }
 
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will likely be hoisted out and so the effective cost will
+      // reduce according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        // TODO: If profitable, we could refine this further by analysing every
+        // individual memory check, since there could be a mixture of loop
+        // variant and invariant checks that mean the final condition is
+        // variant.
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          // It seems reasonable to assume that we can reduce the effective
+          // cost of the checks even when we know nothing about the trip
+          // count. Assume that the outer loop executes at least twice.
+          unsigned BestTripCount = 2;
+
+          // If exact trip count is known use that.
+          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+            BestTripCount = SmallTC;
+          else if (LoopVectorizeWithBlockFrequency) {
+            // Else use profile data if available.
+            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+              BestTripCount = *EstimatedTC;
+          }
+
+          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+          // Let's ensure the cost is always at least 1.
+          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
+                                     (InstructionCost::CostType)1);
+
+          LLVM_DEBUG(dbgs()
+                     << "We expect runtime memory checks to be hoisted "
+                     << "out of the outer loop. Cost reduced from "
+                     << MemCheckCost << " to " << NewMemCheckCost << '\n');
+
+          MemCheckCost = NewMemCheckCost;
+        }
+      }
+
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -2144,8 +2194,8 @@ public:
     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
 
     // Create new preheader for vector loop.
-    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
-      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
+    if (OuterLoop)
+      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
 
     SCEVCheckBlock->getTerminator()->eraseFromParent();
     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
@@ -2179,8 +2229,8 @@ public:
     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
     MemCheckBlock->moveBefore(LoopVectorPreHeader);
 
-    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
-      PL->addBasicBlockToLoop(MemCheckBlock, *LI);
+    if (OuterLoop)
+      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
 
     BranchInst &BI =
         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bbeb5da2cfec..ae2fc522ba40 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -597,13 +597,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
     for (const auto &I : enumerate(operands())) {
       // Some intrinsics have a scalar argument - don't replace it with a
       // vector.
-      // Some vectorized function variants may also take a scalar argument,
-      // e.g. linear parameters for pointers.
       Value *Arg;
-      if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
-          (UseIntrinsic &&
-           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
+      if (UseIntrinsic &&
+          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
         Arg = State.get(I.value(), VPIteration(0, 0));
+      // Some vectorized function variants may also take a scalar argument,
+      // e.g. linear parameters for pointers. This needs to be the scalar value
+      // from the start of the respective part when interleaving.
+      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
+        Arg = State.get(I.value(), VPIteration(Part, 0));
       else
         Arg = State.get(I.value(), Part);
       if (UseIntrinsic &&
```
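The LoopVectorize hunk above amortizes the cost of outer-loop-invariant runtime memory checks over the outer loop's expected trip count (defaulting to 2, refined by an exact or profile-estimated count) and clamps the result to at least 1. A minimal standalone sketch of that divide-and-clamp arithmetic, using plain integers instead of LLVM's InstructionCost type; the helper name and the example values are hypothetical, for illustration only:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the accounting in GeneratedRTChecks::getCost():
// if the memory checks are invariant in the outer loop, they are expected to
// be hoisted, so their cost is divided by the best-known trip count and
// clamped so it never reaches zero.
static int64_t amortizedMemCheckCost(int64_t MemCheckCost,
                                     unsigned BestTripCount) {
  return std::max<int64_t>(MemCheckCost / BestTripCount, 1);
}

int main() {
  // With no trip-count information the patch assumes two outer iterations.
  assert(amortizedMemCheckCost(9, 2) == 4);
  // An exact or profile-estimated trip count reduces the cost further.
  assert(amortizedMemCheckCost(9, 4) == 2);
  // The clamp keeps the checks from ever being accounted as free.
  assert(amortizedMemCheckCost(3, 100) == 1);
  return 0;
}
```

This also mirrors why the patch divides `MemCheckCost` rather than zeroing it: even a hoisted check still executes at least once.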