path: root/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2021-07-29 20:15:26 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2021-07-29 20:15:26 +0000
commit    344a3780b2e33f6ca763666c380202b18aab72a3 (patch)
tree      f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
parent    b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
the upstream release/13.x branch was created.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp')
-rw-r--r--    llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp    116
1 file changed, 17 insertions, 99 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 84d72e1b579f..4e3d5fdc012d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there is
// at most one divergent exiting block.
//
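The transformation is easiest to see on a toy CFG. An illustrative before/after sketch (not part of the commit):

    // Illustrative only: a function whose two returns are reached through a
    // divergent branch ends up with a single divergent exiting block.
    //
    //   before:                         after:
    //     entry                           entry
    //     |    \  (divergent cond)        |    \  (divergent cond)
    //    ret    ret                       br     br
    //                                       \   /
    //                                UnifiedReturnBlock: ret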
@@ -54,6 +54,9 @@ using namespace llvm;
namespace {
class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+private:
+ const TargetTransformInfo *TTI = nullptr;
+
public:
static char ID; // Pass identification, replacement for typeid
@@ -63,6 +66,9 @@ public:
// We can preserve non-critical-edgeness when we unify function exit nodes
void getAnalysisUsage(AnalysisUsage &AU) const override;
+ BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
+ ArrayRef<BasicBlock *> ReturningBlocks,
+ StringRef Name);
bool runOnFunction(Function &F) override;
};
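The new declarations pair with getAnalysisUsage, whose body sits outside the hunks shown here. A minimal sketch of the usual legacy-pass-manager pattern, listing only the analyses this diff visibly consumes (the real method may require or preserve more):

    // Sketch, not the file's exact body: declare the analyses fetched later
    // in runOnFunction via getAnalysis<...>().
    void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<PostDominatorTreeWrapperPass>();
      AU.addRequired<LegacyDivergenceAnalysis>();
      AU.addRequired<TargetTransformInfoWrapperPass>();
    }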
@@ -110,12 +116,9 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
BasicBlock &BB) {
- SmallVector<BasicBlock *, 8> Stack;
+ SmallVector<BasicBlock *, 8> Stack(predecessors(&BB));
SmallPtrSet<BasicBlock *, 8> Visited;
- for (BasicBlock *Pred : predecessors(&BB))
- Stack.push_back(Pred);
-
while (!Stack.empty()) {
BasicBlock *Top = Stack.pop_back_val();
if (!DA.isUniform(Top->getTerminator()))
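The check seeds its worklist with the block's predecessors and walks the CFG backwards; the loop's tail follows in the next hunk. As a self-contained illustration of the same idiom, with the divergence query abstracted into a predicate (the helper name and template are illustrative, not the pass's):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/CFG.h"
    using namespace llvm;

    // A block is "uniformly reached" iff no block that can reach it ends in
    // a divergent terminator.
    template <typename UniformPredicate>
    static bool reachedUniformly(BasicBlock &BB, UniformPredicate IsUniform) {
      SmallVector<BasicBlock *, 8> Stack(predecessors(&BB)); // seed worklist
      SmallPtrSet<BasicBlock *, 8> Visited;
      while (!Stack.empty()) {
        BasicBlock *Top = Stack.pop_back_val();
        if (!IsUniform(Top->getTerminator()))
          return false; // a divergent branch can reach BB
        for (BasicBlock *Pred : predecessors(Top))
          if (Visited.insert(Pred).second) // visit each predecessor once
            Stack.push_back(Pred);
      }
      return true;
    }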
@@ -130,49 +133,15 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
return true;
}
-static void removeDoneExport(Function &F) {
- ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
- if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
- Intrin->setArgOperand(6, BoolFalse); // done
- } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
- Intrin->setArgOperand(4, BoolFalse); // done
- }
- }
- }
- }
-}
-
-static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
- ArrayRef<BasicBlock *> ReturningBlocks,
- bool InsertExport,
- const TargetTransformInfo &TTI,
- StringRef Name) {
+BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+ Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
+ StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add a PHI
// node (if the function returns values), and convert all of the return
// instructions into unconditional branches.
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
IRBuilder<> B(NewRetBlock);
- if (InsertExport) {
- // Ensure that there's only one "done" export in the shader by removing the
- // "done" bit set on the original final export. More than one "done" export
- // can lead to undefined behavior.
- removeDoneExport(F);
-
- Value *Undef = UndefValue::get(B.getFloatTy());
- B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
- {
- B.getInt32(AMDGPU::Exp::ET_NULL),
- B.getInt32(0), // enabled channels
- Undef, Undef, Undef, Undef, // values
- B.getTrue(), // done
- B.getTrue(), // valid mask
- });
- }
-
PHINode *PN = nullptr;
if (F.getReturnType()->isVoidTy()) {
B.CreateRetVoid();
@@ -180,7 +149,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
// If the function doesn't return void... add a PHI node to the block...
PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
"UnifiedRetVal");
- assert(!InsertExport);
B.CreateRet(PN);
}
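Between this hunk and the next, the pass rewires every returning block into the new one; that context is elided from the diff. A hedged sketch of the classic rewrite, modeled on UnifyFunctionExitNodes rather than copied from this file:

    // Illustrative rewiring step: each `ret` becomes an unconditional branch
    // to NewRetBlock, and any returned value is forwarded through the PHI.
    for (BasicBlock *BB : ReturningBlocks) {
      ReturnInst *RI = cast<ReturnInst>(BB->getTerminator());
      if (PN)
        PN->addIncoming(RI->getReturnValue(), BB); // forward the return value
      RI->eraseFromParent();                       // drop the old terminator
      BranchInst::Create(NewRetBlock, BB);         // branch to the unified exit
      // The real code also records the new BB -> NewRetBlock edge with DTU.
    }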
@@ -206,7 +174,7 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
for (BasicBlock *BB : ReturningBlocks) {
// Cleanup possible branch to unconditional branch to the return.
- simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
+ simplifyCFG(BB, *TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
SimplifyCFGOptions().bonusInstThreshold(2));
}
@@ -220,25 +188,21 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- // If there's only one exit, we don't need to do anything, unless this is a
- // pixel shader and that exit is an infinite loop, since we still have to
- // insert an export in that case.
- if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
+ // If there's only one exit, we don't need to do anything.
+ if (PDT.root_size() <= 1)
return false;
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
- SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
// Dummy return block for infinite loop.
BasicBlock *DummyReturnBB = nullptr;
- bool InsertExport = false;
-
bool Changed = false;
std::vector<DominatorTree::UpdateType> Updates;
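Updates batches CFG edge changes so the (post)dominator trees are refreshed once rather than per edge. A minimal sketch of that idiom, with FromBB, ToBB, DT, and PDT as placeholder names:

    // Record edges while the CFG is being edited ...
    Updates.push_back({DominatorTree::Insert, FromBB, ToBB});
    // ... then apply them to both trees in one batch.
    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
    DTU.applyUpdates(Updates);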
@@ -246,8 +210,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
ReturningBlocks.push_back(BB);
- else
- UniformlyReachedRetBlocks.push_back(BB);
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
@@ -259,36 +221,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
"DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
- // For pixel shaders, the producer guarantees that an export is
- // executed before each return instruction. However, if there is an
- // infinite loop and we insert a return ourselves, we need to uphold
- // that guarantee by inserting a null export. This can happen e.g. in
- // an infinite loop with kill instructions, which is supposed to
- // terminate. However, we don't need to do this if there is a non-void
- // return value, since then there is an epilog afterwards which will
- // still export.
- //
- // Note: In the case where only some threads enter the infinite loop,
- // this can result in the null export happening redundantly after the
- // original exports. However, the last "real" export happens after all
- // the threads that didn't enter an infinite loop converged, which
- // means that the only extra threads to execute the null export are
- // threads that entered the infinite loop, and they only could've
- // exited through being killed which sets their exec bit to 0.
- // Therefore, unless there's an actual infinite loop, which can have
- // invalid results, or there's a kill after the last export, which we
- // assume the frontend won't do, this export will have the same exec
- // mask as the last "real" export, and therefore the valid mask will be
- // overwritten with the same value and will still be correct. Also,
- // even though this forces an extra unnecessary export wait, we assume
- // that this happens rarely enough in practice that we don't have to
- // worry about performance.
- if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
- RetTy->isVoidTy()) {
- InsertExport = true;
- }
-
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
ReturningBlocks.push_back(DummyReturnBB);
}
@@ -380,23 +312,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
if (ReturningBlocks.empty())
return Changed; // No blocks return
- if (ReturningBlocks.size() == 1 && !InsertExport)
+ if (ReturningBlocks.size() == 1)
return Changed; // Already has a single return block
- const TargetTransformInfo &TTI
- = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- // Unify returning blocks. If we are going to insert the export it is also
- // necessary to include blocks that are uniformly reached, because in addition
- // to inserting the export the "done" bits on existing exports will be cleared
- // and we do not want to end up with the normal export in a non-unified,
- // uniformly reached block with the "done" bit cleared.
- auto BlocksToUnify = std::move(ReturningBlocks);
- if (InsertExport) {
- llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
- }
-
- unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
- "UnifiedReturnBlock");
+ unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
return true;
}