From 344a3780b2e33f6ca763666c380202b18aab72a3 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Thu, 29 Jul 2021 22:15:26 +0200
Subject: Vendor import of llvm-project main 88e66fa60ae5, the last commit
 before the upstream release/13.x branch was created.

---
 .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp  | 116 +++------------------
 1 file changed, 17 insertions(+), 99 deletions(-)

(limited to 'llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp')

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 84d72e1b579f..4e3d5fdc012d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
 // there is at most one ret and one unreachable instruction, it ensures there is
 // at most one divergent exiting block.
 //
@@ -54,6 +54,9 @@ using namespace llvm;
 namespace {
 
 class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+private:
+  const TargetTransformInfo *TTI = nullptr;
+
 public:
   static char ID; // Pass identification, replacement for typeid
 
@@ -63,6 +66,9 @@ public:
 
   // We can preserve non-critical-edgeness when we unify function exit nodes
   void getAnalysisUsage(AnalysisUsage &AU) const override;
+  BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
+                                  ArrayRef<BasicBlock *> ReturningBlocks,
+                                  StringRef Name);
   bool runOnFunction(Function &F) override;
 };
 
@@ -110,12 +116,9 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
 /// XXX - Is there a more efficient way to find this?
 static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                                BasicBlock &BB) {
-  SmallVector<BasicBlock *, 8> Stack;
+  SmallVector<BasicBlock *, 8> Stack(predecessors(&BB));
   SmallPtrSet<BasicBlock *, 8> Visited;
 
-  for (BasicBlock *Pred : predecessors(&BB))
-    Stack.push_back(Pred);
-
   while (!Stack.empty()) {
     BasicBlock *Top = Stack.pop_back_val();
     if (!DA.isUniform(Top->getTerminator()))
@@ -130,49 +133,15 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
-
-static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
-                                       ArrayRef<BasicBlock *> ReturningBlocks,
-                                       bool InsertExport,
-                                       const TargetTransformInfo &TTI,
-                                       StringRef Name) {
+BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+    Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
+    StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
   IRBuilder<> B(NewRetBlock);
 
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(AMDGPU::Exp::ET_NULL),
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
-
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
     B.CreateRetVoid();
@@ -180,7 +149,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
     // If the function doesn't return void... add a PHI node to the block...
     PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                      "UnifiedRetVal");
-    assert(!InsertExport);
     B.CreateRet(PN);
   }
 
@@ -206,7 +174,7 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
 
   for (BasicBlock *BB : ReturningBlocks) {
     // Cleanup possible branch to unconditional branch to the return.
-    simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
+    simplifyCFG(BB, *TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
                 SimplifyCFGOptions().bonusInstThreshold(2));
   }
 
@@ -220,25 +188,21 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
 
   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
 
-  // If there's only one exit, we don't need to do anything, unless this is a
-  // pixel shader and that exit is an infinite loop, since we still have to
-  // insert an export in that case.
-  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
+  // If there's only one exit, we don't need to do anything.
+  if (PDT.root_size() <= 1)
     return false;
 
   LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
   SmallVector<BasicBlock *, 4> ReturningBlocks;
-  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
   SmallVector<BasicBlock *, 4> UnreachableBlocks;
 
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
-
   bool Changed = false;
   std::vector<DominatorTree::UpdateType> Updates;
 
@@ -246,8 +210,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         ReturningBlocks.push_back(BB);
-      else
-        UniformlyReachedRetBlocks.push_back(BB);
     } else if (isa<UnreachableInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         UnreachableBlocks.push_back(BB);
@@ -259,36 +221,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
-        // For pixel shaders, the producer guarantees that an export is
-        // executed before each return instruction. However, if there is an
-        // infinite loop and we insert a return ourselves, we need to uphold
-        // that guarantee by inserting a null export. This can happen e.g. in
-        // an infinite loop with kill instructions, which is supposed to
-        // terminate. However, we don't need to do this if there is a non-void
-        // return value, since then there is an epilog afterwards which will
-        // still export.
-        //
-        // Note: In the case where only some threads enter the infinite loop,
-        // this can result in the null export happening redundantly after the
-        // original exports. However, the last "real" export happens after all
-        // the threads that didn't enter an infinite loop converged, which
-        // means that the only extra threads to execute the null export are
-        // threads that entered the infinite loop, and they only could've
-        // exited through being killed, which sets their exec bit to 0.
-        // Therefore, unless there's an actual infinite loop, which can have
-        // invalid results, or there's a kill after the last export, which we
-        // assume the frontend won't do, this export will have the same exec
-        // mask as the last "real" export, and therefore the valid mask will be
-        // overwritten with the same value and will still be correct. Also,
-        // even though this forces an extra unnecessary export wait, we assume
-        // that this happens rarely enough in practice that we don't have to
-        // worry about performance.
-        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
-            RetTy->isVoidTy()) {
-          InsertExport = true;
-        }
-
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -380,23 +312,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   if (ReturningBlocks.empty())
     return Changed; // No blocks return
 
-  if (ReturningBlocks.size() == 1 && !InsertExport)
+  if (ReturningBlocks.size() == 1)
     return Changed; // Already has a single return block
 
-  const TargetTransformInfo &TTI
-      = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
-  // Unify returning blocks. If we are going to insert the export it is also
-  // necessary to include blocks that are uniformly reached, because in addition
-  // to inserting the export the "done" bits on existing exports will be cleared
-  // and we do not want to end up with the normal export in a non-unified,
-  // uniformly reached block with the "done" bit cleared.
-  auto BlocksToUnify = std::move(ReturningBlocks);
-  if (InsertExport) {
-    llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
-  }
-
-  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
-                      "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
   return true;
 }
-- 
cgit v1.2.3
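
Note: as a hand-written illustration of the transform that unifyReturnBlockSet
performs (a sketch, not part of the patch; only the "UnifiedReturnBlock" and
"UnifiedRetVal" names are taken from the code above, the rest is made up), IR
with two divergent returns

    ret1:
      ret float %a
    ret2:
      ret float %b

is rewritten so that both returns become unconditional branches to a single
divergent exiting block, with a PHI merging the return values:

    ret1:
      br label %UnifiedReturnBlock
    ret2:
      br label %UnifiedReturnBlock
    UnifiedReturnBlock:
      %UnifiedRetVal = phi float [ %a, %ret1 ], [ %b, %ret2 ]
      ret float %UnifiedRetVal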