author     Dimitry Andric <dim@FreeBSD.org>  2021-02-16 20:13:02 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2021-02-16 20:13:02 +0000
commit     b60736ec1405bb0a8dd40989f67ef4c93da068ab (patch)
tree       5c43fbb7c9fc45f0f87e0e6795a86267dbd12f9d /llvm/lib/Transforms/Scalar
parent     cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff)
Vendor import of llvm-project main 8e464dd76bef, the last commit before
the upstream release/12.x branch was created.
(vendor/llvm-project/llvmorg-12-init-17869-g8e464dd76bef)
Diffstat (limited to 'llvm/lib/Transforms/Scalar')
-rw-r--r--  llvm/lib/Transforms/Scalar/ADCE.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp | 90
-rw-r--r--  llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 15
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstantProp.cpp | 121
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 407
-rw-r--r--  llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 339
-rw-r--r--  llvm/lib/Transforms/Scalar/DCE.cpp | 66
-rw-r--r--  llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 1066
-rw-r--r--  llvm/lib/Transforms/Scalar/DivRemPairs.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 432
-rw-r--r--  llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Scalar/Float2Int.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Scalar/GVN.cpp | 417
-rw-r--r--  llvm/lib/Transforms/Scalar/GVNHoist.cpp | 1383
-rw-r--r--  llvm/lib/Transforms/Scalar/GVNSink.cpp | 13
-rw-r--r--  llvm/lib/Transforms/Scalar/GuardWidening.cpp | 14
-rw-r--r--  llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 1308
-rw-r--r--  llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp | 104
-rw-r--r--  llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 160
-rw-r--r--  llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/JumpThreading.cpp | 350
-rw-r--r--  llvm/lib/Transforms/Scalar/LICM.cpp | 333
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 129
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 728
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFuse.cpp | 329
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 534
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 135
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 63
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 313
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopPredication.cpp | 15
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopRerollPass.cpp | 66
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopRotation.cpp | 45
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 28
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopSink.cpp | 148
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 139
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 92
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopUnswitch.cpp | 321
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp | 110
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 21
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 291
-rw-r--r--  llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 860
-rw-r--r--  llvm/lib/Transforms/Scalar/MergeICmps.cpp | 24
-rw-r--r--  llvm/lib/Transforms/Scalar/NaryReassociate.cpp | 90
-rw-r--r--  llvm/lib/Transforms/Scalar/NewGVN.cpp | 127
-rw-r--r--  llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Scalar/Reassociate.cpp | 114
-rw-r--r--  llvm/lib/Transforms/Scalar/Reg2Mem.cpp | 120
-rw-r--r--  llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 147
-rw-r--r--  llvm/lib/Transforms/Scalar/SCCP.cpp | 412
-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp | 313
-rw-r--r--  llvm/lib/Transforms/Scalar/Scalar.cpp | 37
-rw-r--r--  llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp | 948
-rw-r--r--  llvm/lib/Transforms/Scalar/Scalarizer.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 84
-rw-r--r--  llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 74
-rw-r--r--  llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 188
-rw-r--r--  llvm/lib/Transforms/Scalar/Sink.cpp | 80
-rw-r--r--  llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp | 101
-rw-r--r--  llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 171
-rw-r--r--  llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp | 146
-rw-r--r--  llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp | 6
68 files changed, 9480 insertions, 4775 deletions
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index c3709b9afffb..2b649732a799 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -643,7 +643,7 @@ void AggressiveDeadCodeElimination::computeReversePostOrder() {
SmallPtrSet<BasicBlock*, 16> Visited;
unsigned PostOrder = 0;
for (auto &BB : F) {
- if (succ_begin(&BB) != succ_end(&BB))
+ if (!succ_empty(&BB))
continue;
for (BasicBlock *Block : inverse_post_order_ext(&BB,Visited))
BlockInfo[Block].PostOrder = PostOrder++;
diff --git a/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp b/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp
new file mode 100644
index 000000000000..a02d88fe066f
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp
@@ -0,0 +1,90 @@
+//===-- AnnotationRemarks.cpp - Generate remarks for annotated instrs. ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Generate remarks for instructions marked with !annotation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace llvm::ore;
+
+#define DEBUG_TYPE "annotation-remarks"
+#define REMARK_PASS DEBUG_TYPE
+
+static void runImpl(Function &F) {
+ if (!OptimizationRemarkEmitter::allowExtraAnalysis(F, REMARK_PASS))
+ return;
+
+ OptimizationRemarkEmitter ORE(&F);
+ // For now, just generate a summary of the annotated instructions.
+ MapVector<StringRef, unsigned> Mapping;
+ for (Instruction &I : instructions(F)) {
+ if (!I.hasMetadata(LLVMContext::MD_annotation))
+ continue;
+ for (const MDOperand &Op :
+ I.getMetadata(LLVMContext::MD_annotation)->operands()) {
+ auto Iter = Mapping.insert({cast<MDString>(Op.get())->getString(), 0});
+ Iter.first->second++;
+ }
+ }
+
+ Instruction *IP = &*F.begin()->begin();
+ for (const auto &KV : Mapping)
+ ORE.emit(OptimizationRemarkAnalysis(REMARK_PASS, "AnnotationSummary", IP)
+ << "Annotated " << NV("count", KV.second) << " instructions with "
+ << NV("type", KV.first));
+}
+
+namespace {
+
+struct AnnotationRemarksLegacy : public FunctionPass {
+ static char ID;
+
+ AnnotationRemarksLegacy() : FunctionPass(ID) {
+ initializeAnnotationRemarksLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ runImpl(F);
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+char AnnotationRemarksLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AnnotationRemarksLegacy, "annotation-remarks",
+ "Annotation Remarks", false, false)
+INITIALIZE_PASS_END(AnnotationRemarksLegacy, "annotation-remarks",
+ "Annotation Remarks", false, false)
+
+FunctionPass *llvm::createAnnotationRemarksLegacyPass() {
+ return new AnnotationRemarksLegacy();
+}
+
+PreservedAnalyses AnnotationRemarksPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ runImpl(F);
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index b26bd1114bd4..2eb94b721d96 100644
--- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -208,7 +208,7 @@ static bool canSplitCallSite(CallBase &CB, TargetTransformInfo &TTI) {
// instructions before the call is less then DuplicationThreshold. The
// instructions before the call will be duplicated in the split blocks and
// corresponding uses will be updated.
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
for (auto &InstBeforeCall :
llvm::make_range(CallSiteBB->begin(), CB.getIterator())) {
Cost += TTI.getInstructionCost(&InstBeforeCall,
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 7c14b69d658d..fdab74fc94c5 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -366,9 +366,9 @@ void ConstantHoistingPass::collectConstantCandidates(
ConstInt->getValue(), ConstInt->getType(),
TargetTransformInfo::TCK_SizeAndLatency);
else
- Cost = TTI->getIntImmCostInst(Inst->getOpcode(), Idx, ConstInt->getValue(),
- ConstInt->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
+ Cost = TTI->getIntImmCostInst(
+ Inst->getOpcode(), Idx, ConstInt->getValue(), ConstInt->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency, Inst);
// Ignore cheap integer constants.
if (Cost > TargetTransformInfo::TCC_Basic) {
@@ -418,8 +418,9 @@ void ConstantHoistingPass::collectConstantCandidates(
// usually lowered to a load from constant pool. Such operation is unlikely
// to be cheaper than compute it by <Base + Offset>, which can be lowered to
// an ADD instruction or folded into Load/Store instruction.
- int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy,
- TargetTransformInfo::TCK_SizeAndLatency);
+ int Cost =
+ TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy,
+ TargetTransformInfo::TCK_SizeAndLatency, Inst);
ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV];
ConstCandMapType::iterator Itr;
bool Inserted;
@@ -950,7 +951,7 @@ bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
// base constant.
if (!ConstIntCandVec.empty())
findBaseConstants(nullptr);
- for (auto &MapEntry : ConstGEPCandMap)
+ for (const auto &MapEntry : ConstGEPCandMap)
if (!MapEntry.second.empty())
findBaseConstants(MapEntry.first);
@@ -959,7 +960,7 @@ bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
bool MadeChange = false;
if (!ConstIntInfoVec.empty())
MadeChange = emitBaseConstants(nullptr);
- for (auto MapEntry : ConstGEPInfoMap)
+ for (const auto &MapEntry : ConstGEPInfoMap)
if (!MapEntry.second.empty())
MadeChange |= emitBaseConstants(MapEntry.first);
diff --git a/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/llvm/lib/Transforms/Scalar/ConstantProp.cpp
deleted file mode 100644
index 73bf1d521b1d..000000000000
--- a/llvm/lib/Transforms/Scalar/ConstantProp.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements constant propagation and merging:
-//
-// Specifically, this:
-// * Converts instructions like "add int 1, 2" into 3
-//
-// Notice that:
-// * This pass has a habit of making definitions be dead. It is a good idea
-// to run a DIE pass sometime after running this pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "constprop"
-
-STATISTIC(NumInstKilled, "Number of instructions killed");
-DEBUG_COUNTER(CPCounter, "constprop-transform",
- "Controls which instructions are killed");
-
-namespace {
- struct ConstantPropagation : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- ConstantPropagation() : FunctionPass(ID) {
- initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
- };
-}
-
-char ConstantPropagation::ID = 0;
-INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
- "Simple constant propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(ConstantPropagation, "constprop",
- "Simple constant propagation", false, false)
-
-FunctionPass *llvm::createConstantPropagationPass() {
- return new ConstantPropagation();
-}
-
-bool ConstantPropagation::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- // Initialize the worklist to all of the instructions ready to process...
- SmallPtrSet<Instruction *, 16> WorkList;
- // The SmallVector of WorkList ensures that we do iteration at stable order.
- // We use two containers rather than one SetVector, since remove is
- // linear-time, and we don't care enough to remove from Vec.
- SmallVector<Instruction *, 16> WorkListVec;
- for (Instruction &I : instructions(&F)) {
- WorkList.insert(&I);
- WorkListVec.push_back(&I);
- }
-
- bool Changed = false;
- const DataLayout &DL = F.getParent()->getDataLayout();
- TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- while (!WorkList.empty()) {
- SmallVector<Instruction*, 16> NewWorkListVec;
- for (auto *I : WorkListVec) {
- WorkList.erase(I); // Remove element from the worklist...
-
- if (!I->use_empty()) // Don't muck with dead instructions...
- if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
- if (!DebugCounter::shouldExecute(CPCounter))
- continue;
-
- // Add all of the users of this instruction to the worklist, they might
- // be constant propagatable now...
- for (User *U : I->users()) {
- // If user not in the set, then add it to the vector.
- if (WorkList.insert(cast<Instruction>(U)).second)
- NewWorkListVec.push_back(cast<Instruction>(U));
- }
-
- // Replace all of the uses of a variable with uses of the constant.
- I->replaceAllUsesWith(C);
-
- if (isInstructionTriviallyDead(I, TLI)) {
- I->eraseFromParent();
- ++NumInstKilled;
- }
-
- // We made a change to the function...
- Changed = true;
- }
- }
- WorkListVec = std::move(NewWorkListVec);
- }
- return Changed;
-}
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
new file mode 100644
index 000000000000..3b8af6f21ce5
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -0,0 +1,407 @@
+//===-- ConstraintElimination.cpp - Eliminate conds using constraints. ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Eliminate conditions based on constraints collected from dominating
+// conditions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ConstraintElimination.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstraintSystem.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "constraint-elimination"
+
+STATISTIC(NumCondsRemoved, "Number of instructions removed");
+DEBUG_COUNTER(EliminatedCounter, "conds-eliminated",
+ "Controls which conditions are eliminated");
+
+static int64_t MaxConstraintValue = std::numeric_limits<int64_t>::max();
+
+// Decomposes \p V into a vector of pairs of the form { c, X } where c * X. The
+// sum of the pairs equals \p V. The first pair is the constant-factor and X
+// must be nullptr. If the expression cannot be decomposed, returns an empty
+// vector.
+static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) {
+ if (auto *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->isNegative() || CI->uge(MaxConstraintValue))
+ return {};
+ return {{CI->getSExtValue(), nullptr}};
+ }
+ auto *GEP = dyn_cast<GetElementPtrInst>(V);
+ if (GEP && GEP->getNumOperands() == 2) {
+ if (isa<ConstantInt>(GEP->getOperand(GEP->getNumOperands() - 1))) {
+ return {{cast<ConstantInt>(GEP->getOperand(GEP->getNumOperands() - 1))
+ ->getSExtValue(),
+ nullptr},
+ {1, GEP->getPointerOperand()}};
+ }
+ Value *Op0;
+ ConstantInt *CI;
+ if (match(GEP->getOperand(GEP->getNumOperands() - 1),
+ m_NUWShl(m_Value(Op0), m_ConstantInt(CI))))
+ return {{0, nullptr},
+ {1, GEP->getPointerOperand()},
+ {std::pow(int64_t(2), CI->getSExtValue()), Op0}};
+ if (match(GEP->getOperand(GEP->getNumOperands() - 1),
+ m_ZExt(m_NUWShl(m_Value(Op0), m_ConstantInt(CI)))))
+ return {{0, nullptr},
+ {1, GEP->getPointerOperand()},
+ {std::pow(int64_t(2), CI->getSExtValue()), Op0}};
+
+ return {{0, nullptr},
+ {1, GEP->getPointerOperand()},
+ {1, GEP->getOperand(GEP->getNumOperands() - 1)}};
+ }
+
+ Value *Op0;
+ Value *Op1;
+ ConstantInt *CI;
+ if (match(V, m_NUWAdd(m_Value(Op0), m_ConstantInt(CI))))
+ return {{CI->getSExtValue(), nullptr}, {1, Op0}};
+ if (match(V, m_NUWAdd(m_Value(Op0), m_Value(Op1))))
+ return {{0, nullptr}, {1, Op0}, {1, Op1}};
+
+ if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI))))
+ return {{-1 * CI->getSExtValue(), nullptr}, {1, Op0}};
+ if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1))))
+ return {{0, nullptr}, {1, Op0}, {1, Op1}};
+
+ return {{0, nullptr}, {1, V}};
+}
+
+/// Turn a condition \p CmpI into a constraint vector, using indices from \p
+/// Value2Index. If \p ShouldAdd is true, new indices are added for values not
+/// yet in \p Value2Index.
+static SmallVector<int64_t, 8>
+getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
+ DenseMap<Value *, unsigned> &Value2Index, bool ShouldAdd) {
+ int64_t Offset1 = 0;
+ int64_t Offset2 = 0;
+
+ auto TryToGetIndex = [ShouldAdd,
+ &Value2Index](Value *V) -> Optional<unsigned> {
+ if (ShouldAdd) {
+ Value2Index.insert({V, Value2Index.size() + 1});
+ return Value2Index[V];
+ }
+ auto I = Value2Index.find(V);
+ if (I == Value2Index.end())
+ return None;
+ return I->second;
+ };
+
+ if (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE)
+ return getConstraint(CmpInst::getSwappedPredicate(Pred), Op1, Op0,
+ Value2Index, ShouldAdd);
+
+ // Only ULE and ULT predicates are supported at the moment.
+ if (Pred != CmpInst::ICMP_ULE && Pred != CmpInst::ICMP_ULT)
+ return {};
+
+ auto ADec = decompose(Op0);
+ auto BDec = decompose(Op1);
+ // Skip if decomposing either of the values failed.
+ if (ADec.empty() || BDec.empty())
+ return {};
+
+ // Skip trivial constraints without any variables.
+ if (ADec.size() == 1 && BDec.size() == 1)
+ return {};
+
+ Offset1 = ADec[0].first;
+ Offset2 = BDec[0].first;
+ Offset1 *= -1;
+
+ // Create iterator ranges that skip the constant-factor.
+ auto VariablesA = make_range(std::next(ADec.begin()), ADec.end());
+ auto VariablesB = make_range(std::next(BDec.begin()), BDec.end());
+
+ // Check if each referenced value in the constraint is already in the system
+ // or can be added (if ShouldAdd is true).
+ for (const auto &KV :
+ concat<std::pair<int64_t, Value *>>(VariablesA, VariablesB))
+ if (!TryToGetIndex(KV.second))
+ return {};
+
+ // Build result constraint, by first adding all coefficients from A and then
+ // subtracting all coefficients from B.
+ SmallVector<int64_t, 8> R(Value2Index.size() + 1, 0);
+ for (const auto &KV : VariablesA)
+ R[Value2Index[KV.second]] += KV.first;
+
+ for (const auto &KV : VariablesB)
+ R[Value2Index[KV.second]] -= KV.first;
+
+ R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0);
+ return R;
+}
+
+static SmallVector<int64_t, 8>
+getConstraint(CmpInst *Cmp, DenseMap<Value *, unsigned> &Value2Index,
+ bool ShouldAdd) {
+ return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0),
+ Cmp->getOperand(1), Value2Index, ShouldAdd);
+}
+
+namespace {
+/// Represents either a condition that holds on entry to a block or a basic
+/// block, with their respective Dominator DFS in and out numbers.
+struct ConstraintOrBlock {
+ unsigned NumIn;
+ unsigned NumOut;
+ bool IsBlock;
+ bool Not;
+ union {
+ BasicBlock *BB;
+ CmpInst *Condition;
+ };
+
+ ConstraintOrBlock(DomTreeNode *DTN)
+ : NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()), IsBlock(true),
+ BB(DTN->getBlock()) {}
+ ConstraintOrBlock(DomTreeNode *DTN, CmpInst *Condition, bool Not)
+ : NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()), IsBlock(false),
+ Not(Not), Condition(Condition) {}
+};
+
+struct StackEntry {
+ unsigned NumIn;
+ unsigned NumOut;
+ CmpInst *Condition;
+ bool IsNot;
+
+ StackEntry(unsigned NumIn, unsigned NumOut, CmpInst *Condition, bool IsNot)
+ : NumIn(NumIn), NumOut(NumOut), Condition(Condition), IsNot(IsNot) {}
+};
+} // namespace
+
+static bool eliminateConstraints(Function &F, DominatorTree &DT) {
+ bool Changed = false;
+ DT.updateDFSNumbers();
+ ConstraintSystem CS;
+
+ SmallVector<ConstraintOrBlock, 64> WorkList;
+
+ // First, collect conditions implied by branches and blocks with their
+ // Dominator DFS in and out numbers.
+ for (BasicBlock &BB : F) {
+ if (!DT.getNode(&BB))
+ continue;
+ WorkList.emplace_back(DT.getNode(&BB));
+
+ auto *Br = dyn_cast<BranchInst>(BB.getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ // If the condition is an OR of 2 compares and the false successor only has
+ // the current block as predecessor, queue both negated conditions for the
+ // false successor.
+ Value *Op0, *Op1;
+ if (match(Br->getCondition(), m_LogicalOr(m_Value(Op0), m_Value(Op1))) &&
+ match(Op0, m_Cmp()) && match(Op1, m_Cmp())) {
+ BasicBlock *FalseSuccessor = Br->getSuccessor(1);
+ if (FalseSuccessor->getSinglePredecessor()) {
+ WorkList.emplace_back(DT.getNode(FalseSuccessor), cast<CmpInst>(Op0),
+ true);
+ WorkList.emplace_back(DT.getNode(FalseSuccessor), cast<CmpInst>(Op1),
+ true);
+ }
+ continue;
+ }
+
+ // If the condition is an AND of 2 compares and the true successor only has
+ // the current block as predecessor, queue both conditions for the true
+ // successor.
+ if (match(Br->getCondition(), m_LogicalAnd(m_Value(Op0), m_Value(Op1))) &&
+ match(Op0, m_Cmp()) && match(Op1, m_Cmp())) {
+ BasicBlock *TrueSuccessor = Br->getSuccessor(0);
+ if (TrueSuccessor->getSinglePredecessor()) {
+ WorkList.emplace_back(DT.getNode(TrueSuccessor), cast<CmpInst>(Op0),
+ false);
+ WorkList.emplace_back(DT.getNode(TrueSuccessor), cast<CmpInst>(Op1),
+ false);
+ }
+ continue;
+ }
+
+ auto *CmpI = dyn_cast<CmpInst>(Br->getCondition());
+ if (!CmpI)
+ continue;
+ if (Br->getSuccessor(0)->getSinglePredecessor())
+ WorkList.emplace_back(DT.getNode(Br->getSuccessor(0)), CmpI, false);
+ if (Br->getSuccessor(1)->getSinglePredecessor())
+ WorkList.emplace_back(DT.getNode(Br->getSuccessor(1)), CmpI, true);
+ }
+
+ // Next, sort worklist by dominance, so that dominating blocks and conditions
+ // come before blocks and conditions dominated by them. If a block and a
+ // condition have the same numbers, the condition comes before the block, as
+ // it holds on entry to the block.
+ sort(WorkList, [](const ConstraintOrBlock &A, const ConstraintOrBlock &B) {
+ return std::tie(A.NumIn, A.IsBlock) < std::tie(B.NumIn, B.IsBlock);
+ });
+
+ // Finally, process ordered worklist and eliminate implied conditions.
+ SmallVector<StackEntry, 16> DFSInStack;
+ DenseMap<Value *, unsigned> Value2Index;
+ for (ConstraintOrBlock &CB : WorkList) {
+ // First, pop entries from the stack that are out-of-scope for CB. Remove
+ // the corresponding entry from the constraint system.
+ while (!DFSInStack.empty()) {
+ auto &E = DFSInStack.back();
+ LLVM_DEBUG(dbgs() << "Top of stack : " << E.NumIn << " " << E.NumOut
+ << "\n");
+ LLVM_DEBUG(dbgs() << "CB: " << CB.NumIn << " " << CB.NumOut << "\n");
+ assert(E.NumIn <= CB.NumIn);
+ if (CB.NumOut <= E.NumOut)
+ break;
+ LLVM_DEBUG(dbgs() << "Removing " << *E.Condition << " " << E.IsNot
+ << "\n");
+ DFSInStack.pop_back();
+ CS.popLastConstraint();
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Processing ";
+ if (CB.IsBlock)
+ dbgs() << *CB.BB;
+ else
+ dbgs() << *CB.Condition;
+ dbgs() << "\n";
+ });
+
+ // For a block, check if any CmpInsts become known based on the current set
+ // of constraints.
+ if (CB.IsBlock) {
+ for (Instruction &I : *CB.BB) {
+ auto *Cmp = dyn_cast<CmpInst>(&I);
+ if (!Cmp)
+ continue;
+ auto R = getConstraint(Cmp, Value2Index, false);
+ if (R.empty() || R.size() == 1)
+ continue;
+ if (CS.isConditionImplied(R)) {
+ if (!DebugCounter::shouldExecute(EliminatedCounter))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Condition " << *Cmp
+ << " implied by dominating constraints\n");
+ LLVM_DEBUG({
+ for (auto &E : reverse(DFSInStack))
+ dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n";
+ });
+ Cmp->replaceAllUsesWith(
+ ConstantInt::getTrue(F.getParent()->getContext()));
+ NumCondsRemoved++;
+ Changed = true;
+ }
+ if (CS.isConditionImplied(ConstraintSystem::negate(R))) {
+ if (!DebugCounter::shouldExecute(EliminatedCounter))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Condition !" << *Cmp
+ << " implied by dominating constraints\n");
+ LLVM_DEBUG({
+ for (auto &E : reverse(DFSInStack))
+ dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n";
+ });
+ Cmp->replaceAllUsesWith(
+ ConstantInt::getFalse(F.getParent()->getContext()));
+ NumCondsRemoved++;
+ Changed = true;
+ }
+ }
+ continue;
+ }
+
+ // Otherwise, add the condition to the system and stack, if we can transform
+ // it into a constraint.
+ auto R = getConstraint(CB.Condition, Value2Index, true);
+ if (R.empty())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n");
+ if (CB.Not)
+ R = ConstraintSystem::negate(R);
+
+ // If R has been added to the system, queue it for removal once it goes
+ // out-of-scope.
+ if (CS.addVariableRowFill(R))
+ DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not);
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses ConstraintEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!eliminateConstraints(F, DT))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+
+class ConstraintElimination : public FunctionPass {
+public:
+ static char ID;
+
+ ConstraintElimination() : FunctionPass(ID) {
+ initializeConstraintEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return eliminateConstraints(F, DT);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ConstraintElimination::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ConstraintElimination, "constraint-elimination",
+ "Constraint Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(ConstraintElimination, "constraint-elimination",
+ "Constraint Elimination", false, false)
+
+FunctionPass *llvm::createConstraintEliminationPass() {
+ return new ConstraintElimination();
+}
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index cd2f4ca36f3b..b671d68031a8 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -58,8 +58,11 @@ STATISTIC(NumMemAccess, "Number of memory access targets propagated");
STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
+STATISTIC(NumSDivSRemsNarrowed,
+ "Number of sdivs/srems whose width was decreased");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
-STATISTIC(NumUDivs, "Number of udivs whose width was decreased");
+STATISTIC(NumUDivURemsNarrowed,
+ "Number of udivs/urems whose width was decreased");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
STATISTIC(NumSExt, "Number of sext converted to zext");
@@ -126,7 +129,7 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getCondition())) return false;
- Constant *C = LVI->getConstant(S->getCondition(), S->getParent(), S);
+ Constant *C = LVI->getConstant(S->getCondition(), S);
if (!C) return false;
ConstantInt *CI = dyn_cast<ConstantInt>(C);
@@ -283,7 +286,7 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
if (isa<Constant>(Pointer)) return false;
- Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
+ Constant *C = LVI->getConstant(Pointer, I);
if (!C) return false;
++NumMemAccess;
@@ -301,18 +304,9 @@ static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
if (!C)
return false;
- // As a policy choice, we choose not to waste compile time on anything where
- // the comparison is testing local values. While LVI can sometimes reason
- // about such cases, it's not its primary purpose. We do make sure to do
- // the block local query for uses from terminator instructions, but that's
- // handled in the code for each terminator. As an exception, we allow phi
- // nodes, for which LVI can thread the condition into predecessors.
- auto *I = dyn_cast<Instruction>(Op0);
- if (I && I->getParent() == Cmp->getParent() && !isa<PHINode>(I))
- return false;
-
LazyValueInfo::Tristate Result =
- LVI->getPredicateAt(Cmp->getPredicate(), Op0, C, Cmp);
+ LVI->getPredicateAt(Cmp->getPredicate(), Op0, C, Cmp,
+ /*UseBlockValue=*/true);
if (Result == LazyValueInfo::Unknown)
return false;
@@ -336,15 +330,6 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
Value *Cond = I->getCondition();
BasicBlock *BB = I->getParent();
- // If the condition was defined in same block as the switch then LazyValueInfo
- // currently won't say anything useful about it, though in theory it could.
- if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
- return false;
-
- // If the switch is unreachable then trying to improve it is a waste of time.
- pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
- if (PB == PE) return false;
-
// Analyse each switch case in turn.
bool Changed = false;
DenseMap<BasicBlock*, int> SuccessorsCount;
@@ -357,35 +342,9 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
-
- // Check to see if the switch condition is equal to/not equal to the case
- // value on every incoming edge, equal/not equal being the same each time.
- LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- // Is the switch condition equal to the case value?
- LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
- Cond, Case, *PI,
- BB, SI);
- // Give up on this case if nothing is known.
- if (Value == LazyValueInfo::Unknown) {
- State = LazyValueInfo::Unknown;
- break;
- }
-
- // If this was the first edge to be visited, record that all other edges
- // need to give the same result.
- if (PI == PB) {
- State = Value;
- continue;
- }
-
- // If this case is known to fire for some edges and known not to fire for
- // others then there is nothing we can do - give up.
- if (Value != State) {
- State = LazyValueInfo::Unknown;
- break;
- }
- }
+ LazyValueInfo::Tristate State =
+ LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I,
+ /* UseBlockValue */ true);
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
@@ -429,10 +388,8 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
// See if we can prove that the given binary op intrinsic will not overflow.
static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) {
- ConstantRange LRange = LVI->getConstantRange(
- BO->getLHS(), BO->getParent(), BO);
- ConstantRange RRange = LVI->getConstantRange(
- BO->getRHS(), BO->getParent(), BO);
+ ConstantRange LRange = LVI->getConstantRange(BO->getLHS(), BO);
+ ConstantRange RRange = LVI->getConstantRange(BO->getRHS(), BO);
ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
BO->getBinaryOp(), RRange, BO->getNoWrapKind());
return NWRegion.contains(LRange);
@@ -532,8 +489,6 @@ static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) {
/// Infer nonnull attributes for the arguments at the specified callsite.
static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
- SmallVector<unsigned, 4> ArgNos;
- unsigned ArgNo = 0;
if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) {
if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
@@ -549,6 +504,8 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
}
}
+ bool Changed = false;
+
// Deopt bundle operands are intended to capture state with minimal
// perturbance of the code otherwise. If we can find a constant value for
// any such operand and remove a use of the original value, that's
@@ -557,22 +514,22 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
// idiomatically, appear along rare conditional paths, it's reasonable likely
// we may have a conditional fact with which LVI can fold.
if (auto DeoptBundle = CB.getOperandBundle(LLVMContext::OB_deopt)) {
- bool Progress = false;
for (const Use &ConstU : DeoptBundle->Inputs) {
Use &U = const_cast<Use&>(ConstU);
Value *V = U.get();
if (V->getType()->isVectorTy()) continue;
if (isa<Constant>(V)) continue;
- Constant *C = LVI->getConstant(V, CB.getParent(), &CB);
+ Constant *C = LVI->getConstant(V, &CB);
if (!C) continue;
U.set(C);
- Progress = true;
+ Changed = true;
}
- if (Progress)
- return true;
}
+ SmallVector<unsigned, 4> ArgNos;
+ unsigned ArgNo = 0;
+
for (Value *V : CB.args()) {
PointerType *Type = dyn_cast<PointerType>(V->getType());
// Try to mark pointer typed parameters as non-null. We skip the
@@ -590,7 +547,7 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
assert(ArgNo == CB.arg_size() && "sanity check");
if (ArgNos.empty())
- return false;
+ return Changed;
AttributeList AS = CB.getAttributes();
LLVMContext &Ctx = CB.getContext();
@@ -601,13 +558,79 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
return true;
}
-static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
- Constant *Zero = ConstantInt::get(SDI->getType(), 0);
- for (Value *O : SDI->operands()) {
- auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI);
- if (Result != LazyValueInfo::True)
- return false;
+static bool isNonNegative(Value *V, LazyValueInfo *LVI, Instruction *CxtI) {
+ Constant *Zero = ConstantInt::get(V->getType(), 0);
+ auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, V, Zero, CxtI);
+ return Result == LazyValueInfo::True;
+}
+
+static bool isNonPositive(Value *V, LazyValueInfo *LVI, Instruction *CxtI) {
+ Constant *Zero = ConstantInt::get(V->getType(), 0);
+ auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SLE, V, Zero, CxtI);
+ return Result == LazyValueInfo::True;
+}
+
+enum class Domain { NonNegative, NonPositive, Unknown };
+
+Domain getDomain(Value *V, LazyValueInfo *LVI, Instruction *CxtI) {
+ if (isNonNegative(V, LVI, CxtI))
+ return Domain::NonNegative;
+ if (isNonPositive(V, LVI, CxtI))
+ return Domain::NonPositive;
+ return Domain::Unknown;
+}
+
+/// Try to shrink a sdiv/srem's width down to the smallest power of two that's
+/// sufficient to contain its operands.
+static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+ assert(Instr->getOpcode() == Instruction::SDiv ||
+ Instr->getOpcode() == Instruction::SRem);
+ if (Instr->getType()->isVectorTy())
+ return false;
+
+ // Find the smallest power of two bitwidth that's sufficient to hold Instr's
+ // operands.
+ unsigned OrigWidth = Instr->getType()->getIntegerBitWidth();
+
+ // What is the smallest bit width that can accomodate the entire value ranges
+ // of both of the operands?
+ std::array<Optional<ConstantRange>, 2> CRs;
+ unsigned MinSignedBits = 0;
+ for (auto I : zip(Instr->operands(), CRs)) {
+ std::get<1>(I) = LVI->getConstantRange(std::get<0>(I), Instr);
+ MinSignedBits = std::max(std::get<1>(I)->getMinSignedBits(), MinSignedBits);
}
+
+ // sdiv/srem is UB if divisor is -1 and divident is INT_MIN, so unless we can
+ // prove that such a combination is impossible, we need to bump the bitwidth.
+ if (CRs[1]->contains(APInt::getAllOnesValue(OrigWidth)) &&
+ CRs[0]->contains(
+ APInt::getSignedMinValue(MinSignedBits).sextOrSelf(OrigWidth)))
+ ++MinSignedBits;
+
+ // Don't shrink below 8 bits wide.
+ unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MinSignedBits), 8);
+
+ // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+ // two.
+ if (NewWidth >= OrigWidth)
+ return false;
+
+ ++NumSDivSRemsNarrowed;
+ IRBuilder<> B{Instr};
+ auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
+ auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy,
+ Instr->getName() + ".lhs.trunc");
+ auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy,
+ Instr->getName() + ".rhs.trunc");
+ auto *BO = B.CreateBinOp(Instr->getOpcode(), LHS, RHS, Instr->getName());
+ auto *Sext = B.CreateSExt(BO, Instr->getType(), Instr->getName() + ".sext");
+ if (auto *BinOp = dyn_cast<BinaryOperator>(BO))
+ if (BinOp->getOpcode() == Instruction::SDiv)
+ BinOp->setIsExact(Instr->isExact());
+
+ Instr->replaceAllUsesWith(Sext);
+ Instr->eraseFromParent();
return true;
}
@@ -621,21 +644,23 @@ static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
// Find the smallest power of two bitwidth that's sufficient to hold Instr's
// operands.
- auto OrigWidth = Instr->getType()->getIntegerBitWidth();
- ConstantRange OperandRange(OrigWidth, /*isFullSet=*/false);
+
+ // What is the smallest bit width that can accomodate the entire value ranges
+ // of both of the operands?
+ unsigned MaxActiveBits = 0;
for (Value *Operand : Instr->operands()) {
- OperandRange = OperandRange.unionWith(
- LVI->getConstantRange(Operand, Instr->getParent()));
+ ConstantRange CR = LVI->getConstantRange(Operand, Instr);
+ MaxActiveBits = std::max(CR.getActiveBits(), MaxActiveBits);
}
// Don't shrink below 8 bits wide.
- unsigned NewWidth = std::max<unsigned>(
- PowerOf2Ceil(OperandRange.getUnsignedMax().getActiveBits()), 8);
+ unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
+
// NewWidth might be greater than OrigWidth if OrigWidth is not a power of
// two.
- if (NewWidth >= OrigWidth)
+ if (NewWidth >= Instr->getType()->getIntegerBitWidth())
return false;
- ++NumUDivs;
+ ++NumUDivURemsNarrowed;
IRBuilder<> B{Instr};
auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy,
@@ -654,52 +679,135 @@ static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
}
static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
+ assert(SDI->getOpcode() == Instruction::SRem);
+ if (SDI->getType()->isVectorTy())
return false;
+ struct Operand {
+ Value *V;
+ Domain D;
+ };
+ std::array<Operand, 2> Ops;
+
+ for (const auto I : zip(Ops, SDI->operands())) {
+ Operand &Op = std::get<0>(I);
+ Op.V = std::get<1>(I);
+ Op.D = getDomain(Op.V, LVI, SDI);
+ if (Op.D == Domain::Unknown)
+ return false;
+ }
+
+ // We know domains of both of the operands!
++NumSRems;
- auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
- SDI->getName(), SDI);
- BO->setDebugLoc(SDI->getDebugLoc());
- SDI->replaceAllUsesWith(BO);
+
+ // We need operands to be non-negative, so negate each one that isn't.
+ for (Operand &Op : Ops) {
+ if (Op.D == Domain::NonNegative)
+ continue;
+ auto *BO =
+ BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", SDI);
+ BO->setDebugLoc(SDI->getDebugLoc());
+ Op.V = BO;
+ }
+
+ auto *URem =
+ BinaryOperator::CreateURem(Ops[0].V, Ops[1].V, SDI->getName(), SDI);
+ URem->setDebugLoc(SDI->getDebugLoc());
+
+ Value *Res = URem;
+
+ // If the divident was non-positive, we need to negate the result.
+ if (Ops[0].D == Domain::NonPositive)
+ Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
+
+ SDI->replaceAllUsesWith(Res);
SDI->eraseFromParent();
- // Try to process our new urem.
- processUDivOrURem(BO, LVI);
+ // Try to simplify our new urem.
+ processUDivOrURem(URem, LVI);
return true;
}
/// See if LazyValueInfo's ability to exploit edge conditions or range
-/// information is sufficient to prove the both operands of this SDiv are
-/// positive. If this is the case, replace the SDiv with a UDiv. Even for local
+/// information is sufficient to prove the signs of both operands of this SDiv.
+/// If this is the case, replace the SDiv with a UDiv. Even for local
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
+ assert(SDI->getOpcode() == Instruction::SDiv);
+ if (SDI->getType()->isVectorTy())
return false;
+ struct Operand {
+ Value *V;
+ Domain D;
+ };
+ std::array<Operand, 2> Ops;
+
+ for (const auto I : zip(Ops, SDI->operands())) {
+ Operand &Op = std::get<0>(I);
+ Op.V = std::get<1>(I);
+ Op.D = getDomain(Op.V, LVI, SDI);
+ if (Op.D == Domain::Unknown)
+ return false;
+ }
+
+ // We know domains of both of the operands!
++NumSDivs;
- auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
- SDI->getName(), SDI);
- BO->setDebugLoc(SDI->getDebugLoc());
- BO->setIsExact(SDI->isExact());
- SDI->replaceAllUsesWith(BO);
+
+ // We need operands to be non-negative, so negate each one that isn't.
+ for (Operand &Op : Ops) {
+ if (Op.D == Domain::NonNegative)
+ continue;
+ auto *BO =
+ BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", SDI);
+ BO->setDebugLoc(SDI->getDebugLoc());
+ Op.V = BO;
+ }
+
+ auto *UDiv =
+ BinaryOperator::CreateUDiv(Ops[0].V, Ops[1].V, SDI->getName(), SDI);
+ UDiv->setDebugLoc(SDI->getDebugLoc());
+ UDiv->setIsExact(SDI->isExact());
+
+ Value *Res = UDiv;
+
+ // If the operands had two different domains, we need to negate the result.
+ if (Ops[0].D != Ops[1].D)
+ Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
+
+ SDI->replaceAllUsesWith(Res);
SDI->eraseFromParent();
// Try to simplify our new udiv.
- processUDivOrURem(BO, LVI);
+ processUDivOrURem(UDiv, LVI);
return true;
}
+static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+ assert(Instr->getOpcode() == Instruction::SDiv ||
+ Instr->getOpcode() == Instruction::SRem);
+ if (Instr->getType()->isVectorTy())
+ return false;
+
+ if (Instr->getOpcode() == Instruction::SDiv)
+ if (processSDiv(Instr, LVI))
+ return true;
+
+ if (Instr->getOpcode() == Instruction::SRem)
+ if (processSRem(Instr, LVI))
+ return true;
+
+ return narrowSDivOrSRem(Instr, LVI);
+}
+
static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy())
return false;
- Constant *Zero = ConstantInt::get(SDI->getType(), 0);
- if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, SDI->getOperand(0), Zero, SDI) !=
- LazyValueInfo::True)
+ if (!isNonNegative(SDI->getOperand(0), LVI, SDI))
return false;
++NumAShrs;
@@ -719,9 +827,7 @@ static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
Value *Base = SDI->getOperand(0);
- Constant *Zero = ConstantInt::get(Base->getType(), 0);
- if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, Base, Zero, SDI) !=
- LazyValueInfo::True)
+ if (!isNonNegative(Base, LVI, SDI))
return false;
++NumSExt;
@@ -748,14 +854,12 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
if (NSW && NUW)
return false;
- BasicBlock *BB = BinOp->getParent();
-
Instruction::BinaryOps Opcode = BinOp->getOpcode();
Value *LHS = BinOp->getOperand(0);
Value *RHS = BinOp->getOperand(1);
- ConstantRange LRange = LVI->getConstantRange(LHS, BB, BinOp);
- ConstantRange RRange = LVI->getConstantRange(RHS, BB, BinOp);
+ ConstantRange LRange = LVI->getConstantRange(LHS, BinOp);
+ ConstantRange RRange = LVI->getConstantRange(RHS, BinOp);
bool Changed = false;
bool NewNUW = false, NewNSW = false;
@@ -783,7 +887,6 @@ static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
// Pattern match (and lhs, C) where C includes a superset of bits which might
// be set in lhs. This is a common truncation idiom created by instcombine.
- BasicBlock *BB = BinOp->getParent();
Value *LHS = BinOp->getOperand(0);
ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1));
if (!RHS || !RHS->getValue().isMask())
@@ -792,7 +895,7 @@ static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
// We can only replace the AND with LHS based on range info if the range does
// not include undef.
ConstantRange LRange =
- LVI->getConstantRange(LHS, BB, BinOp, /*UndefAllowed=*/false);
+ LVI->getConstantRange(LHS, BinOp, /*UndefAllowed=*/false);
if (!LRange.getUnsignedMax().ule(RHS->getValue()))
return false;
@@ -804,7 +907,7 @@ static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
- if (Constant *C = LVI->getConstant(V, At->getParent(), At))
+ if (Constant *C = LVI->getConstant(V, At))
return C;
// TODO: The following really should be sunk inside LVI's core algorithm, or
@@ -858,10 +961,8 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
BBChanged |= processCallSite(cast<CallBase>(*II), LVI);
break;
case Instruction::SRem:
- BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
- break;
case Instruction::SDiv:
- BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
+ BBChanged |= processSDivOrSRem(cast<BinaryOperator>(II), LVI);
break;
case Instruction::UDiv:
case Instruction::URem:
@@ -929,11 +1030,19 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
- if (!Changed)
- return PreservedAnalyses::all();
PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LazyValueAnalysis>();
+ if (!Changed) {
+ PA = PreservedAnalyses::all();
+ } else {
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LazyValueAnalysis>();
+ }
+
+ // Keeping LVI alive is expensive, both because it uses a lot of memory, and
+ // because invalidating values in LVI is expensive. While CVP does preserve
+ // LVI, we know that passes after JumpThreading+CVP will not need the result
+ // of this analysis, so we forcefully discard it early.
+ PA.abandon<LazyValueAnalysis>();
return PA;
}
diff --git a/llvm/lib/Transforms/Scalar/DCE.cpp b/llvm/lib/Transforms/Scalar/DCE.cpp
index 28947482e303..d55adf7c2d12 100644
--- a/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -32,57 +32,10 @@ using namespace llvm;
#define DEBUG_TYPE "dce"
-STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
STATISTIC(DCEEliminated, "Number of insts removed");
DEBUG_COUNTER(DCECounter, "dce-transform",
"Controls which instructions are eliminated");
-namespace {
- //===--------------------------------------------------------------------===//
- // DeadInstElimination pass implementation
- //
-struct DeadInstElimination : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- DeadInstElimination() : FunctionPass(ID) {
- initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
-
- bool Changed = false;
- for (auto &BB : F) {
- for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
- Instruction *Inst = &*DI++;
- if (isInstructionTriviallyDead(Inst, TLI)) {
- if (!DebugCounter::shouldExecute(DCECounter))
- continue;
- salvageDebugInfo(*Inst);
- Inst->eraseFromParent();
- Changed = true;
- ++DIEEliminated;
- }
- }
- }
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- }
-};
-}
-
-char DeadInstElimination::ID = 0;
-INITIALIZE_PASS(DeadInstElimination, "die",
- "Dead Instruction Elimination", false, false)
-
-Pass *llvm::createDeadInstEliminationPass() {
- return new DeadInstElimination();
-}
-
//===--------------------------------------------------------------------===//
// RedundantDbgInstElimination pass implementation
//
@@ -116,6 +69,18 @@ Pass *llvm::createRedundantDbgInstEliminationPass() {
return new RedundantDbgInstElimination();
}
+PreservedAnalyses
+RedundantDbgInstEliminationPass::run(Function &F, FunctionAnalysisManager &AM) {
+ bool Changed = false;
+ for (auto &BB : F)
+ Changed |= RemoveRedundantDbgInstrs(&BB);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
//===--------------------------------------------------------------------===//
// DeadCodeElimination pass implementation
//
@@ -178,7 +143,7 @@ static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
}
PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ if (!eliminateDeadCode(F, &AM.getResult<TargetLibraryAnalysis>(F)))
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -197,13 +162,14 @@ struct DCELegacyPass : public FunctionPass {
if (skipFunction(F))
return false;
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
return eliminateDeadCode(F, TLI);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesCFG();
}
};
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index e58db03225ee..2979225c6016 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -84,10 +84,13 @@ STATISTIC(NumFastStores, "Number of stores deleted");
STATISTIC(NumFastOther, "Number of other instrs removed");
STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
STATISTIC(NumModifiedStores, "Number of stores modified");
-STATISTIC(NumNoopStores, "Number of noop stores deleted");
STATISTIC(NumCFGChecks, "Number of stores modified");
STATISTIC(NumCFGTries, "Number of stores modified");
STATISTIC(NumCFGSuccess, "Number of stores modified");
+STATISTIC(NumGetDomMemoryDefPassed,
+ "Number of times a valid candidate is returned from getDomMemoryDef");
+STATISTIC(NumDomMemDefChecks,
+ "Number iterations check for reads in getDomMemoryDef");
DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
"Controls which MemoryDefs are eliminated.");
@@ -103,19 +106,42 @@ EnablePartialStoreMerging("enable-dse-partial-store-merging",
cl::desc("Enable partial store merging in DSE"));
static cl::opt<bool>
- EnableMemorySSA("enable-dse-memoryssa", cl::init(false), cl::Hidden,
+ EnableMemorySSA("enable-dse-memoryssa", cl::init(true), cl::Hidden,
cl::desc("Use the new MemorySSA-backed DSE."));
static cl::opt<unsigned>
- MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(100), cl::Hidden,
+ MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden,
cl::desc("The number of memory instructions to scan for "
"dead store elimination (default = 100)"));
+static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
+ "dse-memoryssa-walklimit", cl::init(90), cl::Hidden,
+ cl::desc("The maximum number of steps while walking upwards to find "
+ "MemoryDefs that may be killed (default = 90)"));
+
+static cl::opt<unsigned> MemorySSAPartialStoreLimit(
+ "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
+ cl::desc("The maximum number candidates that only partially overwrite the "
+ "killing MemoryDef to consider"
+ " (default = 5)"));
static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
"dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
"other stores per basic block (default = 5000)"));
+static cl::opt<unsigned> MemorySSASameBBStepCost(
+ "dse-memoryssa-samebb-cost", cl::init(1), cl::Hidden,
+ cl::desc(
+ "The cost of a step in the same basic block as the killing MemoryDef"
+ "(default = 1)"));
+
+static cl::opt<unsigned>
+ MemorySSAOtherBBStepCost("dse-memoryssa-otherbb-cost", cl::init(5),
+ cl::Hidden,
+ cl::desc("The cost of a step in a different basic "
+ "block than the killing MemoryDef"
+ "(default = 5)"));
+
static cl::opt<unsigned> MemorySSAPathCheckLimit(
"dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
cl::desc("The maximum number of blocks to check when trying to prove that "
@@ -203,11 +229,13 @@ static bool hasAnalyzableMemoryWrite(Instruction *I,
case Intrinsic::memset:
case Intrinsic::memmove:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_inline:
case Intrinsic::memcpy_element_unordered_atomic:
case Intrinsic::memmove_element_unordered_atomic:
case Intrinsic::memset_element_unordered_atomic:
case Intrinsic::init_trampoline:
case Intrinsic::lifetime_end:
+ case Intrinsic::masked_store:
return true;
}
}
@@ -231,23 +259,23 @@ static bool hasAnalyzableMemoryWrite(Instruction *I,
/// Return a Location stored to by the specified instruction. If isRemovable
/// returns true, this function and getLocForRead completely describe the memory
/// operations for this instruction.
-static MemoryLocation getLocForWrite(Instruction *Inst) {
-
+static MemoryLocation getLocForWrite(Instruction *Inst,
+ const TargetLibraryInfo &TLI) {
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return MemoryLocation::get(SI);
- if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
- // memcpy/memmove/memset.
- MemoryLocation Loc = MemoryLocation::getForDest(MI);
- return Loc;
- }
+ // memcpy/memmove/memset.
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
+ return MemoryLocation::getForDest(MI);
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
switch (II->getIntrinsicID()) {
default:
return MemoryLocation(); // Unhandled intrinsic.
case Intrinsic::init_trampoline:
- return MemoryLocation(II->getArgOperand(0));
+ return MemoryLocation::getAfter(II->getArgOperand(0));
+ case Intrinsic::masked_store:
+ return MemoryLocation::getForArgument(II, 1, TLI);
case Intrinsic::lifetime_end: {
uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
return MemoryLocation(II->getArgOperand(1), Len);
@@ -257,7 +285,7 @@ static MemoryLocation getLocForWrite(Instruction *Inst) {
if (auto *CB = dyn_cast<CallBase>(Inst))
// All the supported TLI functions so far happen to have dest as their
// first argument.
- return MemoryLocation(CB->getArgOperand(0));
+ return MemoryLocation::getAfter(CB->getArgOperand(0));
return MemoryLocation();
}
@@ -294,11 +322,13 @@ static bool isRemovable(Instruction *I) {
case Intrinsic::memset:
case Intrinsic::memmove:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_inline:
// Don't remove volatile memory intrinsics.
return !cast<MemIntrinsic>(II)->isVolatile();
case Intrinsic::memcpy_element_unordered_atomic:
case Intrinsic::memmove_element_unordered_atomic:
case Intrinsic::memset_element_unordered_atomic:
+ case Intrinsic::masked_store:
return true;
}
}
@@ -344,9 +374,10 @@ static bool isShortenableAtTheBeginning(Instruction *I) {
}
/// Return the pointer that is being written to.
-static Value *getStoredPointerOperand(Instruction *I) {
+static Value *getStoredPointerOperand(Instruction *I,
+ const TargetLibraryInfo &TLI) {
//TODO: factor this to reuse getLocForWrite
- MemoryLocation Loc = getLocForWrite(I);
+ MemoryLocation Loc = getLocForWrite(I, TLI);
assert(Loc.Ptr &&
"unable to find pointer written for analyzable instruction?");
// TODO: most APIs don't expect const Value *
@@ -372,31 +403,59 @@ enum OverwriteResult {
OW_Complete,
OW_End,
OW_PartialEarlierWithFullLater,
+ OW_MaybePartial,
OW_Unknown
};
} // end anonymous namespace
-/// Return 'OW_Complete' if a store to the 'Later' location completely
-/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
-/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
-/// beginning of the 'Earlier' location is overwritten by 'Later'.
-/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
-/// overwritten by a latter (smaller) store which doesn't write outside the big
-/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
-static OverwriteResult isOverwrite(const MemoryLocation &Later,
- const MemoryLocation &Earlier,
- const DataLayout &DL,
- const TargetLibraryInfo &TLI,
- int64_t &EarlierOff, int64_t &LaterOff,
- Instruction *DepWrite,
- InstOverlapIntervalsTy &IOL,
- AliasAnalysis &AA,
- const Function *F) {
+/// Check if two instructions are masked stores that completely
+/// overwrite one another. More specifically, \p Later has to
+/// overwrite \p Earlier.
+template <typename AATy>
+static OverwriteResult isMaskedStoreOverwrite(const Instruction *Later,
+ const Instruction *Earlier,
+ AATy &AA) {
+ const auto *IIL = dyn_cast<IntrinsicInst>(Later);
+ const auto *IIE = dyn_cast<IntrinsicInst>(Earlier);
+ if (IIL == nullptr || IIE == nullptr)
+ return OW_Unknown;
+ if (IIL->getIntrinsicID() != Intrinsic::masked_store ||
+ IIE->getIntrinsicID() != Intrinsic::masked_store)
+ return OW_Unknown;
+ // Pointers.
+ Value *LP = IIL->getArgOperand(1)->stripPointerCasts();
+ Value *EP = IIE->getArgOperand(1)->stripPointerCasts();
+ if (LP != EP && !AA.isMustAlias(LP, EP))
+ return OW_Unknown;
+ // Masks.
+ // TODO: check that Later's mask is a superset of the Earlier's mask.
+ if (IIL->getArgOperand(3) != IIE->getArgOperand(3))
+ return OW_Unknown;
+ return OW_Complete;
+}
+
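// Editorial sketch, not part of the patch: the shape of the masked-store check
// above in miniature. Two masked stores shadow each other only when their
// destination pointers must-alias and, for now, they use the identical mask
// value; the plain pointer/mask equality below stands in for the BatchAA
// must-alias query. MaskedStoreDesc is a made-up illustration type.
struct MaskedStoreDesc {
  const void *Ptr;  // destination pointer operand
  const void *Mask; // mask operand
};
constexpr bool maskedStoreShadows(const MaskedStoreDesc &Later,
                                  const MaskedStoreDesc &Earlier) {
  return Later.Ptr == Earlier.Ptr && Later.Mask == Earlier.Mask;
}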
+/// Return 'OW_Complete' if a store to the 'Later' location (by \p LaterI
+/// instruction) completely overwrites a store to the 'Earlier' location
+/// (by \p EarlierI instruction).
+/// Return OW_MaybePartial if \p Later does not completely overwrite
+/// \p Earlier, but they both write to the same underlying object. In that
+/// case, use isPartialOverwrite to check if \p Later partially overwrites
+/// \p Earlier. Returns 'OW_Unknown' if nothing can be determined.
+template <typename AATy>
+static OverwriteResult
+isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
+ const MemoryLocation &Later, const MemoryLocation &Earlier,
+ const DataLayout &DL, const TargetLibraryInfo &TLI,
+ int64_t &EarlierOff, int64_t &LaterOff, AATy &AA,
+ const Function *F) {
// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
// get imprecise values here, though (except for unknown sizes).
- if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise())
- return OW_Unknown;
+ if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
+ // Masked stores have imprecise locations, but we can reason about them
+ // to some extent.
+ return isMaskedStoreOverwrite(LaterI, EarlierI, AA);
+ }
const uint64_t LaterSize = Later.Size.getValue();
const uint64_t EarlierSize = Earlier.Size.getValue();
@@ -415,8 +474,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// Check to see if the later store is to the entire object (either a global,
// an alloca, or a byval/inalloca argument). If so, then it clearly
// overwrites any other store to the same object.
- const Value *UO1 = GetUnderlyingObject(P1, DL),
- *UO2 = GetUnderlyingObject(P2, DL);
+ const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2);
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
@@ -441,26 +499,59 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (BP1 != BP2)
return OW_Unknown;
- // The later store completely overlaps the earlier store if:
- //
- // 1. Both start at the same offset and the later one's size is greater than
- // or equal to the earlier one's, or
- //
- // |--earlier--|
- // |-- later --|
- //
- // 2. The earlier store has an offset greater than the later offset, but which
- // still lies completely within the later store.
- //
- // |--earlier--|
- // |----- later ------|
+ // The later access completely overlaps the earlier store if and only if
+ // both start and end of the earlier one is "inside" the later one:
+ // |<->|--earlier--|<->|
+ // |-------later-------|
+ // Accesses may overlap if and only if start of one of them is "inside"
+ // another one:
+ // |<->|--earlier--|<----->|
+ // |-------later-------|
+ // OR
+ // |----- earlier -----|
+ // |<->|---later---|<----->|
//
// We have to be careful here as *Off is signed while *.Size is unsigned.
- if (EarlierOff >= LaterOff &&
- LaterSize >= EarlierSize &&
- uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
- return OW_Complete;
+ // Check if the earlier access starts "not before" the later one.
+ if (EarlierOff >= LaterOff) {
+ // If the earlier access ends "not after" the later access then the earlier
+ // one is completely overwritten by the later one.
+ if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
+ return OW_Complete;
+ // If start of the earlier access is "before" end of the later access then
+ // accesses overlap.
+ else if ((uint64_t)(EarlierOff - LaterOff) < LaterSize)
+ return OW_MaybePartial;
+ }
+ // If start of the later access is "before" end of the earlier access then
+ // accesses overlap.
+ else if ((uint64_t)(LaterOff - EarlierOff) < EarlierSize) {
+ return OW_MaybePartial;
+ }
+
+ // Can reach here only if accesses are known not to overlap. There is no
+ // dedicated code to indicate no overlap so signal "unknown".
+ return OW_Unknown;
+}
+
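// Editorial sketch, not part of the patch: the containment test above on
// concrete byte ranges, using the same signed-offset/unsigned-size mix.
// Earlier = [4,8) inside Later = [0,8) is a complete overwrite, while
// Earlier = [4,12) against Later = [0,8) merely overlaps and would be
// classified OW_MaybePartial. Standalone snippet; the names are made up.
#include <cstdint>
constexpr bool laterContainsEarlier(int64_t LaterOff, uint64_t LaterSize,
                                    int64_t EarlierOff, uint64_t EarlierSize) {
  // Earlier must start no earlier than Later and end no later than Later.
  return EarlierOff >= LaterOff &&
         uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize;
}
static_assert(laterContainsEarlier(0, 8, 4, 4), "Earlier [4,8) in Later [0,8)");
static_assert(!laterContainsEarlier(0, 8, 4, 8), "Earlier [4,12) only overlaps");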
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later'.
+/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
+/// overwritten by a later (smaller) store which doesn't write outside the big
+/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
+/// NOTE: This function must only be called if both \p Later and \p Earlier
+/// write to the same underlying object with valid \p EarlierOff and \p
+/// LaterOff.
+static OverwriteResult isPartialOverwrite(const MemoryLocation &Later,
+ const MemoryLocation &Earlier,
+ int64_t EarlierOff, int64_t LaterOff,
+ Instruction *DepWrite,
+ InstOverlapIntervalsTy &IOL) {
+ const uint64_t LaterSize = Later.Size.getValue();
+ const uint64_t EarlierSize = Earlier.Size.getValue();
// We may now overlap, although the overlap is not complete. There might also
// be other incomplete overlaps, and together, they might cover the complete
// earlier write.
@@ -627,11 +718,10 @@ static bool isPossibleSelfRead(Instruction *Inst,
/// modified between the first and the second instruction.
/// Precondition: Second instruction must be dominated by the first
/// instruction.
-static bool memoryIsNotModifiedBetween(Instruction *FirstI,
- Instruction *SecondI,
- AliasAnalysis *AA,
- const DataLayout &DL,
- DominatorTree *DT) {
+template <typename AATy>
+static bool
+memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI, AATy &AA,
+ const DataLayout &DL, DominatorTree *DT) {
// Do a backwards scan through the CFG from SecondI to FirstI. Look for
// instructions which can modify the memory location accessed by SecondI.
//
@@ -680,7 +770,7 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI,
for (; BI != EI; ++BI) {
Instruction *I = &*BI;
if (I->mayWriteToMemory() && I != SecondI)
- if (isModSet(AA->getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
+ if (isModSet(AA.getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
return false;
}
if (B != FirstBB) {
@@ -736,10 +826,9 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
MapVector<Instruction *, bool> &ThrowableInst) {
bool MadeChange = false;
- MemoryLocation Loc = MemoryLocation(F->getOperand(0));
+ MemoryLocation Loc = MemoryLocation::getAfter(F->getOperand(0));
SmallVector<BasicBlock *, 16> Blocks;
Blocks.push_back(F->getParent());
- const DataLayout &DL = F->getModule()->getDataLayout();
while (!Blocks.empty()) {
BasicBlock *BB = Blocks.pop_back_val();
@@ -755,7 +844,7 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
break;
Value *DepPointer =
- GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);
+ getUnderlyingObject(getStoredPointerOperand(Dependency, *TLI));
// Check for aliasing.
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
@@ -795,7 +884,7 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
const DataLayout &DL, AliasAnalysis *AA,
const TargetLibraryInfo *TLI,
const Function *F) {
- const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
+ const Value *UnderlyingPointer = getUnderlyingObject(LoadedLoc.Ptr);
// A constant can't be in the dead pointer set.
if (isa<Constant>(UnderlyingPointer))
@@ -848,7 +937,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Treat byval or inalloca arguments the same, stores to them are dead at the
// end of the function.
for (Argument &AI : BB.getParent()->args())
- if (AI.hasPassPointeeByValueAttr())
+ if (AI.hasPassPointeeByValueCopyAttr())
DeadStackObjects.insert(&AI);
const DataLayout &DL = BB.getModule()->getDataLayout();
@@ -861,7 +950,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
// See through pointer-to-pointer bitcasts
SmallVector<const Value *, 4> Pointers;
- GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
+ getUnderlyingObjects(getStoredPointerOperand(&*BBI, *TLI), Pointers);
// Stores to stack values are valid candidates for removal.
bool AllDead = true;
@@ -980,8 +1069,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
}
static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
- int64_t &EarlierSize, int64_t LaterOffset,
- int64_t LaterSize, bool IsOverwriteEnd) {
+ uint64_t &EarlierSize, int64_t LaterOffset,
+ uint64_t LaterSize, bool IsOverwriteEnd) {
// TODO: base this on the target vector size so that if the earlier
  // store was too small to get vector writes anyway then it's likely
// a good idea to shorten it
@@ -1036,16 +1125,23 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
static bool tryToShortenEnd(Instruction *EarlierWrite,
OverlapIntervalsTy &IntervalMap,
- int64_t &EarlierStart, int64_t &EarlierSize) {
+ int64_t &EarlierStart, uint64_t &EarlierSize) {
if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
return false;
OverlapIntervalsTy::iterator OII = --IntervalMap.end();
int64_t LaterStart = OII->second;
- int64_t LaterSize = OII->first - LaterStart;
+ uint64_t LaterSize = OII->first - LaterStart;
+
+ assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
- if (LaterStart > EarlierStart && LaterStart < EarlierStart + EarlierSize &&
- LaterStart + LaterSize >= EarlierStart + EarlierSize) {
+ if (LaterStart > EarlierStart &&
+ // Note: "LaterStart - EarlierStart" is known to be positive due to
+ // preceding check.
+ (uint64_t)(LaterStart - EarlierStart) < EarlierSize &&
+ // Note: "EarlierSize - (uint64_t)(LaterStart - EarlierStart)" is known to
+        // be non-negative due to preceding checks.
+ LaterSize >= EarlierSize - (uint64_t)(LaterStart - EarlierStart)) {
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
LaterSize, true)) {
IntervalMap.erase(OII);
@@ -1057,16 +1153,23 @@ static bool tryToShortenEnd(Instruction *EarlierWrite,
static bool tryToShortenBegin(Instruction *EarlierWrite,
OverlapIntervalsTy &IntervalMap,
- int64_t &EarlierStart, int64_t &EarlierSize) {
+ int64_t &EarlierStart, uint64_t &EarlierSize) {
if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
return false;
OverlapIntervalsTy::iterator OII = IntervalMap.begin();
int64_t LaterStart = OII->second;
- int64_t LaterSize = OII->first - LaterStart;
+ uint64_t LaterSize = OII->first - LaterStart;
- if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
- assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
+ assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
+
+ if (LaterStart <= EarlierStart &&
+      // Note: "EarlierStart - LaterStart" is known to be non-negative due to
+ // preceding check.
+ LaterSize > (uint64_t)(EarlierStart - LaterStart)) {
+ // Note: "LaterSize - (uint64_t)(EarlierStart - LaterStart)" is known to be
+ // positive due to preceding checks.
+ assert(LaterSize - (uint64_t)(EarlierStart - LaterStart) < EarlierSize &&
"Should have been handled as OW_Complete");
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
LaterSize, false)) {
@@ -1077,18 +1180,18 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
return false;
}
-static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
- const DataLayout &DL,
- InstOverlapIntervalsTy &IOL) {
+static bool removePartiallyOverlappedStores(const DataLayout &DL,
+ InstOverlapIntervalsTy &IOL,
+ const TargetLibraryInfo &TLI) {
bool Changed = false;
for (auto OI : IOL) {
Instruction *EarlierWrite = OI.first;
- MemoryLocation Loc = getLocForWrite(EarlierWrite);
+ MemoryLocation Loc = getLocForWrite(EarlierWrite, TLI);
assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
const Value *Ptr = Loc.Ptr->stripPointerCasts();
int64_t EarlierStart = 0;
- int64_t EarlierSize = int64_t(Loc.Size.getValue());
+ uint64_t EarlierSize = Loc.Size.getValue();
GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
OverlapIntervalsTy &IntervalMap = OI.second;
Changed |=
@@ -1118,7 +1221,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
isRemovable(SI) &&
- memoryIsNotModifiedBetween(DepLoad, SI, AA, DL, DT)) {
+ memoryIsNotModifiedBetween(DepLoad, SI, *AA, DL, DT)) {
LLVM_DEBUG(
dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
@@ -1134,10 +1237,10 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
Instruction *UnderlyingPointer =
- dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL));
+ dyn_cast<Instruction>(getUnderlyingObject(SI->getPointerOperand()));
if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
- memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA, DL, DT)) {
+ memoryIsNotModifiedBetween(UnderlyingPointer, SI, *AA, DL, DT)) {
LLVM_DEBUG(
dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
<< *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
@@ -1150,11 +1253,10 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
return false;
}
-static Constant *
-tryToMergePartialOverlappingStores(StoreInst *Earlier, StoreInst *Later,
- int64_t InstWriteOffset,
- int64_t DepWriteOffset, const DataLayout &DL,
- AliasAnalysis *AA, DominatorTree *DT) {
+template <typename AATy>
+static Constant *tryToMergePartialOverlappingStores(
+ StoreInst *Earlier, StoreInst *Later, int64_t InstWriteOffset,
+ int64_t DepWriteOffset, const DataLayout &DL, AATy &AA, DominatorTree *DT) {
if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
@@ -1245,7 +1347,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
continue;
// Figure out what location is being stored to.
- MemoryLocation Loc = getLocForWrite(Inst);
+ MemoryLocation Loc = getLocForWrite(Inst, *TLI);
// If we didn't get a useful location, fail.
if (!Loc.Ptr)
@@ -1269,7 +1371,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
Instruction *DepWrite = InstDep.getInst();
if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
break;
- MemoryLocation DepLoc = getLocForWrite(DepWrite);
+ MemoryLocation DepLoc = getLocForWrite(DepWrite, *TLI);
// If we didn't get a useful location, or if it isn't a size, bail out.
if (!DepLoc.Ptr)
break;
@@ -1289,7 +1391,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// to it is dead along the unwind edge. Otherwise, we need to preserve
// the store.
if (LastThrowing && DepWrite->comesBefore(LastThrowing)) {
- const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
+ const Value *Underlying = getUnderlyingObject(DepLoc.Ptr);
bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
if (!IsStoreDeadOnUnwind) {
// We're looking for a call to an allocation function
@@ -1311,9 +1413,13 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset,
- InstWriteOffset, DepWrite, IOL, *AA,
+ OverwriteResult OR = isOverwrite(Inst, DepWrite, Loc, DepLoc, DL, *TLI,
+ DepWriteOffset, InstWriteOffset, *AA,
BB.getParent());
+ if (OR == OW_MaybePartial)
+ OR = isPartialOverwrite(Loc, DepLoc, DepWriteOffset, InstWriteOffset,
+ DepWrite, IOL);
+
if (OR == OW_Complete) {
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
<< "\n KILLER: " << *Inst << '\n');
@@ -1334,8 +1440,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
"when partial-overwrite "
"tracking is enabled");
// The overwrite result is known, so these must be known, too.
- int64_t EarlierSize = DepLoc.Size.getValue();
- int64_t LaterSize = Loc.Size.getValue();
+ uint64_t EarlierSize = DepLoc.Size.getValue();
+ uint64_t LaterSize = Loc.Size.getValue();
bool IsOverwriteEnd = (OR == OW_End);
MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
InstWriteOffset, LaterSize, IsOverwriteEnd);
@@ -1344,7 +1450,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
auto *Earlier = dyn_cast<StoreInst>(DepWrite);
auto *Later = dyn_cast<StoreInst>(Inst);
if (Constant *C = tryToMergePartialOverlappingStores(
- Earlier, Later, InstWriteOffset, DepWriteOffset, DL, AA,
+ Earlier, Later, InstWriteOffset, DepWriteOffset, DL, *AA,
DT)) {
auto *SI = new StoreInst(
C, Earlier->getPointerOperand(), false, Earlier->getAlign(),
@@ -1391,7 +1497,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
}
if (EnablePartialOverwriteTracking)
- MadeChange |= removePartiallyOverlappedStores(AA, DL, IOL);
+ MadeChange |= removePartiallyOverlappedStores(DL, IOL, *TLI);
// If this block ends in a return, unwind, or unreachable, all allocas are
// dead at its end, which means stores to them are also dead.
@@ -1425,20 +1531,21 @@ namespace {
// in between both MemoryDefs. A bit more concretely:
//
// For all MemoryDefs StartDef:
-// 1. Get the next dominating clobbering MemoryDef (DomAccess) by walking
+// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking
// upwards.
-// 2. Check that there are no reads between DomAccess and the StartDef by
-// checking all uses starting at DomAccess and walking until we see StartDef.
-// 3. For each found DomDef, check that:
-// 1. There are no barrier instructions between DomDef and StartDef (like
+// 2. Check that there are no reads between EarlierAccess and the StartDef by
+// checking all uses starting at EarlierAccess and walking until we see
+// StartDef.
+// 3. For each found CurrentDef, check that:
+// 1. There are no barrier instructions between CurrentDef and StartDef (like
// throws or stores with ordering constraints).
-// 2. StartDef is executed whenever DomDef is executed.
-// 3. StartDef completely overwrites DomDef.
-// 4. Erase DomDef from the function and MemorySSA.
+// 2. StartDef is executed whenever CurrentDef is executed.
+// 3. StartDef completely overwrites CurrentDef.
+// 4. Erase CurrentDef from the function and MemorySSA.
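// Editorial illustration, not part of the patch: a source-level pattern the
// walk above is intended to catch. 'escape' is a made-up external function;
// the first store to x (the CurrentDef) is killed by the second one (the
// StartDef) because no instruction in between reads x.
extern void escape(int *);
inline void deadStoreExample() {
  int x;
  x = 1; // completely overwritten below with no intervening read: removable
  x = 2; // the killing MemoryDef
  escape(&x);
}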
-// Returns true if \p M is an intrisnic that does not read or write memory.
-bool isNoopIntrinsic(MemoryUseOrDef *M) {
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(M->getMemoryInst())) {
+// Returns true if \p I is an intrinsic that does not read or write memory.
+bool isNoopIntrinsic(Instruction *I) {
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
@@ -1481,7 +1588,7 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
return true;
// Skip intrinsics that do not really read or modify memory.
- if (isNoopIntrinsic(D))
+ if (isNoopIntrinsic(D->getMemoryInst()))
return true;
return false;
@@ -1490,10 +1597,21 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
struct DSEState {
Function &F;
AliasAnalysis &AA;
+
+ /// The single BatchAA instance that is used to cache AA queries. It will
+ /// not be invalidated over the whole run. This is safe, because:
+ /// 1. Only memory writes are removed, so the alias cache for memory
+ /// locations remains valid.
+ /// 2. No new instructions are added (only instructions removed), so cached
+ /// information for a deleted value cannot be accessed by a re-used new
+ /// value pointer.
+ BatchAAResults BatchAA;
+
MemorySSA &MSSA;
DominatorTree &DT;
PostDominatorTree &PDT;
const TargetLibraryInfo &TLI;
+ const DataLayout &DL;
// All MemoryDefs that potentially could kill other MemDefs.
SmallVector<MemoryDef *, 64> MemDefs;
@@ -1501,10 +1619,11 @@ struct DSEState {
SmallPtrSet<MemoryAccess *, 4> SkipStores;
// Keep track of all of the objects that are invisible to the caller before
// the function returns.
- SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
+ // SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
+ DenseMap<const Value *, bool> InvisibleToCallerBeforeRet;
// Keep track of all of the objects that are invisible to the caller after
// the function returns.
- SmallPtrSet<const Value *, 16> InvisibleToCallerAfterRet;
+ DenseMap<const Value *, bool> InvisibleToCallerAfterRet;
// Keep track of blocks with throwing instructions not modeled in MemorySSA.
SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
// Post-order numbers for each basic block. Used to figure out if memory
@@ -1517,7 +1636,8 @@ struct DSEState {
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
- : F(F), AA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI) {}
+ : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
+ DL(F.getParent()->getDataLayout()) {}
static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
@@ -1537,42 +1657,54 @@ struct DSEState {
if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
(State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
State.MemDefs.push_back(MD);
-
- // Track whether alloca and alloca-like objects are visible in the
- // caller before and after the function returns. Alloca objects are
- // invalid in the caller, so they are neither visible before or after
- // the function returns.
- if (isa<AllocaInst>(&I)) {
- State.InvisibleToCallerBeforeRet.insert(&I);
- State.InvisibleToCallerAfterRet.insert(&I);
- }
-
- // For alloca-like objects we need to check if they are captured before
- // the function returns and if the return might capture the object.
- if (isAllocLikeFn(&I, &TLI)) {
- bool CapturesBeforeRet = PointerMayBeCaptured(&I, false, true);
- if (!CapturesBeforeRet) {
- State.InvisibleToCallerBeforeRet.insert(&I);
- if (!PointerMayBeCaptured(&I, true, false))
- State.InvisibleToCallerAfterRet.insert(&I);
- }
- }
}
}
// Treat byval or inalloca arguments the same as Allocas, stores to them are
// dead at the end of the function.
for (Argument &AI : F.args())
- if (AI.hasPassPointeeByValueAttr()) {
+ if (AI.hasPassPointeeByValueCopyAttr()) {
// For byval, the caller doesn't know the address of the allocation.
if (AI.hasByValAttr())
- State.InvisibleToCallerBeforeRet.insert(&AI);
- State.InvisibleToCallerAfterRet.insert(&AI);
+ State.InvisibleToCallerBeforeRet.insert({&AI, true});
+ State.InvisibleToCallerAfterRet.insert({&AI, true});
}
return State;
}
+ bool isInvisibleToCallerAfterRet(const Value *V) {
+ if (isa<AllocaInst>(V))
+ return true;
+ auto I = InvisibleToCallerAfterRet.insert({V, false});
+ if (I.second) {
+ if (!isInvisibleToCallerBeforeRet(V)) {
+ I.first->second = false;
+ } else {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (Inst && isAllocLikeFn(Inst, &TLI))
+ I.first->second = !PointerMayBeCaptured(V, true, false);
+ }
+ }
+ return I.first->second;
+ }
+
+ bool isInvisibleToCallerBeforeRet(const Value *V) {
+ if (isa<AllocaInst>(V))
+ return true;
+ auto I = InvisibleToCallerBeforeRet.insert({V, false});
+ if (I.second) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (Inst && isAllocLikeFn(Inst, &TLI))
+ // NOTE: This could be made more precise by PointerMayBeCapturedBefore
+ // with the killing MemoryDef. But we refrain from doing so for now to
+ // limit compile-time and this does not cause any changes to the number
+ // of stores removed on a large test set in practice.
+ I.first->second = !PointerMayBeCaptured(V, false, true);
+ }
+ return I.first->second;
+ }
+
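// Editorial illustration, not part of the patch, of what the two queries above
// distinguish. The local is an alloca and stays invisible to the caller both
// before and after the return, so the unread store into it is removable; the
// malloc'ed buffer escapes via the return value, so the store into it is
// visible after the return and must stay. Standalone snippet.
#include <cstdlib>
inline int *visibilityExample() {
  int Local;
  Local = 1; // removable: Local never escapes and is never read again
  int *Ret = static_cast<int *>(std::malloc(sizeof(int)));
  if (Ret)
    *Ret = 2; // kept: the caller can read it through the returned pointer
  return Ret;
}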
Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
if (!I->mayWriteToMemory())
return None;
@@ -1581,6 +1713,11 @@ struct DSEState {
return {MemoryLocation::getForDest(MTI)};
if (auto *CB = dyn_cast<CallBase>(I)) {
+ // If the functions may write to memory we do not know about, bail out.
+ if (!CB->onlyAccessesArgMemory() &&
+ !CB->onlyAccessesInaccessibleMemOrArgMem())
+ return None;
+
LibFunc LF;
if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
switch (LF) {
@@ -1588,19 +1725,29 @@ struct DSEState {
case LibFunc_strncpy:
case LibFunc_strcat:
case LibFunc_strncat:
- return {MemoryLocation(CB->getArgOperand(0))};
+ return {MemoryLocation::getAfter(CB->getArgOperand(0))};
default:
break;
}
}
+ switch (CB->getIntrinsicID()) {
+ case Intrinsic::init_trampoline:
+ return {MemoryLocation::getAfter(CB->getArgOperand(0))};
+ case Intrinsic::masked_store:
+ return {MemoryLocation::getForArgument(CB, 1, TLI)};
+ default:
+ break;
+ }
return None;
}
return MemoryLocation::getOrNone(I);
}
- /// Returns true if \p Use completely overwrites \p DefLoc.
- bool isCompleteOverwrite(MemoryLocation DefLoc, Instruction *UseInst) const {
+ /// Returns true if \p UseInst completely overwrites \p DefLoc
+ /// (stored by \p DefInst).
+ bool isCompleteOverwrite(const MemoryLocation &DefLoc, Instruction *DefInst,
+ Instruction *UseInst) {
// UseInst has a MemoryDef associated in MemorySSA. It's possible for a
// MemoryDef to not write to memory, e.g. a volatile load is modeled as a
// MemoryDef.
@@ -1612,14 +1759,10 @@ struct DSEState {
return false;
int64_t InstWriteOffset, DepWriteOffset;
- auto CC = getLocForWriteEx(UseInst);
- InstOverlapIntervalsTy IOL;
-
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- return CC &&
- isOverwrite(*CC, DefLoc, DL, TLI, DepWriteOffset, InstWriteOffset,
- UseInst, IOL, AA, &F) == OW_Complete;
+ if (auto CC = getLocForWriteEx(UseInst))
+ return isOverwrite(UseInst, DefInst, *CC, DefLoc, DL, TLI, DepWriteOffset,
+ InstWriteOffset, BatchAA, &F) == OW_Complete;
+ return false;
}
/// Returns true if \p Def is not read before returning from the function.
@@ -1650,10 +1793,12 @@ struct DSEState {
}
MemoryAccess *UseAccess = WorkList[I];
- if (isa<MemoryPhi>(UseAccess)) {
- PushMemUses(UseAccess);
- continue;
- }
+ // Simply adding the users of MemoryPhi to the worklist is not enough,
+ // because we might miss read clobbers in different iterations of a loop,
+ // for example.
+ // TODO: Add support for phi translation to handle the loop case.
+ if (isa<MemoryPhi>(UseAccess))
+ return false;
// TODO: Checking for aliasing is expensive. Consider reducing the amount
// of times this is called and/or caching it.
@@ -1682,7 +1827,8 @@ struct DSEState {
if (auto *CB = dyn_cast<CallBase>(I)) {
if (isFreeCall(I, &TLI))
- return {std::make_pair(MemoryLocation(CB->getArgOperand(0)), true)};
+ return {std::make_pair(MemoryLocation::getAfter(CB->getArgOperand(0)),
+ true)};
}
return None;
@@ -1696,9 +1842,10 @@ struct DSEState {
isFreeCall(I, &TLI);
}
- /// Returns true if \p MaybeTerm is a memory terminator for the same
- /// underlying object as \p DefLoc.
- bool isMemTerminator(MemoryLocation DefLoc, Instruction *MaybeTerm) const {
+ /// Returns true if \p MaybeTerm is a memory terminator for \p Loc from
+ /// instruction \p AccessI.
+ bool isMemTerminator(const MemoryLocation &Loc, Instruction *AccessI,
+ Instruction *MaybeTerm) {
Optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
getLocForTerminator(MaybeTerm);
@@ -1707,15 +1854,31 @@ struct DSEState {
// If the terminator is a free-like call, all accesses to the underlying
// object can be considered terminated.
+ if (getUnderlyingObject(Loc.Ptr) !=
+ getUnderlyingObject(MaybeTermLoc->first.Ptr))
+ return false;
+
+ auto TermLoc = MaybeTermLoc->first;
if (MaybeTermLoc->second) {
- DataLayout DL = MaybeTerm->getParent()->getModule()->getDataLayout();
- DefLoc = MemoryLocation(GetUnderlyingObject(DefLoc.Ptr, DL));
+ const Value *LocUO = getUnderlyingObject(Loc.Ptr);
+ return BatchAA.isMustAlias(TermLoc.Ptr, LocUO);
}
- return AA.isMustAlias(MaybeTermLoc->first, DefLoc);
+ int64_t InstWriteOffset, DepWriteOffset;
+ return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, DL, TLI,
+ DepWriteOffset, InstWriteOffset, BatchAA,
+ &F) == OW_Complete;
}
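// Editorial illustration, not part of the patch: free() is a free-like call
// and therefore a memory terminator for the whole allocation, so the store
// feeding it directly is dead. Standalone snippet.
#include <cstdlib>
inline void memTerminatorExample() {
  int *P = static_cast<int *>(std::malloc(sizeof(int)));
  if (P) {
    *P = 42;      // dead: never read before the allocation is terminated
    std::free(P); // terminates all accesses to the underlying object
  }
}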
  // Returns true if \p UseInst may read from \p DefLoc.
- bool isReadClobber(MemoryLocation DefLoc, Instruction *UseInst) const {
+ bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
+ if (isNoopIntrinsic(UseInst))
+ return false;
+
+ // Monotonic or weaker atomic stores can be re-ordered and do not need to be
+    // treated as read clobbers.
+ if (auto SI = dyn_cast<StoreInst>(UseInst))
+ return isStrongerThan(SI->getOrdering(), AtomicOrdering::Monotonic);
+
if (!UseInst->mayReadFromMemory())
return false;
@@ -1723,88 +1886,246 @@ struct DSEState {
if (CB->onlyAccessesInaccessibleMemory())
return false;
- ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc);
- // If necessary, perform additional analysis.
- if (isRefSet(MR))
- MR = AA.callCapturesBefore(UseInst, DefLoc, &DT);
- return isRefSet(MR);
+ // NOTE: For calls, the number of stores removed could be slightly improved
+    // by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that proved to
+ // be expensive compared to the benefits in practice. For now, avoid more
+ // expensive analysis to limit compile-time.
+ return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
+ }
+
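// Editorial illustration, not part of the patch: the relaxed (monotonic)
// atomic store below can be reordered and is not treated as a read clobber,
// so it need not block eliminating the first plain store to X. Standalone
// snippet; assumes Flag and X refer to distinct objects.
#include <atomic>
inline void relaxedAtomicExample(std::atomic<int> &Flag, int &X) {
  X = 1;                                    // removable: overwritten below
  Flag.store(1, std::memory_order_relaxed); // monotonic: no read clobber
  X = 2;                                    // the killing store
}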
+ /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
+ /// loop. In particular, this guarantees that it only references a single
+ /// MemoryLocation during execution of the containing function.
+ bool IsGuaranteedLoopInvariant(Value *Ptr) {
+ auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) {
+ Ptr = Ptr->stripPointerCasts();
+ if (auto *I = dyn_cast<Instruction>(Ptr)) {
+ if (isa<AllocaInst>(Ptr))
+ return true;
+
+ if (isAllocLikeFn(I, &TLI))
+ return true;
+
+ return false;
+ }
+ return true;
+ };
+
+ Ptr = Ptr->stripPointerCasts();
+ if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) &&
+ GEP->hasAllConstantIndices();
+ }
+ return IsGuaranteedLoopInvariantBase(Ptr);
}
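// Editorial illustration, not part of the patch: &A[2], a GEP with
// all-constant indices off the argument, names the same location on every
// iteration, while &A[i] names a different one each time, so only the former
// counts as guaranteed loop invariant for the check above.
inline void loopInvariantExample(int *A, int N) {
  for (int i = 0; i < N; ++i) {
    A[2] = i; // guaranteed loop-invariant destination
    A[i] = 0; // loop-varying destination
  }
}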
- // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
- // read access between them or on any other path to a function exit block if
- // \p DefLoc is not accessible after the function returns. If there is no such
- // MemoryDef, return None. The returned value may not (completely) overwrite
- // \p DefLoc. Currently we bail out when we encounter an aliasing MemoryUse
- // (read).
+ // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with
+ // no read access between them or on any other path to a function exit block
+ // if \p DefLoc is not accessible after the function returns. If there is no
+ // such MemoryDef, return None. The returned value may not (completely)
+ // overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
+ // MemoryUse (read).
Optional<MemoryAccess *>
- getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
- MemoryLocation DefLoc, bool DefVisibleToCallerBeforeRet,
- bool DefVisibleToCallerAfterRet, int &ScanLimit) const {
- MemoryAccess *DomAccess;
+ getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
+ const MemoryLocation &DefLoc, const Value *DefUO,
+ unsigned &ScanLimit, unsigned &WalkerStepLimit,
+ bool IsMemTerm, unsigned &PartialLimit) {
+ if (ScanLimit == 0 || WalkerStepLimit == 0) {
+ LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
+ return None;
+ }
+
+ MemoryAccess *Current = StartAccess;
+ Instruction *KillingI = KillingDef->getMemoryInst();
bool StepAgain;
- LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current
- << "\n");
- // Find the next clobbering Mod access for DefLoc, starting at Current.
+ LLVM_DEBUG(dbgs() << " trying to get dominating access\n");
+
+ // Find the next clobbering Mod access for DefLoc, starting at StartAccess.
+ Optional<MemoryLocation> CurrentLoc;
do {
StepAgain = false;
+ LLVM_DEBUG({
+ dbgs() << " visiting " << *Current;
+ if (!MSSA.isLiveOnEntryDef(Current) && isa<MemoryUseOrDef>(Current))
+ dbgs() << " (" << *cast<MemoryUseOrDef>(Current)->getMemoryInst()
+ << ")";
+ dbgs() << "\n";
+ });
+
// Reached TOP.
- if (MSSA.isLiveOnEntryDef(Current))
+ if (MSSA.isLiveOnEntryDef(Current)) {
+ LLVM_DEBUG(dbgs() << " ... found LiveOnEntryDef\n");
return None;
+ }
+
+ // Cost of a step. Accesses in the same block are more likely to be valid
+ // candidates for elimination, hence consider them cheaper.
+ unsigned StepCost = KillingDef->getBlock() == Current->getBlock()
+ ? MemorySSASameBBStepCost
+ : MemorySSAOtherBBStepCost;
+ if (WalkerStepLimit <= StepCost) {
+ LLVM_DEBUG(dbgs() << " ... hit walker step limit\n");
+ return None;
+ }
+ WalkerStepLimit -= StepCost;
+ // Return for MemoryPhis. They cannot be eliminated directly and the
+ // caller is responsible for traversing them.
if (isa<MemoryPhi>(Current)) {
- DomAccess = Current;
- break;
+ LLVM_DEBUG(dbgs() << " ... found MemoryPhi\n");
+ return Current;
+ }
+
+ // Below, check if CurrentDef is a valid candidate to be eliminated by
+ // KillingDef. If it is not, check the next candidate.
+ MemoryDef *CurrentDef = cast<MemoryDef>(Current);
+ Instruction *CurrentI = CurrentDef->getMemoryInst();
+
+ if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+ StepAgain = true;
+ Current = CurrentDef->getDefiningAccess();
+ continue;
}
- MemoryUseOrDef *CurrentUD = cast<MemoryUseOrDef>(Current);
- // Look for access that clobber DefLoc.
- DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD,
- DefLoc);
- if (MSSA.isLiveOnEntryDef(DomAccess))
+
+ // Before we try to remove anything, check for any extra throwing
+ // instructions that block us from DSEing
+ if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
+ LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
return None;
+ }
- if (isa<MemoryPhi>(DomAccess))
- break;
+ // Check for anything that looks like it will be a barrier to further
+ // removal
+ if (isDSEBarrier(DefUO, CurrentI)) {
+ LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
+ return None;
+ }
+
+ // If Current is known to be on path that reads DefLoc or is a read
+ // clobber, bail out, as the path is not profitable. We skip this check
+ // for intrinsic calls, because the code knows how to handle memcpy
+ // intrinsics.
+ if (!isa<IntrinsicInst>(CurrentI) && isReadClobber(DefLoc, CurrentI))
+ return None;
+
+ // Quick check if there are direct uses that are read-clobbers.
+ if (any_of(Current->uses(), [this, &DefLoc, StartAccess](Use &U) {
+ if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(U.getUser()))
+ return !MSSA.dominates(StartAccess, UseOrDef) &&
+ isReadClobber(DefLoc, UseOrDef->getMemoryInst());
+ return false;
+ })) {
+ LLVM_DEBUG(dbgs() << " ... found a read clobber\n");
+ return None;
+ }
+
+ // If Current cannot be analyzed or is not removable, check the next
+ // candidate.
+ if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) {
+ StepAgain = true;
+ Current = CurrentDef->getDefiningAccess();
+ continue;
+ }
+
+ // If Current does not have an analyzable write location, skip it
+ CurrentLoc = getLocForWriteEx(CurrentI);
+ if (!CurrentLoc) {
+ StepAgain = true;
+ Current = CurrentDef->getDefiningAccess();
+ continue;
+ }
- // Check if we can skip DomDef for DSE.
- MemoryDef *DomDef = dyn_cast<MemoryDef>(DomAccess);
- if (DomDef && canSkipDef(DomDef, DefVisibleToCallerBeforeRet)) {
+ // AliasAnalysis does not account for loops. Limit elimination to
+ // candidates for which we can guarantee they always store to the same
+ // memory location and not multiple locations in a loop.
+ if (Current->getBlock() != KillingDef->getBlock() &&
+ !IsGuaranteedLoopInvariant(const_cast<Value *>(CurrentLoc->Ptr))) {
StepAgain = true;
- Current = DomDef->getDefiningAccess();
+ Current = CurrentDef->getDefiningAccess();
+ WalkerStepLimit -= 1;
+ continue;
}
+ if (IsMemTerm) {
+ // If the killing def is a memory terminator (e.g. lifetime.end), check
+ // the next candidate if the current Current does not write the same
+ // underlying object as the terminator.
+ if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) {
+ StepAgain = true;
+ Current = CurrentDef->getDefiningAccess();
+ }
+ continue;
+ } else {
+ int64_t InstWriteOffset, DepWriteOffset;
+ auto OR = isOverwrite(KillingI, CurrentI, DefLoc, *CurrentLoc, DL, TLI,
+ DepWriteOffset, InstWriteOffset, BatchAA, &F);
+ // If Current does not write to the same object as KillingDef, check
+ // the next candidate.
+ if (OR == OW_Unknown) {
+ StepAgain = true;
+ Current = CurrentDef->getDefiningAccess();
+ } else if (OR == OW_MaybePartial) {
+ // If KillingDef only partially overwrites Current, check the next
+ // candidate if the partial step limit is exceeded. This aggressively
+ // limits the number of candidates for partial store elimination,
+ // which are less likely to be removable in the end.
+ if (PartialLimit <= 1) {
+ StepAgain = true;
+ Current = CurrentDef->getDefiningAccess();
+ WalkerStepLimit -= 1;
+ continue;
+ }
+ PartialLimit -= 1;
+ }
+ }
} while (StepAgain);
// Accesses to objects accessible after the function returns can only be
// eliminated if the access is killed along all paths to the exit. Collect
// the blocks with killing (=completely overwriting MemoryDefs) and check if
- // they cover all paths from DomAccess to any function exit.
- SmallPtrSet<BasicBlock *, 16> KillingBlocks = {KillingDef->getBlock()};
- LLVM_DEBUG({
- dbgs() << " Checking for reads of " << *DomAccess;
- if (isa<MemoryDef>(DomAccess))
- dbgs() << " (" << *cast<MemoryDef>(DomAccess)->getMemoryInst() << ")\n";
- else
- dbgs() << ")\n";
- });
+ // they cover all paths from EarlierAccess to any function exit.
+ SmallPtrSet<Instruction *, 16> KillingDefs;
+ KillingDefs.insert(KillingDef->getMemoryInst());
+ MemoryAccess *EarlierAccess = Current;
+ Instruction *EarlierMemInst =
+ cast<MemoryDef>(EarlierAccess)->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " Checking for reads of " << *EarlierAccess << " ("
+ << *EarlierMemInst << ")\n");
SmallSetVector<MemoryAccess *, 32> WorkList;
auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
for (Use &U : Acc->uses())
WorkList.insert(cast<MemoryAccess>(U.getUser()));
};
- PushMemUses(DomAccess);
-
- // Check if DomDef may be read.
+ PushMemUses(EarlierAccess);
+
+ // Optimistically collect all accesses for reads. If we do not find any
+ // read clobbers, add them to the cache.
+ SmallPtrSet<MemoryAccess *, 16> KnownNoReads;
+ if (!EarlierMemInst->mayReadFromMemory())
+ KnownNoReads.insert(EarlierAccess);
+ // Check if EarlierDef may be read.
for (unsigned I = 0; I < WorkList.size(); I++) {
MemoryAccess *UseAccess = WorkList[I];
LLVM_DEBUG(dbgs() << " " << *UseAccess);
- if (--ScanLimit == 0) {
+ // Bail out if the number of accesses to check exceeds the scan limit.
+ if (ScanLimit < (WorkList.size() - I)) {
LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
return None;
}
+ --ScanLimit;
+ NumDomMemDefChecks++;
+ KnownNoReads.insert(UseAccess);
if (isa<MemoryPhi>(UseAccess)) {
+ if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) {
+ return DT.properlyDominates(KI->getParent(),
+ UseAccess->getBlock());
+ })) {
+ LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing block\n");
+ continue;
+ }
LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
PushMemUses(UseAccess);
continue;
@@ -1813,29 +2134,45 @@ struct DSEState {
Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
- if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess))) {
- LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
- PushMemUses(UseAccess);
+ if (any_of(KillingDefs, [this, UseInst](Instruction *KI) {
+ return DT.dominates(KI, UseInst);
+ })) {
+ LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing def\n");
continue;
}
    // A memory terminator kills all preceding MemoryDefs and all succeeding
    // MemoryAccesses. We do not have to check its users.
- if (isMemTerminator(DefLoc, UseInst))
+ if (isMemTerminator(*CurrentLoc, EarlierMemInst, UseInst)) {
+ LLVM_DEBUG(
+ dbgs()
+ << " ... skipping, memterminator invalidates following accesses\n");
continue;
+ }
+
+ if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess)->getMemoryInst())) {
+ LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
+ PushMemUses(UseAccess);
+ continue;
+ }
+
+ if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(DefUO)) {
+ LLVM_DEBUG(dbgs() << " ... found throwing instruction\n");
+ return None;
+ }
// Uses which may read the original MemoryDef mean we cannot eliminate the
// original MD. Stop walk.
- if (isReadClobber(DefLoc, UseInst)) {
+ if (isReadClobber(*CurrentLoc, UseInst)) {
LLVM_DEBUG(dbgs() << " ... found read clobber\n");
return None;
}
- // For the KillingDef and DomAccess we only have to check if it reads the
- // memory location.
+ // For the KillingDef and EarlierAccess we only have to check if it reads
+ // the memory location.
// TODO: It would probably be better to check for self-reads before
// calling the function.
- if (KillingDef == UseAccess || DomAccess == UseAccess) {
+ if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
continue;
}
@@ -1844,22 +2181,23 @@ struct DSEState {
// the original location. Otherwise we have to check uses of *all*
// MemoryDefs we discover, including non-aliasing ones. Otherwise we might
// miss cases like the following
- // 1 = Def(LoE) ; <----- DomDef stores [0,1]
+ // 1 = Def(LoE) ; <----- EarlierDef stores [0,1]
// 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
// Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
// (The Use points to the *first* Def it may alias)
// 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
// stores [0,1]
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
- if (isCompleteOverwrite(DefLoc, UseInst)) {
- if (DefVisibleToCallerAfterRet && UseAccess != DomAccess) {
+ if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) {
+ if (!isInvisibleToCallerAfterRet(DefUO) &&
+ UseAccess != EarlierAccess) {
BasicBlock *MaybeKillingBlock = UseInst->getParent();
if (PostOrderNumbers.find(MaybeKillingBlock)->second <
- PostOrderNumbers.find(DomAccess->getBlock())->second) {
+ PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
- LLVM_DEBUG(dbgs() << " ... found killing block "
- << MaybeKillingBlock->getName() << "\n");
- KillingBlocks.insert(MaybeKillingBlock);
+ LLVM_DEBUG(dbgs()
+ << " ... found killing def " << *UseInst << "\n");
+ KillingDefs.insert(UseInst);
}
}
} else
@@ -1868,11 +2206,15 @@ struct DSEState {
}
// For accesses to locations visible after the function returns, make sure
- // that the location is killed (=overwritten) along all paths from DomAccess
- // to the exit.
- if (DefVisibleToCallerAfterRet) {
+ // that the location is killed (=overwritten) along all paths from
+ // EarlierAccess to the exit.
+ if (!isInvisibleToCallerAfterRet(DefUO)) {
+ SmallPtrSet<BasicBlock *, 16> KillingBlocks;
+ for (Instruction *KD : KillingDefs)
+ KillingBlocks.insert(KD->getParent());
assert(!KillingBlocks.empty() &&
"Expected at least a single killing block");
+
// Find the common post-dominator of all killing blocks.
BasicBlock *CommonPred = *KillingBlocks.begin();
for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
@@ -1883,23 +2225,19 @@ struct DSEState {
}
// If CommonPred is in the set of killing blocks, just check if it
- // post-dominates DomAccess.
+ // post-dominates EarlierAccess.
if (KillingBlocks.count(CommonPred)) {
- if (PDT.dominates(CommonPred, DomAccess->getBlock()))
- return {DomAccess};
+ if (PDT.dominates(CommonPred, EarlierAccess->getBlock()))
+ return {EarlierAccess};
return None;
}
- // If the common post-dominator does not post-dominate DomAccess, there
- // is a path from DomAccess to an exit not going through a killing block.
- if (PDT.dominates(CommonPred, DomAccess->getBlock())) {
+ // If the common post-dominator does not post-dominate EarlierAccess,
+ // there is a path from EarlierAccess to an exit not going through a
+ // killing block.
+ if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) {
SetVector<BasicBlock *> WorkList;
- // DomAccess's post-order number provides an upper bound of the blocks
- // on a path starting at DomAccess.
- unsigned UpperBound =
- PostOrderNumbers.find(DomAccess->getBlock())->second;
-
// If CommonPred is null, there are multiple exits from the function.
// They all have to be added to the worklist.
if (CommonPred)
@@ -1910,24 +2248,20 @@ struct DSEState {
NumCFGTries++;
// Check if all paths starting from an exit node go through one of the
- // killing blocks before reaching DomAccess.
+ // killing blocks before reaching EarlierAccess.
for (unsigned I = 0; I < WorkList.size(); I++) {
NumCFGChecks++;
BasicBlock *Current = WorkList[I];
if (KillingBlocks.count(Current))
continue;
- if (Current == DomAccess->getBlock())
+ if (Current == EarlierAccess->getBlock())
return None;
- // DomAccess is reachable from the entry, so we don't have to explore
- // unreachable blocks further.
+ // EarlierAccess is reachable from the entry, so we don't have to
+ // explore unreachable blocks further.
if (!DT.isReachableFromEntry(Current))
continue;
- unsigned CPO = PostOrderNumbers.find(Current)->second;
- // Current block is not on a path starting at DomAccess.
- if (CPO > UpperBound)
- continue;
for (BasicBlock *Pred : predecessors(Current))
WorkList.insert(Pred);
@@ -1935,13 +2269,14 @@ struct DSEState {
return None;
}
NumCFGSuccess++;
- return {DomAccess};
+ return {EarlierAccess};
}
return None;
}
- // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead.
- return {DomAccess};
+ // No aliasing MemoryUses of EarlierAccess found, EarlierAccess is
+ // potentially dead.
+ return {EarlierAccess};
}
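// Editorial illustration, not part of the patch: the post-dominance check
// above in source form. G is visible after the return, so the first store is
// only dead because both paths to the exit overwrite it; if the else-branch
// store were missing, one path would leave the value 1 observable and the
// store would have to stay. Standalone snippet; G is a made-up global.
extern int G;
inline void killedOnAllPathsExample(bool C) {
  G = 1; // dead: every path to the function exit overwrites it
  if (C)
    G = 2;
  else
    G = 3;
}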
// Delete dead memory defs
@@ -1986,11 +2321,11 @@ struct DSEState {
// checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may
// throw are handled during the walk from one def to the next.
bool mayThrowBetween(Instruction *SI, Instruction *NI,
- const Value *SILocUnd) const {
+ const Value *SILocUnd) {
// First see if we can ignore it by using the fact that SI is an
// alloca/alloca like object that is not visible to the caller during
// execution of the function.
- if (SILocUnd && InvisibleToCallerBeforeRet.count(SILocUnd))
+ if (SILocUnd && isInvisibleToCallerBeforeRet(SILocUnd))
return false;
if (SI->getParent() == NI->getParent())
@@ -2003,10 +2338,10 @@ struct DSEState {
// * A memory instruction that may throw and \p SI accesses a non-stack
// object.
    // * Atomic stores stronger than monotonic.
- bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) const {
+ bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) {
    // If NI may throw it acts as a barrier, unless we are writing to an
    // alloca/alloca-like object that does not escape.
- if (NI->mayThrow() && !InvisibleToCallerBeforeRet.count(SILocUnd))
+ if (NI->mayThrow() && !isInvisibleToCallerBeforeRet(SILocUnd))
return true;
// If NI is an atomic load/store stronger than monotonic, do not try to
@@ -2016,6 +2351,11 @@ struct DSEState {
return isStrongerThanMonotonic(LI->getOrdering());
if (auto *SI = dyn_cast<StoreInst>(NI))
return isStrongerThanMonotonic(SI->getOrdering());
+ if (auto *ARMW = dyn_cast<AtomicRMWInst>(NI))
+ return isStrongerThanMonotonic(ARMW->getOrdering());
+ if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(NI))
+ return isStrongerThanMonotonic(CmpXchg->getSuccessOrdering()) ||
+ isStrongerThanMonotonic(CmpXchg->getFailureOrdering());
llvm_unreachable("other instructions should be skipped in MemorySSA");
}
return false;
@@ -2024,40 +2364,37 @@ struct DSEState {
/// Eliminate writes to objects that are not visible in the caller and are not
/// accessed before returning from the function.
bool eliminateDeadWritesAtEndOfFunction() {
- const DataLayout &DL = F.getParent()->getDataLayout();
bool MadeChange = false;
LLVM_DEBUG(
dbgs()
<< "Trying to eliminate MemoryDefs at the end of the function\n");
for (int I = MemDefs.size() - 1; I >= 0; I--) {
MemoryDef *Def = MemDefs[I];
- if (SkipStores.find(Def) != SkipStores.end() ||
- !isRemovable(Def->getMemoryInst()))
+ if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst()))
+ continue;
+
+ Instruction *DefI = Def->getMemoryInst();
+ SmallVector<const Value *, 4> Pointers;
+ auto DefLoc = getLocForWriteEx(DefI);
+ if (!DefLoc)
+ continue;
+
+ // NOTE: Currently eliminating writes at the end of a function is limited
+ // to MemoryDefs with a single underlying object, to save compile-time. In
+ // practice it appears the case with multiple underlying objects is very
+ // uncommon. If it turns out to be important, we can use
+ // getUnderlyingObjects here instead.
+ const Value *UO = getUnderlyingObject(DefLoc->Ptr);
+ if (!UO || !isInvisibleToCallerAfterRet(UO))
continue;
- // TODO: Consider doing the underlying object check first, if it is
- // beneficial compile-time wise.
if (isWriteAtEndOfFunction(Def)) {
- Instruction *DefI = Def->getMemoryInst();
// See through pointer-to-pointer bitcasts
- SmallVector<const Value *, 4> Pointers;
- GetUnderlyingObjects(getLocForWriteEx(DefI)->Ptr, Pointers, DL);
-
LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
"of the function\n");
- bool CanKill = true;
- for (const Value *Pointer : Pointers) {
- if (!InvisibleToCallerAfterRet.count(Pointer)) {
- CanKill = false;
- break;
- }
- }
-
- if (CanKill) {
- deleteDeadInstruction(DefI);
- ++NumFastStores;
- MadeChange = true;
- }
+ deleteDeadInstruction(DefI);
+ ++NumFastStores;
+ MadeChange = true;
}
}
return MadeChange;
@@ -2065,17 +2402,53 @@ struct DSEState {
/// \returns true if \p Def is a no-op store, either because it
/// directly stores back a loaded value or stores zero to a calloced object.
- bool storeIsNoop(MemoryDef *Def, MemoryLocation DefLoc, const Value *DefUO) {
+ bool storeIsNoop(MemoryDef *Def, const MemoryLocation &DefLoc,
+ const Value *DefUO) {
StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
if (!Store)
return false;
if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
if (LoadI->getPointerOperand() == Store->getOperand(1)) {
+ // Get the defining access for the load.
auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
- // If both accesses share the same defining access, no instructions
- // between them can modify the memory location.
- return LoadAccess == Def->getDefiningAccess();
+ // Fast path: the defining accesses are the same.
+ if (LoadAccess == Def->getDefiningAccess())
+ return true;
+
+ // Look through phi accesses. Recursively scan all phi accesses by
+ // adding them to a worklist. Bail when we run into a memory def that
+ // does not match LoadAccess.
+ SetVector<MemoryAccess *> ToCheck;
+ MemoryAccess *Current =
+ MSSA.getWalker()->getClobberingMemoryAccess(Def);
+ // We don't want to bail when we run into the store memory def. But,
+ // the phi access may point to it. So, pretend like we've already
+ // checked it.
+ ToCheck.insert(Def);
+ ToCheck.insert(Current);
+ // Start at current (1) to simulate already having checked Def.
+ for (unsigned I = 1; I < ToCheck.size(); ++I) {
+ Current = ToCheck[I];
+ if (auto PhiAccess = dyn_cast<MemoryPhi>(Current)) {
+ // Check all the operands.
+ for (auto &Use : PhiAccess->incoming_values())
+ ToCheck.insert(cast<MemoryAccess>(&Use));
+ continue;
+ }
+
+ // If we found a memory def, bail. This happens when we have an
+ // unrelated write in between an otherwise noop store.
+ assert(isa<MemoryDef>(Current) &&
+ "Only MemoryDefs should reach here.");
+ // TODO: Skip no alias MemoryDefs that have no aliasing reads.
+ // We are searching for the definition of the store's destination.
+ // So, if that is the same definition as the load, then this is a
+ // noop. Otherwise, fail.
+ if (LoadAccess != Current)
+ return false;
+ }
+ return true;
}
}
@@ -2099,7 +2472,6 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT,
const TargetLibraryInfo &TLI) {
- const DataLayout &DL = F.getParent()->getDataLayout();
bool MadeChange = false;
DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
@@ -2110,7 +2482,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
continue;
Instruction *SI = KillingDef->getMemoryInst();
- auto MaybeSILoc = State.getLocForWriteEx(SI);
+ Optional<MemoryLocation> MaybeSILoc;
if (State.isMemTerminatorInst(SI))
MaybeSILoc = State.getLocForTerminator(SI).map(
[](const std::pair<MemoryLocation, bool> &P) { return P.first; });
@@ -2124,38 +2496,23 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
}
MemoryLocation SILoc = *MaybeSILoc;
assert(SILoc.Ptr && "SILoc should not be null");
- const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL);
-
- // Check if the store is a no-op.
- if (isRemovable(SI) && State.storeIsNoop(KillingDef, SILoc, SILocUnd)) {
- LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *SI << '\n');
- State.deleteDeadInstruction(SI);
- NumNoopStores++;
- MadeChange = true;
- continue;
- }
-
- Instruction *DefObj =
- const_cast<Instruction *>(dyn_cast<Instruction>(SILocUnd));
- bool DefVisibleToCallerBeforeRet =
- !State.InvisibleToCallerBeforeRet.count(SILocUnd);
- bool DefVisibleToCallerAfterRet =
- !State.InvisibleToCallerAfterRet.count(SILocUnd);
- if (DefObj && isAllocLikeFn(DefObj, &TLI)) {
- if (DefVisibleToCallerBeforeRet)
- DefVisibleToCallerBeforeRet =
- PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT);
- }
+ const Value *SILocUnd = getUnderlyingObject(SILoc.Ptr);
MemoryAccess *Current = KillingDef;
LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
<< *KillingDef << " (" << *SI << ")\n");
- int ScanLimit = MemorySSAScanLimit;
+ unsigned ScanLimit = MemorySSAScanLimit;
+ unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
+ unsigned PartialLimit = MemorySSAPartialStoreLimit;
// Worklist of MemoryAccesses that may be killed by KillingDef.
SetVector<MemoryAccess *> ToCheck;
- ToCheck.insert(KillingDef->getDefiningAccess());
+ if (SILocUnd)
+ ToCheck.insert(KillingDef->getDefiningAccess());
+
+ bool Shortend = false;
+ bool IsMemTerm = State.isMemTerminatorInst(SI);
// Check if MemoryAccesses in the worklist are killed by KillingDef.
for (unsigned I = 0; I < ToCheck.size(); I++) {
Current = ToCheck[I];
@@ -2163,22 +2520,22 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
continue;
Optional<MemoryAccess *> Next = State.getDomMemoryDef(
- KillingDef, Current, SILoc, DefVisibleToCallerBeforeRet,
- DefVisibleToCallerAfterRet, ScanLimit);
+ KillingDef, Current, SILoc, SILocUnd, ScanLimit, WalkerStepLimit,
+ IsMemTerm, PartialLimit);
if (!Next) {
LLVM_DEBUG(dbgs() << " finished walk\n");
continue;
}
- MemoryAccess *DomAccess = *Next;
- LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess);
- if (isa<MemoryPhi>(DomAccess)) {
+ MemoryAccess *EarlierAccess = *Next;
+ LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess);
+ if (isa<MemoryPhi>(EarlierAccess)) {
LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
- for (Value *V : cast<MemoryPhi>(DomAccess)->incoming_values()) {
+ for (Value *V : cast<MemoryPhi>(EarlierAccess)->incoming_values()) {
MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
BasicBlock *IncomingBlock = IncomingAccess->getBlock();
- BasicBlock *PhiBlock = DomAccess->getBlock();
+ BasicBlock *PhiBlock = EarlierAccess->getBlock();
// We only consider incoming MemoryAccesses that come before the
// MemoryPhi. Otherwise we could discover candidates that do not
@@ -2189,44 +2546,20 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
}
continue;
}
- MemoryDef *NextDef = dyn_cast<MemoryDef>(DomAccess);
+ auto *NextDef = cast<MemoryDef>(EarlierAccess);
Instruction *NI = NextDef->getMemoryInst();
LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
-
- // Before we try to remove anything, check for any extra throwing
- // instructions that block us from DSEing
- if (State.mayThrowBetween(SI, NI, SILocUnd)) {
- LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
- break;
- }
-
- // Check for anything that looks like it will be a barrier to further
- // removal
- if (State.isDSEBarrier(SILocUnd, NI)) {
- LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
- continue;
- }
-
ToCheck.insert(NextDef->getDefiningAccess());
-
- if (!hasAnalyzableMemoryWrite(NI, TLI)) {
- LLVM_DEBUG(dbgs() << " ... skip, cannot analyze def\n");
- continue;
- }
-
- if (!isRemovable(NI)) {
- LLVM_DEBUG(dbgs() << " ... skip, cannot remove def\n");
- continue;
- }
+ NumGetDomMemoryDefPassed++;
if (!DebugCounter::shouldExecute(MemorySSACounter))
continue;
MemoryLocation NILoc = *State.getLocForWriteEx(NI);
- if (State.isMemTerminatorInst(SI)) {
- const Value *NIUnd = GetUnderlyingObject(NILoc.Ptr, DL);
- if (!SILocUnd || SILocUnd != NIUnd)
+ if (IsMemTerm) {
+ const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
+ if (SILocUnd != NIUnd)
continue;
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
<< "\n KILLER: " << *SI << '\n');
@@ -2236,32 +2569,43 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
} else {
// Check if NI overwrites SI.
int64_t InstWriteOffset, DepWriteOffset;
- auto Iter = State.IOLs.insert(
- std::make_pair<BasicBlock *, InstOverlapIntervalsTy>(
- NI->getParent(), InstOverlapIntervalsTy()));
- auto &IOL = Iter.first->second;
- OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset,
- InstWriteOffset, NI, IOL, AA, &F);
+ OverwriteResult OR =
+ isOverwrite(SI, NI, SILoc, NILoc, State.DL, TLI, DepWriteOffset,
+ InstWriteOffset, State.BatchAA, &F);
+ if (OR == OW_MaybePartial) {
+ auto Iter = State.IOLs.insert(
+ std::make_pair<BasicBlock *, InstOverlapIntervalsTy>(
+ NI->getParent(), InstOverlapIntervalsTy()));
+ auto &IOL = Iter.first->second;
+ OR = isPartialOverwrite(SILoc, NILoc, DepWriteOffset, InstWriteOffset,
+ NI, IOL);
+ }
if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
auto *Earlier = dyn_cast<StoreInst>(NI);
auto *Later = dyn_cast<StoreInst>(SI);
- if (Constant *Merged = tryToMergePartialOverlappingStores(
- Earlier, Later, InstWriteOffset, DepWriteOffset, DL, &AA,
- &DT)) {
-
- // Update stored value of earlier store to merged constant.
- Earlier->setOperand(0, Merged);
- ++NumModifiedStores;
- MadeChange = true;
-
- // Remove later store and remove any outstanding overlap intervals
- // for the updated store.
- State.deleteDeadInstruction(Later);
- auto I = State.IOLs.find(Earlier->getParent());
- if (I != State.IOLs.end())
- I->second.erase(Earlier);
- break;
+          // We are re-using tryToMergePartialOverlappingStores, which requires
+          // Earlier to dominate Later.
+          // TODO: implement tryToMergePartialOverlappingStores using MemorySSA.
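+          // For illustration (hypothetical IR; assuming both stores write
+          // constant integers, which tryToMergePartialOverlappingStores
+          // requires):
+          //   store i32 0, i32* %p        ; Earlier, dominates Later
+          //   store i8 1, i8* %p8         ; Later, %p8 points into %p
+          // The i8 constant is folded into the i32 constant stored by
+          // Earlier, and Later is then deleted below.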
+ if (Earlier && Later && DT.dominates(Earlier, Later)) {
+ if (Constant *Merged = tryToMergePartialOverlappingStores(
+ Earlier, Later, InstWriteOffset, DepWriteOffset, State.DL,
+ State.BatchAA, &DT)) {
+
+ // Update stored value of earlier store to merged constant.
+ Earlier->setOperand(0, Merged);
+ ++NumModifiedStores;
+ MadeChange = true;
+
+ Shortend = true;
+ // Remove later store and remove any outstanding overlap intervals
+ // for the updated store.
+ State.deleteDeadInstruction(Later);
+ auto I = State.IOLs.find(Earlier->getParent());
+ if (I != State.IOLs.end())
+ I->second.erase(Earlier);
+ break;
+ }
}
}
@@ -2274,11 +2618,21 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
}
}
}
+
+ // Check if the store is a no-op.
+ if (!Shortend && isRemovable(SI) &&
+ State.storeIsNoop(KillingDef, SILoc, SILocUnd)) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *SI << '\n');
+ State.deleteDeadInstruction(SI);
+ NumRedundantStores++;
+ MadeChange = true;
+ continue;
+ }
}
if (EnablePartialOverwriteTracking)
for (auto &KV : State.IOLs)
- MadeChange |= removePartiallyOverlappedStores(&AA, DL, KV.second);
+ MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI);
MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
return MadeChange;
diff --git a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
index d44a5979a8b2..3c6c444d6649 100644
--- a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -151,8 +151,8 @@ static DivRemWorklistTy getWorklist(Function &F) {
// rare than division.
for (auto &RemPair : RemMap) {
// Find the matching division instruction from the division map.
- Instruction *DivInst = DivMap[RemPair.first];
- if (!DivInst)
+ auto It = DivMap.find(RemPair.first);
+ if (It == DivMap.end())
continue;
// We have a matching pair of div/rem instructions.
@@ -160,7 +160,7 @@ static DivRemWorklistTy getWorklist(Function &F) {
Instruction *RemInst = RemPair.second;
// Place it in the worklist.
- Worklist.emplace_back(DivInst, RemInst);
+ Worklist.emplace_back(It->second, RemInst);
}
return Worklist;
@@ -315,14 +315,14 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
// %rem = sub %x, %mul // %rem = undef - undef = undef
// If X is not frozen, %rem becomes undef after transformation.
// TODO: We need an undef-specific checking function in ValueTracking
- if (!isGuaranteedNotToBeUndefOrPoison(X, DivInst, &DT)) {
+ if (!isGuaranteedNotToBeUndefOrPoison(X, nullptr, DivInst, &DT)) {
auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst);
DivInst->setOperand(0, FrX);
Sub->setOperand(0, FrX);
}
// Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0,
// but %rem in tgt can be one of many integer values.
- if (!isGuaranteedNotToBeUndefOrPoison(Y, DivInst, &DT)) {
+ if (!isGuaranteedNotToBeUndefOrPoison(Y, nullptr, DivInst, &DT)) {
auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst);
DivInst->setOperand(1, FrY);
Mul->setOperand(1, FrY);
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index ddfc8555b0a0..180a82917fa9 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -154,33 +154,13 @@ static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
std::swap(A, B);
}
- // Match canonical forms of abs/nabs/min/max. We are not using ValueTracking's
+ // Match canonical forms of min/max. We are not using ValueTracking's
// more powerful matchSelectPattern() because it may rely on instruction flags
// such as "nsw". That would be incompatible with the current hashing
// mechanism that may remove flags to increase the likelihood of CSE.
- // These are the canonical forms of abs(X) and nabs(X) created by instcombine:
- // %N = sub i32 0, %X
- // %C = icmp slt i32 %X, 0
- // %ABS = select i1 %C, i32 %N, i32 %X
- //
- // %N = sub i32 0, %X
- // %C = icmp slt i32 %X, 0
- // %NABS = select i1 %C, i32 %X, i32 %N
Flavor = SPF_UNKNOWN;
CmpInst::Predicate Pred;
- if (match(Cond, m_ICmp(Pred, m_Specific(B), m_ZeroInt())) &&
- Pred == ICmpInst::ICMP_SLT && match(A, m_Neg(m_Specific(B)))) {
- // ABS: B < 0 ? -B : B
- Flavor = SPF_ABS;
- return true;
- }
- if (match(Cond, m_ICmp(Pred, m_Specific(A), m_ZeroInt())) &&
- Pred == ICmpInst::ICMP_SLT && match(B, m_Neg(m_Specific(A)))) {
- // NABS: A < 0 ? A : -A
- Flavor = SPF_NABS;
- return true;
- }
if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
// Check for commuted variants of min/max by swapping predicate.
@@ -196,6 +176,11 @@ static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break;
case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break;
case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break;
+ // Non-strict inequalities.
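+    // For illustration: a select on a non-strict compare, e.g.
+    // select (icmp ule %a, %b), %a, %b, is still a umin of %a and %b.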
+ case CmpInst::ICMP_ULE: Flavor = SPF_UMIN; break;
+ case CmpInst::ICMP_UGE: Flavor = SPF_UMAX; break;
+ case CmpInst::ICMP_SLE: Flavor = SPF_SMIN; break;
+ case CmpInst::ICMP_SGE: Flavor = SPF_SMAX; break;
default: break;
}
@@ -234,7 +219,7 @@ static unsigned getHashValueImpl(SimpleValue Val) {
SelectPatternFlavor SPF;
Value *Cond, *A, *B;
if (matchSelectWithOptionalNotCond(Inst, Cond, A, B, SPF)) {
- // Hash min/max/abs (cmp + select) to allow for commuted operands.
+ // Hash min/max (cmp + select) to allow for commuted operands.
// Min/max may also have non-canonical compare predicate (eg, the compare for
// smin may use 'sgt' rather than 'slt'), and non-canonical operands in the
// compare.
@@ -245,10 +230,6 @@ static unsigned getHashValueImpl(SimpleValue Val) {
std::swap(A, B);
return hash_combine(Inst->getOpcode(), SPF, A, B);
}
- if (SPF == SPF_ABS || SPF == SPF_NABS) {
- // ABS/NABS always puts the input in A and its negation in B.
- return hash_combine(Inst->getOpcode(), SPF, A, B);
- }
// Hash general selects to allow matching commuted true/false operands.
@@ -288,6 +269,17 @@ static unsigned getHashValueImpl(SimpleValue Val) {
isa<FreezeInst>(Inst)) &&
"Invalid/unknown instruction");
+ // Handle intrinsics with commutative operands.
+ // TODO: Extend this to handle intrinsics with >2 operands where the 1st
+ // 2 operands are commutative.
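+  // For illustration: a commutative intrinsic such as llvm.umin(%x, %y) and
+  // llvm.umin(%y, %x) should hash identically so that isEqualImpl below can
+  // treat them as equal.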
+ auto *II = dyn_cast<IntrinsicInst>(Inst);
+ if (II && II->isCommutative() && II->getNumArgOperands() == 2) {
+ Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+ if (LHS > RHS)
+ std::swap(LHS, RHS);
+ return hash_combine(II->getOpcode(), LHS, RHS);
+ }
+
// Mix in the opcode.
return hash_combine(
Inst->getOpcode(),
@@ -340,7 +332,16 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
}
- // Min/max/abs can occur with commuted operands, non-canonical predicates,
+ // TODO: Extend this for >2 args by matching the trailing N-2 args.
+ auto *LII = dyn_cast<IntrinsicInst>(LHSI);
+ auto *RII = dyn_cast<IntrinsicInst>(RHSI);
+ if (LII && RII && LII->getIntrinsicID() == RII->getIntrinsicID() &&
+ LII->isCommutative() && LII->getNumArgOperands() == 2) {
+ return LII->getArgOperand(0) == RII->getArgOperand(1) &&
+ LII->getArgOperand(1) == RII->getArgOperand(0);
+ }
+
+ // Min/max can occur with commuted operands, non-canonical predicates,
// and/or non-canonical operands.
// Selects can be non-trivially equivalent via inverted conditions and swaps.
SelectPatternFlavor LSPF, RSPF;
@@ -354,11 +355,6 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
return ((LHSA == RHSA && LHSB == RHSB) ||
(LHSA == RHSB && LHSB == RHSA));
- if (LSPF == SPF_ABS || LSPF == SPF_NABS) {
- // Abs results are placed in a defined order by matchSelectPattern.
- return LHSA == RHSA && LHSB == RHSB;
- }
-
// select Cond, A, B <--> select not(Cond), B, A
if (CondL == CondR && LHSA == RHSA && LHSB == RHSB)
return true;
@@ -376,7 +372,7 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
// This intentionally does NOT handle patterns with a double-negation in
// the sense of not + not, because doing so could result in values
// comparing
- // as equal that hash differently in the min/max/abs cases like:
+ // as equal that hash differently in the min/max cases like:
// select (cmp slt, X, Y), X, Y <--> select (not (not (cmp slt, X, Y))), X, Y
// ^ hashes as min ^ would not hash as min
// In the context of the EarlyCSE pass, however, such cases never reach
@@ -631,11 +627,11 @@ private:
StackNode &operator=(const StackNode &) = delete;
// Accessors.
- unsigned currentGeneration() { return CurrentGeneration; }
- unsigned childGeneration() { return ChildGeneration; }
+ unsigned currentGeneration() const { return CurrentGeneration; }
+ unsigned childGeneration() const { return ChildGeneration; }
void childGeneration(unsigned generation) { ChildGeneration = generation; }
DomTreeNode *node() { return Node; }
- DomTreeNode::const_iterator childIter() { return ChildIter; }
+ DomTreeNode::const_iterator childIter() const { return ChildIter; }
DomTreeNode *nextChild() {
DomTreeNode *child = *ChildIter;
@@ -643,8 +639,8 @@ private:
return child;
}
- DomTreeNode::const_iterator end() { return EndIter; }
- bool isProcessed() { return Processed; }
+ DomTreeNode::const_iterator end() const { return EndIter; }
+ bool isProcessed() const { return Processed; }
void process() { Processed = true; }
private:
@@ -663,29 +659,60 @@ private:
public:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
: Inst(Inst) {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ IntrID = II->getIntrinsicID();
if (TTI.getTgtMemIntrinsic(II, Info))
- IsTargetMemInst = true;
+ return;
+ if (isHandledNonTargetIntrinsic(IntrID)) {
+ switch (IntrID) {
+ case Intrinsic::masked_load:
+ Info.PtrVal = Inst->getOperand(0);
+ Info.MatchingId = Intrinsic::masked_load;
+ Info.ReadMem = true;
+ Info.WriteMem = false;
+ Info.IsVolatile = false;
+ break;
+ case Intrinsic::masked_store:
+ Info.PtrVal = Inst->getOperand(1);
+ // Use the ID of masked load as the "matching id". This will
+ // prevent matching non-masked loads/stores with masked ones
+ // (which could be done), but at the moment, the code here
+ // does not support matching intrinsics with non-intrinsics,
+ // so keep the MatchingIds specific to masked instructions
+ // for now (TODO).
+ Info.MatchingId = Intrinsic::masked_load;
+ Info.ReadMem = false;
+ Info.WriteMem = true;
+ Info.IsVolatile = false;
+ break;
+ }
+ }
+ }
}
+ Instruction *get() { return Inst; }
+ const Instruction *get() const { return Inst; }
+
bool isLoad() const {
- if (IsTargetMemInst) return Info.ReadMem;
+ if (IntrID != 0)
+ return Info.ReadMem;
return isa<LoadInst>(Inst);
}
bool isStore() const {
- if (IsTargetMemInst) return Info.WriteMem;
+ if (IntrID != 0)
+ return Info.WriteMem;
return isa<StoreInst>(Inst);
}
bool isAtomic() const {
- if (IsTargetMemInst)
+ if (IntrID != 0)
return Info.Ordering != AtomicOrdering::NotAtomic;
return Inst->isAtomic();
}
bool isUnordered() const {
- if (IsTargetMemInst)
+ if (IntrID != 0)
return Info.isUnordered();
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
@@ -698,7 +725,7 @@ private:
}
bool isVolatile() const {
- if (IsTargetMemInst)
+ if (IntrID != 0)
return Info.IsVolatile;
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
@@ -716,11 +743,6 @@ private:
return false;
}
- bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
- return (getPointerOperand() == Inst.getPointerOperand() &&
- getMatchingId() == Inst.getMatchingId());
- }
-
bool isValid() const { return getPointerOperand() != nullptr; }
// For regular (non-intrinsic) loads/stores, this is set to -1. For
@@ -728,44 +750,83 @@ private:
// field in the MemIntrinsicInfo structure. That field contains
// non-negative values only.
int getMatchingId() const {
- if (IsTargetMemInst) return Info.MatchingId;
+ if (IntrID != 0)
+ return Info.MatchingId;
return -1;
}
Value *getPointerOperand() const {
- if (IsTargetMemInst) return Info.PtrVal;
+ if (IntrID != 0)
+ return Info.PtrVal;
return getLoadStorePointerOperand(Inst);
}
bool mayReadFromMemory() const {
- if (IsTargetMemInst) return Info.ReadMem;
+ if (IntrID != 0)
+ return Info.ReadMem;
return Inst->mayReadFromMemory();
}
bool mayWriteToMemory() const {
- if (IsTargetMemInst) return Info.WriteMem;
+ if (IntrID != 0)
+ return Info.WriteMem;
return Inst->mayWriteToMemory();
}
private:
- bool IsTargetMemInst = false;
+ Intrinsic::ID IntrID = 0;
MemIntrinsicInfo Info;
Instruction *Inst;
};
+ // This function is to prevent accidentally passing a non-target
+ // intrinsic ID to TargetTransformInfo.
+ static bool isHandledNonTargetIntrinsic(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_store:
+ return true;
+ }
+ return false;
+ }
+ static bool isHandledNonTargetIntrinsic(const Value *V) {
+ if (auto *II = dyn_cast<IntrinsicInst>(V))
+ return isHandledNonTargetIntrinsic(II->getIntrinsicID());
+ return false;
+ }
+
bool processNode(DomTreeNode *Node);
bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
const BasicBlock *BB, const BasicBlock *Pred);
+ Value *getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
+ unsigned CurrentGeneration);
+
+ bool overridingStores(const ParseMemoryInst &Earlier,
+ const ParseMemoryInst &Later);
+
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI;
if (auto *SI = dyn_cast<StoreInst>(Inst))
return SI->getValueOperand();
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
- return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
- ExpectedType);
+ auto *II = cast<IntrinsicInst>(Inst);
+ if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
+ return getOrCreateResultNonTargetMemIntrinsic(II, ExpectedType);
+ return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
+ }
+
+ Value *getOrCreateResultNonTargetMemIntrinsic(IntrinsicInst *II,
+ Type *ExpectedType) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_load:
+ return II;
+ case Intrinsic::masked_store:
+ return II->getOperand(0);
+ }
+ return nullptr;
}
/// Return true if the instruction is known to only operate on memory
@@ -775,6 +836,101 @@ private:
bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
Instruction *EarlierInst, Instruction *LaterInst);
+ bool isNonTargetIntrinsicMatch(const IntrinsicInst *Earlier,
+ const IntrinsicInst *Later) {
+ auto IsSubmask = [](const Value *Mask0, const Value *Mask1) {
+ // Is Mask0 a submask of Mask1?
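+      // For illustration: <i1 1, i1 0, i1 1, i1 0> is a submask of
+      // <i1 1, i1 1, i1 1, i1 0>, but not the other way around.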
+ if (Mask0 == Mask1)
+ return true;
+ if (isa<UndefValue>(Mask0) || isa<UndefValue>(Mask1))
+ return false;
+ auto *Vec0 = dyn_cast<ConstantVector>(Mask0);
+ auto *Vec1 = dyn_cast<ConstantVector>(Mask1);
+ if (!Vec0 || !Vec1)
+ return false;
+ assert(Vec0->getType() == Vec1->getType() &&
+ "Masks should have the same type");
+ for (int i = 0, e = Vec0->getNumOperands(); i != e; ++i) {
+ Constant *Elem0 = Vec0->getOperand(i);
+ Constant *Elem1 = Vec1->getOperand(i);
+ auto *Int0 = dyn_cast<ConstantInt>(Elem0);
+ if (Int0 && Int0->isZero())
+ continue;
+ auto *Int1 = dyn_cast<ConstantInt>(Elem1);
+ if (Int1 && !Int1->isZero())
+ continue;
+ if (isa<UndefValue>(Elem0) || isa<UndefValue>(Elem1))
+ return false;
+ if (Elem0 == Elem1)
+ continue;
+ return false;
+ }
+ return true;
+ };
+ auto PtrOp = [](const IntrinsicInst *II) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return II->getOperand(0);
+ if (II->getIntrinsicID() == Intrinsic::masked_store)
+ return II->getOperand(1);
+ llvm_unreachable("Unexpected IntrinsicInst");
+ };
+ auto MaskOp = [](const IntrinsicInst *II) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return II->getOperand(2);
+ if (II->getIntrinsicID() == Intrinsic::masked_store)
+ return II->getOperand(3);
+ llvm_unreachable("Unexpected IntrinsicInst");
+ };
+ auto ThruOp = [](const IntrinsicInst *II) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return II->getOperand(3);
+ llvm_unreachable("Unexpected IntrinsicInst");
+ };
+
+ if (PtrOp(Earlier) != PtrOp(Later))
+ return false;
+
+ Intrinsic::ID IDE = Earlier->getIntrinsicID();
+ Intrinsic::ID IDL = Later->getIntrinsicID();
+ // We could really use specific intrinsic classes for masked loads
+ // and stores in IntrinsicInst.h.
+ if (IDE == Intrinsic::masked_load && IDL == Intrinsic::masked_load) {
+ // Trying to replace later masked load with the earlier one.
+ // Check that the pointers are the same, and
+ // - masks and pass-throughs are the same, or
+ // - replacee's pass-through is "undef" and replacer's mask is a
+ // super-set of the replacee's mask.
+ if (MaskOp(Earlier) == MaskOp(Later) && ThruOp(Earlier) == ThruOp(Later))
+ return true;
+ if (!isa<UndefValue>(ThruOp(Later)))
+ return false;
+ return IsSubmask(MaskOp(Later), MaskOp(Earlier));
+ }
+ if (IDE == Intrinsic::masked_store && IDL == Intrinsic::masked_load) {
+ // Trying to replace a load of a stored value with the store's value.
+ // Check that the pointers are the same, and
+ // - load's mask is a subset of store's mask, and
+ // - load's pass-through is "undef".
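+    // For illustration (hypothetical IR):
+    //   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v,
+    //       <4 x i32>* %p, i32 4, <4 x i1> %wide)
+    //   %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+    //       <4 x i32>* %p, i32 4, <4 x i1> %narrow, <4 x i32> undef)
+    // %l can be replaced by %v when %narrow is a submask of %wide.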
+ if (!IsSubmask(MaskOp(Later), MaskOp(Earlier)))
+ return false;
+ return isa<UndefValue>(ThruOp(Later));
+ }
+ if (IDE == Intrinsic::masked_load && IDL == Intrinsic::masked_store) {
+ // Trying to remove a store of the loaded value.
+ // Check that the pointers are the same, and
+ // - store's mask is a subset of the load's mask.
+ return IsSubmask(MaskOp(Later), MaskOp(Earlier));
+ }
+ if (IDE == Intrinsic::masked_store && IDL == Intrinsic::masked_store) {
+ // Trying to remove a dead store (earlier).
+ // Check that the pointers are the same,
+ // - the to-be-removed store's mask is a subset of the other store's
+ // mask.
+ return IsSubmask(MaskOp(Earlier), MaskOp(Later));
+ }
+ return false;
+ }
+
void removeMSSA(Instruction &Inst) {
if (!MSSA)
return;
@@ -877,9 +1033,14 @@ bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
auto *TorF = (BI->getSuccessor(0) == BB)
? ConstantInt::getTrue(BB->getContext())
: ConstantInt::getFalse(BB->getContext());
- auto MatchBinOp = [](Instruction *I, unsigned Opcode) {
- if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(I))
- return BOp->getOpcode() == Opcode;
+ auto MatchBinOp = [](Instruction *I, unsigned Opcode, Value *&LHS,
+ Value *&RHS) {
+ if (Opcode == Instruction::And &&
+ match(I, m_LogicalAnd(m_Value(LHS), m_Value(RHS))))
+ return true;
+ else if (Opcode == Instruction::Or &&
+ match(I, m_LogicalOr(m_Value(LHS), m_Value(RHS))))
+ return true;
return false;
};
// If the condition is AND operation, we can propagate its operands into the
@@ -910,8 +1071,9 @@ bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
}
}
- if (MatchBinOp(Curr, PropagateOpcode))
- for (auto &Op : cast<BinaryOperator>(Curr)->operands())
+ Value *LHS, *RHS;
+ if (MatchBinOp(Curr, PropagateOpcode, LHS, RHS))
+ for (auto &Op : { LHS, RHS })
if (Instruction *OPI = dyn_cast<Instruction>(Op))
if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
WorkList.push_back(OPI);
@@ -920,6 +1082,86 @@ bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
return MadeChanges;
}
+Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
+ unsigned CurrentGeneration) {
+ if (InVal.DefInst == nullptr)
+ return nullptr;
+ if (InVal.MatchingId != MemInst.getMatchingId())
+ return nullptr;
+ // We don't yet handle removing loads with ordering of any kind.
+ if (MemInst.isVolatile() || !MemInst.isUnordered())
+ return nullptr;
+ // We can't replace an atomic load with one which isn't also atomic.
+ if (MemInst.isLoad() && !InVal.IsAtomic && MemInst.isAtomic())
+ return nullptr;
+  // The value V returned from this function is used differently depending
+  // on whether MemInst is a load or a store. If it's a load, we will
+  // replace MemInst with V; if it's a store, we will check whether V is the
+  // same as the available value.
+ bool MemInstMatching = !MemInst.isLoad();
+ Instruction *Matching = MemInstMatching ? MemInst.get() : InVal.DefInst;
+ Instruction *Other = MemInstMatching ? InVal.DefInst : MemInst.get();
+
+ // For stores check the result values before checking memory generation
+ // (otherwise isSameMemGeneration may crash).
+ Value *Result = MemInst.isStore()
+ ? getOrCreateResult(Matching, Other->getType())
+ : nullptr;
+ if (MemInst.isStore() && InVal.DefInst != Result)
+ return nullptr;
+
+ // Deal with non-target memory intrinsics.
+ bool MatchingNTI = isHandledNonTargetIntrinsic(Matching);
+ bool OtherNTI = isHandledNonTargetIntrinsic(Other);
+ if (OtherNTI != MatchingNTI)
+ return nullptr;
+ if (OtherNTI && MatchingNTI) {
+ if (!isNonTargetIntrinsicMatch(cast<IntrinsicInst>(InVal.DefInst),
+ cast<IntrinsicInst>(MemInst.get())))
+ return nullptr;
+ }
+
+ if (!isOperatingOnInvariantMemAt(MemInst.get(), InVal.Generation) &&
+ !isSameMemGeneration(InVal.Generation, CurrentGeneration, InVal.DefInst,
+ MemInst.get()))
+ return nullptr;
+
+ if (!Result)
+ Result = getOrCreateResult(Matching, Other->getType());
+ return Result;
+}
+
+bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
+ const ParseMemoryInst &Later) {
+ // Can we remove Earlier store because of Later store?
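+  // For illustration: with no intervening load of %p, the first (Earlier)
+  // of these two unordered stores is dead and can be removed:
+  //   store i32 1, i32* %p
+  //   store i32 2, i32* %p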
+
+ assert(Earlier.isUnordered() && !Earlier.isVolatile() &&
+ "Violated invariant");
+ if (Earlier.getPointerOperand() != Later.getPointerOperand())
+ return false;
+ if (Earlier.getMatchingId() != Later.getMatchingId())
+ return false;
+ // At the moment, we don't remove ordered stores, but do remove
+ // unordered atomic stores. There's no special requirement (for
+ // unordered atomics) about removing atomic stores only in favor of
+ // other atomic stores since we were going to execute the non-atomic
+ // one anyway and the atomic one might never have become visible.
+ if (!Earlier.isUnordered() || !Later.isUnordered())
+ return false;
+
+ // Deal with non-target memory intrinsics.
+ bool ENTI = isHandledNonTargetIntrinsic(Earlier.get());
+ bool LNTI = isHandledNonTargetIntrinsic(Later.get());
+ if (ENTI && LNTI)
+ return isNonTargetIntrinsicMatch(cast<IntrinsicInst>(Earlier.get()),
+ cast<IntrinsicInst>(Later.get()));
+
+ // Because of the check above, at least one of them is false.
+ // For now disallow matching intrinsics with non-intrinsics,
+ // so assume that the stores match if neither is an intrinsic.
+ return ENTI == LNTI;
+}
+
bool EarlyCSE::processNode(DomTreeNode *Node) {
bool Changed = false;
BasicBlock *BB = Node->getBlock();
@@ -990,6 +1232,14 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
+ // Likewise, noalias intrinsics don't actually write.
+ if (match(&Inst,
+ m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping noalias intrinsic: " << Inst
+ << '\n');
+ continue;
+ }
+
// Skip sideeffect intrinsics, for the same reason as assume intrinsics.
if (match(&Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << Inst << '\n');
@@ -1136,32 +1386,21 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// we can assume the current load loads the same value as the dominating
// load.
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
- if (InVal.DefInst != nullptr &&
- InVal.MatchingId == MemInst.getMatchingId() &&
- // We don't yet handle removing loads with ordering of any kind.
- !MemInst.isVolatile() && MemInst.isUnordered() &&
- // We can't replace an atomic load with one which isn't also atomic.
- InVal.IsAtomic >= MemInst.isAtomic() &&
- (isOperatingOnInvariantMemAt(&Inst, InVal.Generation) ||
- isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, &Inst))) {
- Value *Op = getOrCreateResult(InVal.DefInst, Inst.getType());
- if (Op != nullptr) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << Inst
- << " to: " << *InVal.DefInst << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
- if (!Inst.use_empty())
- Inst.replaceAllUsesWith(Op);
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumCSELoad;
+ if (Value *Op = getMatchingValue(InVal, MemInst, CurrentGeneration)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << Inst
+ << " to: " << *InVal.DefInst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(Op);
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumCSELoad;
+ continue;
}
// Otherwise, remember that we have this instruction.
@@ -1231,13 +1470,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (MemInst.isValid() && MemInst.isStore()) {
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (InVal.DefInst &&
- InVal.DefInst == getOrCreateResult(&Inst, InVal.DefInst->getType()) &&
- InVal.MatchingId == MemInst.getMatchingId() &&
- // We don't yet handle removing stores with ordering of any kind.
- !MemInst.isVolatile() && MemInst.isUnordered() &&
- (isOperatingOnInvariantMemAt(&Inst, InVal.Generation) ||
- isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, &Inst))) {
+ InVal.DefInst == getMatchingValue(InVal, MemInst, CurrentGeneration)) {
// It is okay to have a LastStore to a different pointer here if MemorySSA
// tells us that the load and store are from the same memory generation.
// In that case, LastStore should keep its present value since we're
@@ -1272,17 +1505,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (MemInst.isValid() && MemInst.isStore()) {
// We do a trivial form of DSE if there are two stores to the same
// location with no intervening loads. Delete the earlier store.
- // At the moment, we don't remove ordered stores, but do remove
- // unordered atomic stores. There's no special requirement (for
- // unordered atomics) about removing atomic stores only in favor of
- // other atomic stores since we were going to execute the non-atomic
- // one anyway and the atomic one might never have become visible.
if (LastStore) {
- ParseMemoryInst LastStoreMemInst(LastStore, TTI);
- assert(LastStoreMemInst.isUnordered() &&
- !LastStoreMemInst.isVolatile() &&
- "Violated invariant");
- if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+ if (overridingStores(ParseMemoryInst(LastStore, TTI), MemInst)) {
LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << Inst << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
@@ -1443,6 +1667,7 @@ public:
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
if (UseMemorySSA) {
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
}
@@ -1484,6 +1709,7 @@ INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
"Early CSE w/ MemorySSA", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 72512430b366..e54a270fb276 100644
--- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -12,6 +12,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 83f4c402ed4d..b6d82685e884 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index b16f8591b5a4..c6b6d75aefe8 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -26,8 +26,8 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/DomTreeUpdater.h"
@@ -36,6 +36,8 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -46,7 +48,6 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
@@ -98,21 +99,33 @@ STATISTIC(NumGVNSimpl, "Number of instructions simplified");
STATISTIC(NumGVNEqProp, "Number of equalities propagated");
STATISTIC(NumPRELoad, "Number of loads PRE'd");
+STATISTIC(IsValueFullyAvailableInBlockNumSpeculationsMax,
+ "Number of blocks speculated as available in "
+ "IsValueFullyAvailableInBlock(), max");
+STATISTIC(MaxBBSpeculationCutoffReachedTimes,
+ "Number of times we we reached gvn-max-block-speculations cut-off "
+ "preventing further exploration");
+
static cl::opt<bool> GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden);
static cl::opt<bool> GVNEnableLoadPRE("enable-load-pre", cl::init(true));
static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
cl::init(true));
+static cl::opt<bool>
+GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre",
+ cl::init(true));
static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
-// Maximum allowed recursion depth.
-static cl::opt<uint32_t>
-MaxRecurseDepth("gvn-max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
- cl::desc("Max recurse depth in GVN (default = 1000)"));
-
static cl::opt<uint32_t> MaxNumDeps(
"gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore,
cl::desc("Max number of dependences to attempt Load PRE (default = 100)"));
+// This is based on IsValueFullyAvailableInBlockNumSpeculationsMax stat.
+static cl::opt<uint32_t> MaxBBSpeculations(
+ "gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::ZeroOrMore,
+ cl::desc("Max number of blocks we're willing to speculate on (and recurse "
+ "into) when deducing if a value is fully available or not in GVN "
+ "(default = 600)"));
+
struct llvm::GVN::Expression {
uint32_t opcode;
bool commutative = false;
@@ -282,9 +295,9 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
if (I->isCommutative()) {
// Ensure that commutative instructions that only differ by a permutation
// of their operands get the same value number by sorting the operand value
- // numbers. Since all commutative instructions have two operands it is more
+    // numbers. Since the commutative operands are the first two operands, it is more
// efficient to sort by hand rather than using, say, std::sort.
- assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ assert(I->getNumOperands() >= 2 && "Unsupported commutative instruction!");
if (e.varargs[0] > e.varargs[1])
std::swap(e.varargs[0], e.varargs[1]);
e.commutative = true;
@@ -353,9 +366,7 @@ GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
OI != OE; ++OI)
e.varargs.push_back(lookupOrAdd(*OI));
- for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end();
- II != IE; ++II)
- e.varargs.push_back(*II);
+ append_range(e.varargs, EI->indices());
return e;
}
@@ -399,9 +410,12 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
}
if (local_dep.isDef()) {
- CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+    // For masked load/store intrinsics, the local_dep may actually be
+ // a normal load or store instruction.
+ CallInst *local_cdep = dyn_cast<CallInst>(local_dep.getInst());
- if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ if (!local_cdep ||
+ local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
valueNumbering[C] = nextValueNumber;
return nextValueNumber++;
}
@@ -626,6 +640,11 @@ bool GVN::isLoadInLoopPREEnabled() const {
return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE);
}
+bool GVN::isLoadPRESplitBackedgeEnabled() const {
+ return Options.AllowLoadPRESplitBackedge.getValueOr(
+ GVNEnableSplitBackedgeInLoadPRE);
+}
+
bool GVN::isMemDepEnabled() const {
return Options.AllowMemDep.getValueOr(GVNEnableMemDep);
}
@@ -642,14 +661,18 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
auto *MemDep =
isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr;
auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ auto *MSSA = AM.getCachedResult<MemorySSAAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE);
+ bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE,
+ MSSA ? &MSSA->getMSSA() : nullptr);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
PA.preserve<TargetLibraryAnalysis>();
+ if (MSSA)
+ PA.preserve<MemorySSAAnalysis>();
if (LI)
PA.preserve<LoopAnalysis>();
return PA;
@@ -667,6 +690,18 @@ LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
}
#endif
+enum class AvailabilityState : char {
+ /// We know the block *is not* fully available. This is a fixpoint.
+ Unavailable = 0,
+ /// We know the block *is* fully available. This is a fixpoint.
+ Available = 1,
+ /// We do not know whether the block is fully available or not,
+ /// but we are currently speculating that it will be.
+ /// If it would have turned out that the block was, in fact, not fully
+ /// available, this would have been cleaned up into an Unavailable.
+ SpeculativelyAvailable = 2,
+};
+
/// Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
@@ -675,76 +710,118 @@ LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
/// 1) we know the block *is* fully available.
/// 2) we do not know whether the block is fully available or not, but we are
/// currently speculating that it will be.
-/// 3) we are speculating for this block and have used that to speculate for
-/// other blocks.
-static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
- DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
- uint32_t RecurseDepth) {
- if (RecurseDepth > MaxRecurseDepth)
- return false;
-
- // Optimistically assume that the block is fully available and check to see
- // if we already know about this block in one lookup.
- std::pair<DenseMap<BasicBlock*, char>::iterator, bool> IV =
- FullyAvailableBlocks.insert(std::make_pair(BB, 2));
-
- // If the entry already existed for this block, return the precomputed value.
- if (!IV.second) {
- // If this is a speculative "available" value, mark it as being used for
- // speculation of other blocks.
- if (IV.first->second == 2)
- IV.first->second = 3;
- return IV.first->second != 0;
- }
-
- // Otherwise, see if it is fully available in all predecessors.
- pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
-
- // If this block has no predecessors, it isn't live-in here.
- if (PI == PE)
- goto SpeculationFailure;
+static bool IsValueFullyAvailableInBlock(
+ BasicBlock *BB,
+ DenseMap<BasicBlock *, AvailabilityState> &FullyAvailableBlocks) {
+ SmallVector<BasicBlock *, 32> Worklist;
+ Optional<BasicBlock *> UnavailableBB;
+
+  // The number of times we didn't find an entry for a block in a map and
+  // optimistically inserted an entry marking the block as speculatively
+  // available.
+ unsigned NumNewNewSpeculativelyAvailableBBs = 0;
+
+#ifndef NDEBUG
+ SmallSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs;
+ SmallVector<BasicBlock *, 32> AvailableBBs;
+#endif
- for (; PI != PE; ++PI)
- // If the value isn't fully available in one of our predecessors, then it
- // isn't fully available in this block either. Undo our previous
- // optimistic assumption and bail out.
- if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
- goto SpeculationFailure;
+ Worklist.emplace_back(BB);
+ while (!Worklist.empty()) {
+ BasicBlock *CurrBB = Worklist.pop_back_val(); // LIFO - depth-first!
+ // Optimistically assume that the block is Speculatively Available and check
+ // to see if we already know about this block in one lookup.
+ std::pair<DenseMap<BasicBlock *, AvailabilityState>::iterator, bool> IV =
+ FullyAvailableBlocks.try_emplace(
+ CurrBB, AvailabilityState::SpeculativelyAvailable);
+ AvailabilityState &State = IV.first->second;
+
+ // Did the entry already exist for this block?
+ if (!IV.second) {
+ if (State == AvailabilityState::Unavailable) {
+ UnavailableBB = CurrBB;
+ break; // Backpropagate unavailability info.
+ }
- return true;
+#ifndef NDEBUG
+ AvailableBBs.emplace_back(CurrBB);
+#endif
+ continue; // Don't recurse further, but continue processing worklist.
+ }
-// If we get here, we found out that this is not, after
-// all, a fully-available block. We have a problem if we speculated on this and
-// used the speculation to mark other blocks as available.
-SpeculationFailure:
- char &BBVal = FullyAvailableBlocks[BB];
+ // No entry found for block.
+ ++NumNewNewSpeculativelyAvailableBBs;
+ bool OutOfBudget = NumNewNewSpeculativelyAvailableBBs > MaxBBSpeculations;
+
+ // If we have exhausted our budget, mark this block as unavailable.
+ // Also, if this block has no predecessors, the value isn't live-in here.
+ if (OutOfBudget || pred_empty(CurrBB)) {
+ MaxBBSpeculationCutoffReachedTimes += (int)OutOfBudget;
+ State = AvailabilityState::Unavailable;
+ UnavailableBB = CurrBB;
+ break; // Backpropagate unavailability info.
+ }
- // If we didn't speculate on this, just return with it set to false.
- if (BBVal == 2) {
- BBVal = 0;
- return false;
+ // Tentatively consider this block as speculatively available.
+#ifndef NDEBUG
+ NewSpeculativelyAvailableBBs.insert(CurrBB);
+#endif
+ // And further recurse into block's predecessors, in depth-first order!
+ Worklist.append(pred_begin(CurrBB), pred_end(CurrBB));
}
- // If we did speculate on this value, we could have blocks set to 1 that are
- // incorrect. Walk the (transitive) successors of this block and mark them as
- // 0 if set to one.
- SmallVector<BasicBlock*, 32> BBWorklist;
- BBWorklist.push_back(BB);
-
- do {
- BasicBlock *Entry = BBWorklist.pop_back_val();
- // Note that this sets blocks to 0 (unavailable) if they happen to not
- // already be in FullyAvailableBlocks. This is safe.
- char &EntryVal = FullyAvailableBlocks[Entry];
- if (EntryVal == 0) continue; // Already unavailable.
-
- // Mark as unavailable.
- EntryVal = 0;
+#if LLVM_ENABLE_STATS
+ IsValueFullyAvailableInBlockNumSpeculationsMax.updateMax(
+ NumNewNewSpeculativelyAvailableBBs);
+#endif
- BBWorklist.append(succ_begin(Entry), succ_end(Entry));
- } while (!BBWorklist.empty());
+  // If the block isn't marked as a fixpoint yet (the Unavailable and
+  // Available states are fixpoints), mark it with FixpointState and enqueue
+  // its successors for further processing.
+ auto MarkAsFixpointAndEnqueueSuccessors =
+ [&](BasicBlock *BB, AvailabilityState FixpointState) {
+ auto It = FullyAvailableBlocks.find(BB);
+ if (It == FullyAvailableBlocks.end())
+ return; // Never queried this block, leave as-is.
+ switch (AvailabilityState &State = It->second) {
+ case AvailabilityState::Unavailable:
+ case AvailabilityState::Available:
+ return; // Don't backpropagate further, continue processing worklist.
+ case AvailabilityState::SpeculativelyAvailable: // Fix it!
+ State = FixpointState;
+#ifndef NDEBUG
+ assert(NewSpeculativelyAvailableBBs.erase(BB) &&
+ "Found a speculatively available successor leftover?");
+#endif
+ // Queue successors for further processing.
+ Worklist.append(succ_begin(BB), succ_end(BB));
+ return;
+ }
+ };
+
+ if (UnavailableBB) {
+ // Okay, we have encountered an unavailable block.
+ // Mark speculatively available blocks reachable from UnavailableBB as
+    // unavailable as well. Paths are terminated when they reach blocks that
+    // are not in FullyAvailableBlocks or are not marked as speculatively
+    // available.
+ Worklist.clear();
+ Worklist.append(succ_begin(*UnavailableBB), succ_end(*UnavailableBB));
+ while (!Worklist.empty())
+ MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
+ AvailabilityState::Unavailable);
+ }
+
+#ifndef NDEBUG
+ Worklist.clear();
+ for (BasicBlock *AvailableBB : AvailableBBs)
+ Worklist.append(succ_begin(AvailableBB), succ_end(AvailableBB));
+ while (!Worklist.empty())
+ MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
+ AvailabilityState::Available);
+
+ assert(NewSpeculativelyAvailableBBs.empty() &&
+ "Must have fixed all the new speculatively available blocks.");
+#endif
- return false;
+ return !UnavailableBB;
}
/// Given a set of loads specified by ValuesPerBlock,
@@ -963,7 +1040,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
// Reject loads and stores that are to the same address but are of
- // different types if we have to. If the stored value is larger or equal to
+    // different types if we have to. If the stored value is convertible to
// the loaded value, we can reuse it.
if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), LI->getType(),
DL))
@@ -1062,7 +1139,6 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// backwards through predecessors if needed.
BasicBlock *LoadBB = LI->getParent();
BasicBlock *TmpBB = LoadBB;
- bool IsSafeToSpeculativelyExecute = isSafeToSpeculativelyExecute(LI);
// Check that there is no implicit control flow instructions above our load in
// its block. If there is an instruction that doesn't always pass the
@@ -1079,8 +1155,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// because if the index is out of bounds we should deoptimize rather than
// access the array.
// Check that there is no guard in this block above our instruction.
- if (!IsSafeToSpeculativelyExecute && ICF->isDominatedByICFIFromSameBlock(LI))
- return false;
+ bool MustEnsureSafetyOfSpeculativeExecution =
+ ICF->isDominatedByICFIFromSameBlock(LI);
+
while (TmpBB->getSinglePredecessor()) {
TmpBB = TmpBB->getSinglePredecessor();
if (TmpBB == LoadBB) // Infinite (unreachable) loop.
@@ -1097,8 +1174,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
// Check that there is no implicit control flow in a block above.
- if (!IsSafeToSpeculativelyExecute && ICF->hasICF(TmpBB))
- return false;
+ MustEnsureSafetyOfSpeculativeExecution =
+ MustEnsureSafetyOfSpeculativeExecution || ICF->hasICF(TmpBB);
}
assert(TmpBB);
@@ -1107,11 +1184,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Check to see how many predecessors have the loaded value fully
// available.
MapVector<BasicBlock *, Value *> PredLoads;
- DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+ DenseMap<BasicBlock *, AvailabilityState> FullyAvailableBlocks;
for (const AvailableValueInBlock &AV : ValuesPerBlock)
- FullyAvailableBlocks[AV.BB] = true;
+ FullyAvailableBlocks[AV.BB] = AvailabilityState::Available;
for (BasicBlock *UnavailableBB : UnavailableBlocks)
- FullyAvailableBlocks[UnavailableBB] = false;
+ FullyAvailableBlocks[UnavailableBB] = AvailabilityState::Unavailable;
SmallVector<BasicBlock *, 4> CriticalEdgePred;
for (BasicBlock *Pred : predecessors(LoadBB)) {
@@ -1124,7 +1201,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
}
- if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
+ if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) {
continue;
}
@@ -1151,6 +1228,16 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
}
+ // Do not split backedge as it will break the canonical loop form.
+ if (!isLoadPRESplitBackedgeEnabled())
+ if (DT->dominates(LoadBB, Pred)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF A BACKEDGE CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
CriticalEdgePred.push_back(Pred);
} else {
// Only add the predecessors that will not be split for now.
@@ -1170,6 +1257,17 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (NumUnavailablePreds != 1)
return false;
+  // Now we know where we will insert the load. We must ensure that it is
+  // safe to speculatively execute the load at those points.
+ if (MustEnsureSafetyOfSpeculativeExecution) {
+ if (CriticalEdgePred.size())
+ if (!isSafeToSpeculativelyExecute(LI, LoadBB->getFirstNonPHI(), DT))
+ return false;
+ for (auto &PL : PredLoads)
+ if (!isSafeToSpeculativelyExecute(LI, PL.first->getTerminator(), DT))
+ return false;
+ }
+
// Split critical edges, and update the unavailable predecessors accordingly.
for (BasicBlock *OrigPred : CriticalEdgePred) {
BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
@@ -1251,8 +1349,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Instructions that have been inserted in predecessor(s) to materialize
// the load address do not retain their original debug locations. Doing
// so could lead to confusing (but correct) source attributions.
- if (const DebugLoc &DL = I->getDebugLoc())
- I->setDebugLoc(DebugLoc::get(0, 0, DL.getScope(), DL.getInlinedAt()));
+ I->updateLocationAfterHoist();
// FIXME: We really _ought_ to insert these value numbers into their
// parent's availability map. However, in doing so, we risk getting into
@@ -1270,6 +1367,22 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
LI->getAlign(), LI->getOrdering(), LI->getSyncScopeID(),
UnavailablePred->getTerminator());
NewLoad->setDebugLoc(LI->getDebugLoc());
+ if (MSSAU) {
+ auto *MSSA = MSSAU->getMemorySSA();
+ // Get the defining access of the original load or use the load if it is a
+ // MemoryDef (e.g. because it is volatile). The inserted loads are
+ // guaranteed to load from the same definition.
+ auto *LIAcc = MSSA->getMemoryAccess(LI);
+ auto *DefiningAcc =
+ isa<MemoryDef>(LIAcc) ? LIAcc : LIAcc->getDefiningAccess();
+ auto *NewAccess = MSSAU->createMemoryAccessInBB(
+ NewLoad, DefiningAcc, NewLoad->getParent(),
+ MemorySSA::BeforeTerminator);
+ if (auto *NewDef = dyn_cast<MemoryDef>(NewAccess))
+ MSSAU->insertDef(NewDef, /*RenameUses=*/true);
+ else
+ MSSAU->insertUse(cast<MemoryUse>(NewAccess), /*RenameUses=*/true);
+ }
// Transfer the old load's AA tags to the new load.
AAMDNodes Tags;
@@ -1357,13 +1470,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
return false;
}
+ bool Changed = false;
// If this load follows a GEP, see if we can PRE the indices before analyzing.
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
OE = GEP->idx_end();
OI != OE; ++OI)
if (Instruction *I = dyn_cast<Instruction>(OI->get()))
- performScalarPRE(I);
+ Changed |= performScalarPRE(I);
}
// Step 2: Analyze the availability of the load
@@ -1374,7 +1488,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// If we have no predecessors that produce a known value for this load, exit
// early.
if (ValuesPerBlock.empty())
- return false;
+ return Changed;
// Step 3: Eliminate fully redundancy.
//
@@ -1406,12 +1520,12 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// Step 4: Eliminate partial redundancy.
if (!isPREEnabled() || !isLoadPREEnabled())
- return false;
+ return Changed;
if (!isLoadInLoopPREEnabled() && this->LI &&
this->LI->getLoopFor(LI->getParent()))
- return false;
+ return Changed;
- return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
+ return Changed || PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
}
static bool impliesEquivalanceIfTrue(CmpInst* Cmp) {
@@ -1486,9 +1600,40 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
// Insert a new store to null instruction before the load to indicate that
// this code is not reachable. FIXME: We could insert an unreachable
// instruction directly because we can modify the CFG.
- new StoreInst(UndefValue::get(Int8Ty),
- Constant::getNullValue(Int8Ty->getPointerTo()),
- IntrinsicI);
+ auto *NewS = new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ IntrinsicI);
+ if (MSSAU) {
+ const MemoryUseOrDef *FirstNonDom = nullptr;
+ const auto *AL =
+ MSSAU->getMemorySSA()->getBlockAccesses(IntrinsicI->getParent());
+
+ // If there are accesses in the current basic block, find the first one
+ // that does not come before NewS. The new memory access is inserted
+ // after the found access or before the terminator if no such access is
+ // found.
+ if (AL) {
+ for (auto &Acc : *AL) {
+ if (auto *Current = dyn_cast<MemoryUseOrDef>(&Acc))
+ if (!Current->getMemoryInst()->comesBefore(NewS)) {
+ FirstNonDom = Current;
+ break;
+ }
+ }
+ }
+
+      // This added store is to null, so it will never be executed and we can
+      // just use the LiveOnEntry def as the defining access.
+ auto *NewDef =
+ FirstNonDom ? MSSAU->createMemoryAccessBefore(
+ NewS, MSSAU->getMemorySSA()->getLiveOnEntryDef(),
+ const_cast<MemoryUseOrDef *>(FirstNonDom))
+ : MSSAU->createMemoryAccessInBB(
+ NewS, MSSAU->getMemorySSA()->getLiveOnEntryDef(),
+ NewS->getParent(), MemorySSA::BeforeTerminator);
+
+ MSSAU->insertDef(cast<MemoryDef>(NewDef), /*RenameUses=*/false);
+ }
}
if (isAssumeWithEmptyBundle(*IntrinsicI))
markInstructionForDeletion(IntrinsicI);
@@ -1516,6 +1661,11 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
// br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
ReplaceOperandsWithMap[V] = True;
+ // Similarly, after assume(!NotV) we know that NotV == false.
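+  // For illustration (hypothetical IR):
+  //   %not = xor i1 %c, true
+  //   call void @llvm.assume(i1 %not)
+  // lets uses of %c later in this block be replaced with false.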
+ Value *NotV;
+ if (match(V, m_Not(m_Value(NotV))))
+ ReplaceOperandsWithMap[NotV] = ConstantInt::getFalse(V->getContext());
+
// If we find an equality fact, canonicalize all dominated uses in this block
// to one of the two values. We heuristically choose the "oldest" of the
// two where age is determined by value number. (Note that propagateEquality
@@ -1622,6 +1772,8 @@ bool GVN::processLoad(LoadInst *L) {
// Replace the load!
patchAndReplaceAllUsesWith(L, AvailableValue);
markInstructionForDeletion(L);
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(L);
++NumGVNLoad;
reportLoadElim(L, AvailableValue, ORE);
// Tell MDA to reexamine the reused pointer since we might have more
@@ -1743,7 +1895,7 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
}
if (Exp.commutative) {
- assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!");
+ assert(Exp.varargs.size() >= 2 && "Unsupported commutative instruction!");
if (Exp.varargs[0] > Exp.varargs[1]) {
std::swap(Exp.varargs[0], Exp.varargs[1]);
uint32_t Opcode = Exp.opcode >> 8;
@@ -1766,11 +1918,8 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
/// again.
void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
const BasicBlock &CurrBlock) {
- for (const BasicBlock *Pred : predecessors(&CurrBlock)) {
- auto FindRes = PhiTranslateTable.find({Num, Pred});
- if (FindRes != PhiTranslateTable.end())
- PhiTranslateTable.erase(FindRes);
- }
+ for (const BasicBlock *Pred : predecessors(&CurrBlock))
+ PhiTranslateTable.erase({Num, Pred});
}
// In order to find a leader for a given value number at a
@@ -1934,8 +2083,8 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
// If "A && B" is known true then both A and B are known true. If "A || B"
// is known false then both A and B are known false.
Value *A, *B;
- if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) ||
- (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) {
+ if ((isKnownTrue && match(LHS, m_LogicalAnd(m_Value(A), m_Value(B)))) ||
+ (isKnownFalse && match(LHS, m_LogicalOr(m_Value(A), m_Value(B))))) {
Worklist.push_back(std::make_pair(A, RHS));
Worklist.push_back(std::make_pair(B, RHS));
continue;
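The hunk above switches the equality-propagation matchers from m_And/m_Or to m_LogicalAnd/m_LogicalOr. A minimal sketch of the difference, assuming only LLVM's PatternMatch header (the helper name is illustrative):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// m_LogicalAnd(A, B) matches both `and i1 A, B` and the select form
// `select i1 A, i1 B, i1 false`; m_And matches only the former.
// m_LogicalOr is the analogous widening for `or` and `select A, true, B`.
static bool matchLogicalAnd(Value *V, Value *&A, Value *&B) {
  return match(V, m_LogicalAnd(m_Value(A), m_Value(B)));
}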
@@ -2137,7 +2286,7 @@ bool GVN::processInstruction(Instruction *I) {
bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
const TargetLibraryInfo &RunTLI, AAResults &RunAA,
MemoryDependenceResults *RunMD, LoopInfo *LI,
- OptimizationRemarkEmitter *RunORE) {
+ OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) {
AC = &RunAC;
DT = &RunDT;
VN.setDomTree(DT);
@@ -2150,6 +2299,8 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
VN.setMemDep(MD);
ORE = RunORE;
InvalidBlockRPONumbers = true;
+ MemorySSAUpdater Updater(MSSA);
+ MSSAU = MSSA ? &Updater : nullptr;
bool Changed = false;
bool ShouldContinue = true;
@@ -2160,7 +2311,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
BasicBlock *BB = &*FI++;
- bool removedBlock = MergeBlockIntoPredecessor(BB, &DTU, LI, nullptr, MD);
+ bool removedBlock = MergeBlockIntoPredecessor(BB, &DTU, LI, MSSAU, MD);
if (removedBlock)
++NumGVNBlocks;
@@ -2196,6 +2347,9 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
// iteration.
DeadBlocks.clear();
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
return Changed;
}
@@ -2236,6 +2390,8 @@ bool GVN::processBlock(BasicBlock *BB) {
salvageKnowledge(I, AC);
salvageDebugInfo(*I);
if (MD) MD->removeInstruction(I);
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
LLVM_DEBUG(verifyRemoved(I));
ICF->removeInstruction(I);
I->eraseFromParent();
@@ -2323,10 +2479,14 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
if (isa<GetElementPtrInst>(CurInst))
return false;
- // We don't currently value number ANY inline asm calls.
- if (auto *CallB = dyn_cast<CallBase>(CurInst))
+ if (auto *CallB = dyn_cast<CallBase>(CurInst)) {
+ // We don't currently value number ANY inline asm calls.
if (CallB->isInlineAsm())
return false;
+ // Don't do PRE on convergent calls.
+ if (CallB->isConvergent())
+ return false;
+ }
uint32_t ValNo = VN.lookup(CurInst);
@@ -2466,6 +2626,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
if (MD)
MD->removeInstruction(CurInst);
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(CurInst);
LLVM_DEBUG(verifyRemoved(CurInst));
// FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
// some assertion failures.
@@ -2510,10 +2672,12 @@ BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
// possible.
BasicBlock *BB = SplitCriticalEdge(
Pred, Succ,
- CriticalEdgeSplittingOptions(DT, LI).unsetPreserveLoopSimplify());
- if (MD)
- MD->invalidateCachedPredecessors();
- InvalidBlockRPONumbers = true;
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).unsetPreserveLoopSimplify());
+ if (BB) {
+ if (MD)
+ MD->invalidateCachedPredecessors();
+ InvalidBlockRPONumbers = true;
+ }
return BB;
}
@@ -2522,14 +2686,20 @@ BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
bool GVN::splitCriticalEdges() {
if (toSplit.empty())
return false;
+
+ bool Changed = false;
do {
std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
- SplitCriticalEdge(Edge.first, Edge.second,
- CriticalEdgeSplittingOptions(DT, LI));
+ Changed |= SplitCriticalEdge(Edge.first, Edge.second,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU)) !=
+ nullptr;
} while (!toSplit.empty());
- if (MD) MD->invalidateCachedPredecessors();
- InvalidBlockRPONumbers = true;
- return true;
+ if (Changed) {
+ if (MD)
+ MD->invalidateCachedPredecessors();
+ InvalidBlockRPONumbers = true;
+ }
+ return Changed;
}
/// Executes one iteration of GVN
@@ -2633,13 +2803,12 @@ void GVN::addDeadBlock(BasicBlock *BB) {
// First, split the critical edges. This might also create additional blocks
// to preserve LoopSimplify form and adjust edges accordingly.
- SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+ SmallVector<BasicBlock *, 4> Preds(predecessors(B));
for (BasicBlock *P : Preds) {
if (!DeadBlocks.count(P))
continue;
- if (llvm::any_of(successors(P),
- [B](BasicBlock *Succ) { return Succ == B; }) &&
+ if (llvm::is_contained(successors(P), B) &&
isCriticalEdge(P->getTerminator(), B)) {
if (BasicBlock *S = splitCriticalEdges(P, B))
DeadBlocks.insert(P = S);
@@ -2724,6 +2893,7 @@ public:
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();
return Impl.runImpl(
F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
@@ -2733,7 +2903,8 @@ public:
? &getAnalysis<MemoryDependenceWrapperPass>().getMemDep()
: nullptr,
LIWP ? &LIWP->getLoopInfo() : nullptr,
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(),
+ MSSAWP ? &MSSAWP->getMSSA() : nullptr);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -2744,12 +2915,12 @@ public:
if (Impl.isMemDepEnabled())
AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
-
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<TargetLibraryInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
private:
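Taken together, the GVN.cpp hunks thread an optional MemorySSA through the pass: runImpl now takes a MemorySSA pointer, a local MemorySSAUpdater is created when that pointer is non-null, and every place that erases an instruction or splits an edge mirrors the change into MemorySSA only when the updater exists. A minimal sketch of that guarded-update idiom; the helper name is an assumption, the calls are the ones used in the hunks:

#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Erase an instruction while keeping both optional memory analyses
// consistent: the MemDep cache entry and the MemorySSA access are dropped
// before the IR instruction itself is removed.
static void eraseAndUpdate(Instruction *I, MemoryDependenceResults *MD,
                           MemorySSAUpdater *MSSAU) {
  if (MD)
    MD->removeInstruction(I);
  if (MSSAU)
    MSSAU->removeMemoryAccess(I);
  I->eraseFromParent();
}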
diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index 9c4cdf2feb56..8d0bd5674964 100644
--- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -149,8 +149,8 @@ struct CHIArg {
// The instruction (VN) which uses the values flowing out of CHI.
Instruction *I;
- bool operator==(const CHIArg &A) { return VN == A.VN; }
- bool operator!=(const CHIArg &A) { return !(*this == A); }
+ bool operator==(const CHIArg &A) const { return VN == A.VN; }
+ bool operator!=(const CHIArg &A) const { return !(*this == A); }
};
using CHIIt = SmallVectorImpl<CHIArg>::iterator;
@@ -242,11 +242,14 @@ public:
};
static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
- static const unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group, LLVMContext::MD_access_group};
+ static const unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_access_group};
combineMetadata(ReplInst, I, KnownIDs, true);
}
@@ -260,43 +263,7 @@ public:
: DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
- bool run(Function &F) {
- NumFuncArgs = F.arg_size();
- VN.setDomTree(DT);
- VN.setAliasAnalysis(AA);
- VN.setMemDep(MD);
- bool Res = false;
- // Perform DFS Numbering of instructions.
- unsigned BBI = 0;
- for (const BasicBlock *BB : depth_first(&F.getEntryBlock())) {
- DFSNumber[BB] = ++BBI;
- unsigned I = 0;
- for (auto &Inst : *BB)
- DFSNumber[&Inst] = ++I;
- }
-
- int ChainLength = 0;
-
- // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
- while (true) {
- if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength)
- return Res;
-
- auto HoistStat = hoistExpressions(F);
- if (HoistStat.first + HoistStat.second == 0)
- return Res;
-
- if (HoistStat.second > 0)
- // To address a limitation of the current GVN, we need to rerun the
- // hoisting after we hoisted loads or stores in order to be able to
- // hoist all scalars dependent on the hoisted ld/st.
- VN.clear();
-
- Res = true;
- }
-
- return Res;
- }
+ bool run(Function &F);
// Copied from NewGVN.cpp
// This function provides global ranking of operations so that we can place
@@ -304,27 +271,7 @@ public:
// for a complete ordering, as constants all have the same rank. However,
// generally, we will simplify an operation with all constants so that it
// doesn't matter what order they appear in.
- unsigned int rank(const Value *V) const {
- // Prefer constants to undef to anything else
- // Undef is a constant, have to check it first.
- // Prefer smaller constants to constantexprs
- if (isa<ConstantExpr>(V))
- return 2;
- if (isa<UndefValue>(V))
- return 1;
- if (isa<Constant>(V))
- return 0;
- else if (auto *A = dyn_cast<Argument>(V))
- return 3 + A->getArgNo();
-
- // Need to shift the instruction DFS by number of arguments + 3 to account
- // for the constant and argument ranking above.
- auto Result = DFSNumber.lookup(V);
- if (Result > 0)
- return 4 + NumFuncArgs + Result;
- // Unreachable or something else, just return a really large number.
- return ~0;
- }
+ unsigned int rank(const Value *V) const;
private:
GVN::ValueTable VN;
@@ -344,33 +291,7 @@ private:
enum InsKind { Unknown, Scalar, Load, Store };
// Return true when there is exception handling in BB.
- bool hasEH(const BasicBlock *BB) {
- auto It = BBSideEffects.find(BB);
- if (It != BBSideEffects.end())
- return It->second;
-
- if (BB->isEHPad() || BB->hasAddressTaken()) {
- BBSideEffects[BB] = true;
- return true;
- }
-
- if (BB->getTerminator()->mayThrow()) {
- BBSideEffects[BB] = true;
- return true;
- }
-
- BBSideEffects[BB] = false;
- return false;
- }
-
- // Return true when a successor of BB dominates A.
- bool successorDominate(const BasicBlock *BB, const BasicBlock *A) {
- for (const BasicBlock *Succ : successors(BB))
- if (DT->dominates(Succ, A))
- return true;
-
- return false;
- }
+ bool hasEH(const BasicBlock *BB);
// Return true when I1 appears before I2 in the instructions of BB.
bool firstInBB(const Instruction *I1, const Instruction *I2) {
@@ -383,57 +304,10 @@ private:
// Return true when there are memory uses of Def in BB.
bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
- const BasicBlock *BB) {
- const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB);
- if (!Acc)
- return false;
-
- Instruction *OldPt = Def->getMemoryInst();
- const BasicBlock *OldBB = OldPt->getParent();
- const BasicBlock *NewBB = NewPt->getParent();
- bool ReachedNewPt = false;
-
- for (const MemoryAccess &MA : *Acc)
- if (const MemoryUse *MU = dyn_cast<MemoryUse>(&MA)) {
- Instruction *Insn = MU->getMemoryInst();
-
- // Do not check whether MU aliases Def when MU occurs after OldPt.
- if (BB == OldBB && firstInBB(OldPt, Insn))
- break;
-
- // Do not check whether MU aliases Def when MU occurs before NewPt.
- if (BB == NewBB) {
- if (!ReachedNewPt) {
- if (firstInBB(Insn, NewPt))
- continue;
- ReachedNewPt = true;
- }
- }
- if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
- return true;
- }
-
- return false;
- }
+ const BasicBlock *BB);
bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
- int &NBBsOnAllPaths) {
- // Stop walk once the limit is reached.
- if (NBBsOnAllPaths == 0)
- return true;
-
- // Impossible to hoist with exceptions on the path.
- if (hasEH(BB))
- return true;
-
- // No such instruction after HoistBarrier in a basic block was
- // selected for hoisting so instructions selected within basic block with
- // a hoist barrier can be hoisted.
- if ((BB != SrcBB) && HoistBarrier.count(BB))
- return true;
-
- return false;
- }
+ int &NBBsOnAllPaths);
// Return true when there is exception handling or a load of memory Def
// between Def and NewPt. This function is only called for stores: Def is
@@ -443,118 +317,19 @@ private:
// return true when the counter NBBsOnAllPaths reaches 0, except when it is
// initialized to -1 which is unlimited.
bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
- int &NBBsOnAllPaths) {
- const BasicBlock *NewBB = NewPt->getParent();
- const BasicBlock *OldBB = Def->getBlock();
- assert(DT->dominates(NewBB, OldBB) && "invalid path");
- assert(DT->dominates(Def->getDefiningAccess()->getBlock(), NewBB) &&
- "def does not dominate new hoisting point");
-
- // Walk all basic blocks reachable in depth-first iteration on the inverse
- // CFG from OldBB to NewBB. These blocks are all the blocks that may be
- // executed between the execution of NewBB and OldBB. Hoisting an expression
- // from OldBB into NewBB has to be safe on all execution paths.
- for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
- const BasicBlock *BB = *I;
- if (BB == NewBB) {
- // Stop traversal when reaching HoistPt.
- I.skipChildren();
- continue;
- }
-
- if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
- return true;
-
- // Check that we do not move a store past loads.
- if (hasMemoryUse(NewPt, Def, BB))
- return true;
-
- // -1 is unlimited number of blocks on all paths.
- if (NBBsOnAllPaths != -1)
- --NBBsOnAllPaths;
-
- ++I;
- }
-
- return false;
- }
+ int &NBBsOnAllPaths);
// Return true when there is exception handling between HoistPt and BB.
// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
// return true when the counter NBBsOnAllPaths reaches 0, except when it is
// initialized to -1 which is unlimited.
bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
- int &NBBsOnAllPaths) {
- assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
-
- // Walk all basic blocks reachable in depth-first iteration on
- // the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
- // blocks that may be executed between the execution of NewHoistPt and
- // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
- // on all execution paths.
- for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
- const BasicBlock *BB = *I;
- if (BB == HoistPt) {
- // Stop traversal when reaching NewHoistPt.
- I.skipChildren();
- continue;
- }
-
- if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
- return true;
-
- // -1 is unlimited number of blocks on all paths.
- if (NBBsOnAllPaths != -1)
- --NBBsOnAllPaths;
-
- ++I;
- }
-
- return false;
- }
+ int &NBBsOnAllPaths);
// Return true when it is safe to hoist a memory load or store U from OldPt
// to NewPt.
bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
- MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
- // In place hoisting is safe.
- if (NewPt == OldPt)
- return true;
-
- const BasicBlock *NewBB = NewPt->getParent();
- const BasicBlock *OldBB = OldPt->getParent();
- const BasicBlock *UBB = U->getBlock();
-
- // Check for dependences on the Memory SSA.
- MemoryAccess *D = U->getDefiningAccess();
- BasicBlock *DBB = D->getBlock();
- if (DT->properlyDominates(NewBB, DBB))
- // Cannot move the load or store to NewBB above its definition in DBB.
- return false;
-
- if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
- if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
- if (!firstInBB(UD->getMemoryInst(), NewPt))
- // Cannot move the load or store to NewPt above its definition in D.
- return false;
-
- // Check for unsafe hoistings due to side effects.
- if (K == InsKind::Store) {
- if (hasEHOrLoadsOnPath(NewPt, cast<MemoryDef>(U), NBBsOnAllPaths))
- return false;
- } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
- return false;
-
- if (UBB == NewBB) {
- if (DT->properlyDominates(DBB, NewBB))
- return true;
- assert(UBB == DBB);
- assert(MSSA->locallyDominates(D, U));
- }
-
- // No side effects: it is safe to hoist.
- return true;
- }
+ MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths);
// Return true when it is safe to hoist scalar instructions from all blocks in
// WL to HoistBB.
@@ -577,92 +352,21 @@ private:
// Returns the edge via which an instruction in BB will get the values.
// Returns true when the values are flowing out to each edge.
- bool valueAnticipable(CHIArgs C, Instruction *TI) const {
- if (TI->getNumSuccessors() > (unsigned)size(C))
- return false; // Not enough args in this CHI.
-
- for (auto CHI : C) {
- BasicBlock *Dest = CHI.Dest;
- // Find if all the edges have values flowing out of BB.
- bool Found = llvm::any_of(
- successors(TI), [Dest](const BasicBlock *BB) { return BB == Dest; });
- if (!Found)
- return false;
- }
- return true;
- }
+ bool valueAnticipable(CHIArgs C, Instruction *TI) const;
// Check if it is safe to hoist values tracked by CHI in the range
// [Begin, End) and accumulate them in Safe.
void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
- SmallVectorImpl<CHIArg> &Safe) {
- int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
- for (auto CHI : C) {
- Instruction *Insn = CHI.I;
- if (!Insn) // No instruction was inserted in this CHI.
- continue;
- if (K == InsKind::Scalar) {
- if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths))
- Safe.push_back(CHI);
- } else {
- MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn);
- if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths))
- Safe.push_back(CHI);
- }
- }
- }
+ SmallVectorImpl<CHIArg> &Safe);
using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;
// Push all the VNs corresponding to BB into RenameStack.
void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
- RenameStackType &RenameStack) {
- auto it1 = ValueBBs.find(BB);
- if (it1 != ValueBBs.end()) {
- // Iterate in reverse order to keep lower ranked values on the top.
- for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
- // Get the value of instruction I
- LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
- RenameStack[VI.first].push_back(VI.second);
- }
- }
- }
+ RenameStackType &RenameStack);
void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
- RenameStackType &RenameStack) {
- // For each *predecessor* (because Post-DOM) of BB check if it has a CHI
- for (auto Pred : predecessors(BB)) {
- auto P = CHIBBs.find(Pred);
- if (P == CHIBBs.end()) {
- continue;
- }
- LLVM_DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
- // A CHI is found (BB -> Pred is an edge in the CFG)
- // Pop the stack until Top(V) = Ve.
- auto &VCHI = P->second;
- for (auto It = VCHI.begin(), E = VCHI.end(); It != E;) {
- CHIArg &C = *It;
- if (!C.Dest) {
- auto si = RenameStack.find(C.VN);
- // The Basic Block where CHI is must dominate the value we want to
- // track in a CHI. In the PDom walk, there can be values in the
- // stack which are not control dependent e.g., nested loop.
- if (si != RenameStack.end() && si->second.size() &&
- DT->properlyDominates(Pred, si->second.back()->getParent())) {
- C.Dest = BB; // Assign the edge
- C.I = si->second.pop_back_val(); // Assign the argument
- LLVM_DEBUG(dbgs()
- << "\nCHI Inserted in BB: " << C.Dest->getName() << *C.I
- << ", VN: " << C.VN.first << ", " << C.VN.second);
- }
- // Move to next CHI of a different value
- It = std::find_if(It, VCHI.end(),
- [It](CHIArg &A) { return A != *It; });
- } else
- ++It;
- }
- }
- }
+ RenameStackType &RenameStack);
// Walk the post-dominator tree top-down and use a stack for each value to
// store the last value you see. When you hit a CHI from a given edge, the
@@ -692,48 +396,7 @@ private:
// they form a list of anticipable values. OutValues contains CHIs
// corresponding to each basic block.
void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
- HoistingPointList &HPL) {
- auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; };
-
- // CHIArgs now have the outgoing values, so check for anticipability and
- // accumulate hoistable candidates in HPL.
- for (std::pair<BasicBlock *, SmallVector<CHIArg, 2>> &A : CHIBBs) {
- BasicBlock *BB = A.first;
- SmallVectorImpl<CHIArg> &CHIs = A.second;
- // Vector of PHIs contains PHIs for different instructions.
- // Sort the args according to their VNs, such that identical
- // instructions are together.
- llvm::stable_sort(CHIs, cmpVN);
- auto TI = BB->getTerminator();
- auto B = CHIs.begin();
- // [PreIt, PHIIt) form a range of CHIs which have identical VNs.
- auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(),
- [B](CHIArg &A) { return A != *B; });
- auto PrevIt = CHIs.begin();
- while (PrevIt != PHIIt) {
- // Collect values which satisfy safety checks.
- SmallVector<CHIArg, 2> Safe;
- // We check for safety first because there might be multiple values in
- // the same path, some of which are not safe to be hoisted, but overall
- // each edge has at least one value which can be hoisted, making the
- // value anticipable along that path.
- checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe);
-
- // List of safe values should be anticipable at TI.
- if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) {
- HPL.push_back({BB, SmallVecInsn()});
- SmallVecInsn &V = HPL.back().second;
- for (auto B : Safe)
- V.push_back(B.I);
- }
-
- // Check other VNs
- PrevIt = PHIIt;
- PHIIt = std::find_if(PrevIt, CHIs.end(),
- [PrevIt](CHIArg &A) { return A != *PrevIt; });
- }
- }
- }
+ HoistingPointList &HPL);
// Compute insertion points for each value which can be fully anticipated at
// a dominator. HPL contains all such values.
@@ -791,14 +454,14 @@ private:
}
// Insert empty CHI node for this VN. This is used to factor out
// basic blocks where the ANTIC can potentially change.
- for (auto IDFB : IDFBlocks) {
+ CHIArg EmptyChi = {VN, nullptr, nullptr};
+ for (auto *IDFBB : IDFBlocks) {
for (unsigned i = 0; i < V.size(); ++i) {
- CHIArg C = {VN, nullptr, nullptr};
- // Ignore spurious PDFs.
- if (DT->properlyDominates(IDFB, V[i]->getParent())) {
- OutValue[IDFB].push_back(C);
- LLVM_DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
- << ", for Insn: " << *V[i]);
+ // Ignore spurious PDFs.
+ if (DT->properlyDominates(IDFBB, V[i]->getParent())) {
+ OutValue[IDFBB].push_back(EmptyChi);
+ LLVM_DEBUG(dbgs() << "\nInserting a CHI for BB: "
+ << IDFBB->getName() << ", for Insn: " << *V[i]);
}
}
}
@@ -816,364 +479,754 @@ private:
// a load without hoisting its access function. So before hoisting any
// expression, make sure that all its operands are available at insert point.
bool allOperandsAvailable(const Instruction *I,
- const BasicBlock *HoistPt) const {
- for (const Use &Op : I->operands())
- if (const auto *Inst = dyn_cast<Instruction>(&Op))
- if (!DT->dominates(Inst->getParent(), HoistPt))
- return false;
-
- return true;
- }
+ const BasicBlock *HoistPt) const;
// Same as allOperandsAvailable with recursive check for GEP operands.
bool allGepOperandsAvailable(const Instruction *I,
- const BasicBlock *HoistPt) const {
- for (const Use &Op : I->operands())
- if (const auto *Inst = dyn_cast<Instruction>(&Op))
- if (!DT->dominates(Inst->getParent(), HoistPt)) {
- if (const GetElementPtrInst *GepOp =
- dyn_cast<GetElementPtrInst>(Inst)) {
- if (!allGepOperandsAvailable(GepOp, HoistPt))
- return false;
- // Gep is available if all operands of GepOp are available.
- } else {
- // Gep is not available if it has operands other than GEPs that are
- // defined in blocks not dominating HoistPt.
- return false;
- }
- }
- return true;
- }
+ const BasicBlock *HoistPt) const;
// Make all operands of the GEP available.
void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
const SmallVecInsn &InstructionsToHoist,
- Instruction *Gep) const {
- assert(allGepOperandsAvailable(Gep, HoistPt) &&
- "GEP operands not available");
-
- Instruction *ClonedGep = Gep->clone();
- for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
- if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
- // Check whether the operand is already available.
- if (DT->dominates(Op->getParent(), HoistPt))
- continue;
+ Instruction *Gep) const;
+
+ void updateAlignment(Instruction *I, Instruction *Repl);
+
+ // Remove all the instructions in Candidates and replace their usage with
+ // Repl. Returns the number of instructions removed.
+ unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl,
+ MemoryUseOrDef *NewMemAcc);
+
+ // Replace all Memory PHI usage with NewMemAcc.
+ void raMPHIuw(MemoryUseOrDef *NewMemAcc);
+
+ // Remove all other instructions and replace them with Repl.
+ unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl,
+ BasicBlock *DestBB, bool MoveAccess);
+
+ // In the case Repl is a load or a store, we make all their GEPs
+ // available: GEPs are not hoisted by default to avoid hoisting the address
+ // computations without the associated load or store.
+ bool makeGepOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist) const;
+
+ std::pair<unsigned, unsigned> hoist(HoistingPointList &HPL);
+
+ // Hoist all expressions. Returns the number of scalars hoisted
+ // and the number of non-scalars hoisted.
+ std::pair<unsigned, unsigned> hoistExpressions(Function &F);
+};
+
+class GVNHoistLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNHoistLegacyPass() : FunctionPass(ID) {
+ initializeGVNHoistLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+
+ GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+bool GVNHoist::run(Function &F) {
+ NumFuncArgs = F.arg_size();
+ VN.setDomTree(DT);
+ VN.setAliasAnalysis(AA);
+ VN.setMemDep(MD);
+ bool Res = false;
+ // Perform DFS Numbering of instructions.
+ unsigned BBI = 0;
+ for (const BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ DFSNumber[BB] = ++BBI;
+ unsigned I = 0;
+ for (auto &Inst : *BB)
+ DFSNumber[&Inst] = ++I;
+ }
+
+ int ChainLength = 0;
+
+ // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
+ while (true) {
+ if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength)
+ return Res;
+
+ auto HoistStat = hoistExpressions(F);
+ if (HoistStat.first + HoistStat.second == 0)
+ return Res;
+
+ if (HoistStat.second > 0)
+ // To address a limitation of the current GVN, we need to rerun the
+ // hoisting after we hoisted loads or stores in order to be able to
+ // hoist all scalars dependent on the hoisted ld/st.
+ VN.clear();
+
+ Res = true;
+ }
+
+ return Res;
+}
+
+unsigned int GVNHoist::rank(const Value *V) const {
+ // Prefer constants to undef to anything else
+ // Undef is a constant, have to check it first.
+ // Prefer smaller constants to constantexprs
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+ // Need to shift the instruction DFS by number of arguments + 3 to account
+ // for the constant and argument ranking above.
+ auto Result = DFSNumber.lookup(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+bool GVNHoist::hasEH(const BasicBlock *BB) {
+ auto It = BBSideEffects.find(BB);
+ if (It != BBSideEffects.end())
+ return It->second;
+
+ if (BB->isEHPad() || BB->hasAddressTaken()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ if (BB->getTerminator()->mayThrow()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ BBSideEffects[BB] = false;
+ return false;
+}
+
+bool GVNHoist::hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
+ const BasicBlock *BB) {
+ const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB);
+ if (!Acc)
+ return false;
+
+ Instruction *OldPt = Def->getMemoryInst();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *NewBB = NewPt->getParent();
+ bool ReachedNewPt = false;
- // As a GEP can refer to other GEPs, recursively make all the operands
- // of this GEP available at HoistPt.
- if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
- makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
+ for (const MemoryAccess &MA : *Acc)
+ if (const MemoryUse *MU = dyn_cast<MemoryUse>(&MA)) {
+ Instruction *Insn = MU->getMemoryInst();
+
+ // Do not check whether MU aliases Def when MU occurs after OldPt.
+ if (BB == OldBB && firstInBB(OldPt, Insn))
+ break;
+
+ // Do not check whether MU aliases Def when MU occurs before NewPt.
+ if (BB == NewBB) {
+ if (!ReachedNewPt) {
+ if (firstInBB(Insn, NewPt))
+ continue;
+ ReachedNewPt = true;
+ }
}
+ if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
+ return true;
+ }
+
+ return false;
+}
+
+bool GVNHoist::hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
+ int &NBBsOnAllPaths) {
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(BB))
+ return true;
+
+ // No instruction after a HoistBarrier in a basic block was selected for
+ // hoisting, so instructions selected within a basic block that contains a
+ // hoist barrier can still be hoisted.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
+ return true;
+
+ return false;
+}
- // Copy Gep and replace its uses in Repl with ClonedGep.
- ClonedGep->insertBefore(HoistPt->getTerminator());
-
- // Conservatively discard any optimization hints, they may differ on the
- // other paths.
- ClonedGep->dropUnknownNonDebugMetadata();
-
- // If we have optimization hints which agree with each other along different
- // paths, preserve them.
- for (const Instruction *OtherInst : InstructionsToHoist) {
- const GetElementPtrInst *OtherGep;
- if (auto *OtherLd = dyn_cast<LoadInst>(OtherInst))
- OtherGep = cast<GetElementPtrInst>(OtherLd->getPointerOperand());
- else
- OtherGep = cast<GetElementPtrInst>(
- cast<StoreInst>(OtherInst)->getPointerOperand());
- ClonedGep->andIRFlags(OtherGep);
+bool GVNHoist::hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
+ int &NBBsOnAllPaths) {
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = Def->getBlock();
+ assert(DT->dominates(NewBB, OldBB) && "invalid path");
+ assert(DT->dominates(Def->getDefiningAccess()->getBlock(), NewBB) &&
+ "def does not dominate new hoisting point");
+
+ // Walk all basic blocks reachable in depth-first iteration on the inverse
+ // CFG from OldBB to NewBB. These blocks are all the blocks that may be
+ // executed between the execution of NewBB and OldBB. Hoisting an expression
+ // from OldBB into NewBB has to be safe on all execution paths.
+ for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == NewBB) {
+ // Stop traversal when reaching HoistPt.
+ I.skipChildren();
+ continue;
}
- // Replace uses of Gep with ClonedGep in Repl.
- Repl->replaceUsesOfWith(Gep, ClonedGep);
+ if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
+ return true;
+
+ // Check that we do not move a store past loads.
+ if (hasMemoryUse(NewPt, Def, BB))
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
}
- void updateAlignment(Instruction *I, Instruction *Repl) {
- if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
- ReplacementLoad->setAlignment(
- std::min(ReplacementLoad->getAlign(), cast<LoadInst>(I)->getAlign()));
- ++NumLoadsRemoved;
- } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
- ReplacementStore->setAlignment(std::min(ReplacementStore->getAlign(),
- cast<StoreInst>(I)->getAlign()));
- ++NumStoresRemoved;
- } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
- ReplacementAlloca->setAlignment(std::max(
- ReplacementAlloca->getAlign(), cast<AllocaInst>(I)->getAlign()));
- } else if (isa<CallInst>(Repl)) {
- ++NumCallsRemoved;
+ return false;
+}
+
+bool GVNHoist::hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
+ int &NBBsOnAllPaths) {
+ assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
+
+ // Walk all basic blocks reachable in depth-first iteration on
+ // the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
+ // blocks that may be executed between the execution of NewHoistPt and
+ // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
+ // on all execution paths.
+ for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == HoistPt) {
+ // Stop traversal when reaching NewHoistPt.
+ I.skipChildren();
+ continue;
}
+
+ if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
}
- // Remove all the instructions in Candidates and replace their usage with Repl.
- // Returns the number of instructions removed.
- unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl,
- MemoryUseOrDef *NewMemAcc) {
- unsigned NR = 0;
- for (Instruction *I : Candidates) {
- if (I != Repl) {
- ++NR;
- updateAlignment(I, Repl);
- if (NewMemAcc) {
- // Update the uses of the old MSSA access with NewMemAcc.
- MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
- OldMA->replaceAllUsesWith(NewMemAcc);
- MSSAUpdater->removeMemoryAccess(OldMA);
- }
+ return false;
+}
- Repl->andIRFlags(I);
- combineKnownMetadata(Repl, I);
- I->replaceAllUsesWith(Repl);
- // Also invalidate the Alias Analysis cache.
- MD->removeInstruction(I);
- I->eraseFromParent();
- }
+bool GVNHoist::safeToHoistLdSt(const Instruction *NewPt,
+ const Instruction *OldPt, MemoryUseOrDef *U,
+ GVNHoist::InsKind K, int &NBBsOnAllPaths) {
+ // In place hoisting is safe.
+ if (NewPt == OldPt)
+ return true;
+
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *UBB = U->getBlock();
+
+ // Check for dependences on the Memory SSA.
+ MemoryAccess *D = U->getDefiningAccess();
+ BasicBlock *DBB = D->getBlock();
+ if (DT->properlyDominates(NewBB, DBB))
+ // Cannot move the load or store to NewBB above its definition in DBB.
+ return false;
+
+ if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
+ if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
+ if (!firstInBB(UD->getMemoryInst(), NewPt))
+ // Cannot move the load or store to NewPt above its definition in D.
+ return false;
+
+ // Check for unsafe hoistings due to side effects.
+ if (K == InsKind::Store) {
+ if (hasEHOrLoadsOnPath(NewPt, cast<MemoryDef>(U), NBBsOnAllPaths))
+ return false;
+ } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
+ return false;
+
+ if (UBB == NewBB) {
+ if (DT->properlyDominates(DBB, NewBB))
+ return true;
+ assert(UBB == DBB);
+ assert(MSSA->locallyDominates(D, U));
+ }
+
+ // No side effects: it is safe to hoist.
+ return true;
+}
+
+bool GVNHoist::valueAnticipable(CHIArgs C, Instruction *TI) const {
+ if (TI->getNumSuccessors() > (unsigned)size(C))
+ return false; // Not enough args in this CHI.
+
+ for (auto CHI : C) {
+ // Find if all the edges have values flowing out of BB.
+ if (!llvm::is_contained(successors(TI), CHI.Dest))
+ return false;
+ }
+ return true;
+}
+
+void GVNHoist::checkSafety(CHIArgs C, BasicBlock *BB, GVNHoist::InsKind K,
+ SmallVectorImpl<CHIArg> &Safe) {
+ int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+ for (auto CHI : C) {
+ Instruction *Insn = CHI.I;
+ if (!Insn) // No instruction was inserted in this CHI.
+ continue;
+ if (K == InsKind::Scalar) {
+ if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths))
+ Safe.push_back(CHI);
+ } else {
+ auto *T = BB->getTerminator();
+ if (MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn))
+ if (safeToHoistLdSt(T, Insn, UD, K, NumBBsOnAllPaths))
+ Safe.push_back(CHI);
}
- return NR;
}
+}
- // Replace all Memory PHI usage with NewMemAcc.
- void raMPHIuw(MemoryUseOrDef *NewMemAcc) {
- SmallPtrSet<MemoryPhi *, 4> UsePhis;
- for (User *U : NewMemAcc->users())
- if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
- UsePhis.insert(Phi);
-
- for (MemoryPhi *Phi : UsePhis) {
- auto In = Phi->incoming_values();
- if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
- Phi->replaceAllUsesWith(NewMemAcc);
- MSSAUpdater->removeMemoryAccess(Phi);
- }
+void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
+ GVNHoist::RenameStackType &RenameStack) {
+ auto it1 = ValueBBs.find(BB);
+ if (it1 != ValueBBs.end()) {
+ // Iterate in reverse order to keep lower ranked values on the top.
+ for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
+ // Get the value of instruction I
+ LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
+ RenameStack[VI.first].push_back(VI.second);
}
}
+}
- // Remove all other instructions and replace them with Repl.
- unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl,
- BasicBlock *DestBB, bool MoveAccess) {
- MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl);
- if (MoveAccess && NewMemAcc) {
- // The definition of this ld/st will not change: ld/st hoisting is
- // legal when the ld/st is not moved past its current definition.
- MSSAUpdater->moveToPlace(NewMemAcc, DestBB,
- MemorySSA::BeforeTerminator);
+void GVNHoist::fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
+ GVNHoist::RenameStackType &RenameStack) {
+ // For each *predecessor* (because Post-DOM) of BB check if it has a CHI
+ for (auto Pred : predecessors(BB)) {
+ auto P = CHIBBs.find(Pred);
+ if (P == CHIBBs.end()) {
+ continue;
}
+ LLVM_DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
+ // A CHI is found (BB -> Pred is an edge in the CFG)
+ // Pop the stack until Top(V) = Ve.
+ auto &VCHI = P->second;
+ for (auto It = VCHI.begin(), E = VCHI.end(); It != E;) {
+ CHIArg &C = *It;
+ if (!C.Dest) {
+ auto si = RenameStack.find(C.VN);
+ // The Basic Block where CHI is must dominate the value we want to
+ // track in a CHI. In the PDom walk, there can be values in the
+ // stack which are not control dependent e.g., nested loop.
+ if (si != RenameStack.end() && si->second.size() &&
+ DT->properlyDominates(Pred, si->second.back()->getParent())) {
+ C.Dest = BB; // Assign the edge
+ C.I = si->second.pop_back_val(); // Assign the argument
+ LLVM_DEBUG(dbgs()
+ << "\nCHI Inserted in BB: " << C.Dest->getName() << *C.I
+ << ", VN: " << C.VN.first << ", " << C.VN.second);
+ }
+ // Move to next CHI of a different value
+ It = std::find_if(It, VCHI.end(), [It](CHIArg &A) { return A != *It; });
+ } else
+ ++It;
+ }
+ }
+}
- // Replace all other instructions with Repl with memory access NewMemAcc.
- unsigned NR = rauw(Candidates, Repl, NewMemAcc);
+void GVNHoist::findHoistableCandidates(OutValuesType &CHIBBs,
+ GVNHoist::InsKind K,
+ HoistingPointList &HPL) {
+ auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; };
+
+ // CHIArgs now have the outgoing values, so check for anticipability and
+ // accumulate hoistable candidates in HPL.
+ for (std::pair<BasicBlock *, SmallVector<CHIArg, 2>> &A : CHIBBs) {
+ BasicBlock *BB = A.first;
+ SmallVectorImpl<CHIArg> &CHIs = A.second;
+ // Vector of PHIs contains PHIs for different instructions.
+ // Sort the args according to their VNs, such that identical
+ // instructions are together.
+ llvm::stable_sort(CHIs, cmpVN);
+ auto TI = BB->getTerminator();
+ auto B = CHIs.begin();
+ // [PrevIt, PHIIt) forms a range of CHIs which have identical VNs.
+ auto PHIIt = llvm::find_if(CHIs, [B](CHIArg &A) { return A != *B; });
+ auto PrevIt = CHIs.begin();
+ while (PrevIt != PHIIt) {
+ // Collect values which satisfy safety checks.
+ SmallVector<CHIArg, 2> Safe;
+ // We check for safety first because there might be multiple values in
+ // the same path, some of which are not safe to be hoisted, but overall
+ // each edge has at least one value which can be hoisted, making the
+ // value anticipable along that path.
+ checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe);
+
+ // List of safe values should be anticipable at TI.
+ if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) {
+ HPL.push_back({BB, SmallVecInsn()});
+ SmallVecInsn &V = HPL.back().second;
+ for (auto B : Safe)
+ V.push_back(B.I);
+ }
- // Remove MemorySSA phi nodes with the same arguments.
- if (NewMemAcc)
- raMPHIuw(NewMemAcc);
- return NR;
+ // Check other VNs
+ PrevIt = PHIIt;
+ PHIIt = std::find_if(PrevIt, CHIs.end(),
+ [PrevIt](CHIArg &A) { return A != *PrevIt; });
+ }
}
+}
- // In the case Repl is a load or a store, we make all their GEPs
- // available: GEPs are not hoisted by default to avoid the address
- // computations to be hoisted without the associated load or store.
- bool makeGepOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt,
- const SmallVecInsn &InstructionsToHoist) const {
- // Check whether the GEP of a ld/st can be synthesized at HoistPt.
- GetElementPtrInst *Gep = nullptr;
- Instruction *Val = nullptr;
- if (auto *Ld = dyn_cast<LoadInst>(Repl)) {
- Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand());
- } else if (auto *St = dyn_cast<StoreInst>(Repl)) {
- Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand());
- Val = dyn_cast<Instruction>(St->getValueOperand());
- // Check that the stored value is available.
- if (Val) {
- if (isa<GetElementPtrInst>(Val)) {
- // Check whether we can compute the GEP at HoistPt.
- if (!allGepOperandsAvailable(Val, HoistPt))
+bool GVNHoist::allOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt))
+ return false;
+
+ return true;
+}
+
+bool GVNHoist::allGepOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt)) {
+ if (const GetElementPtrInst *GepOp =
+ dyn_cast<GetElementPtrInst>(Inst)) {
+ if (!allGepOperandsAvailable(GepOp, HoistPt))
return false;
- } else if (!DT->dominates(Val->getParent(), HoistPt))
+ // Gep is available if all operands of GepOp are available.
+ } else {
+ // Gep is not available if it has operands other than GEPs that are
+ // defined in blocks not dominating HoistPt.
return false;
+ }
}
- }
+ return true;
+}
- // Check whether we can compute the Gep at HoistPt.
- if (!Gep || !allGepOperandsAvailable(Gep, HoistPt))
- return false;
+void GVNHoist::makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist,
+ Instruction *Gep) const {
+ assert(allGepOperandsAvailable(Gep, HoistPt) && "GEP operands not available");
- makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);
+ Instruction *ClonedGep = Gep->clone();
+ for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
+ // Check whether the operand is already available.
+ if (DT->dominates(Op->getParent(), HoistPt))
+ continue;
- if (Val && isa<GetElementPtrInst>(Val))
- makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);
+ // As a GEP can refer to other GEPs, recursively make all the operands
+ // of this GEP available at HoistPt.
+ if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
+ makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
+ }
- return true;
- }
+ // Copy Gep and replace its uses in Repl with ClonedGep.
+ ClonedGep->insertBefore(HoistPt->getTerminator());
- std::pair<unsigned, unsigned> hoist(HoistingPointList &HPL) {
- unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
- for (const HoistingPointInfo &HP : HPL) {
- // Find out whether we already have one of the instructions in HoistPt,
- // in which case we do not have to move it.
- BasicBlock *DestBB = HP.first;
- const SmallVecInsn &InstructionsToHoist = HP.second;
- Instruction *Repl = nullptr;
- for (Instruction *I : InstructionsToHoist)
- if (I->getParent() == DestBB)
- // If there are two instructions in HoistPt to be hoisted in place:
- // update Repl to be the first one, such that we can rename the uses
- // of the second based on the first.
- if (!Repl || firstInBB(I, Repl))
- Repl = I;
-
- // Keep track of whether we moved the instruction so we know whether we
- // should move the MemoryAccess.
- bool MoveAccess = true;
- if (Repl) {
- // Repl is already in HoistPt: it remains in place.
- assert(allOperandsAvailable(Repl, DestBB) &&
- "instruction depends on operands that are not available");
- MoveAccess = false;
- } else {
- // When we do not find Repl in HoistPt, select the first in the list
- // and move it to HoistPt.
- Repl = InstructionsToHoist.front();
-
- // We can move Repl in HoistPt only when all operands are available.
- // The order in which hoistings are done may influence the availability
- // of operands.
- if (!allOperandsAvailable(Repl, DestBB)) {
- // When HoistingGeps there is nothing more we can do to make the
- // operands available: just continue.
- if (HoistingGeps)
- continue;
+ // Conservatively discard any optimization hints, they may differ on the
+ // other paths.
+ ClonedGep->dropUnknownNonDebugMetadata();
- // When not HoistingGeps we need to copy the GEPs.
- if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist))
- continue;
- }
+ // If we have optimization hints which agree with each other along different
+ // paths, preserve them.
+ for (const Instruction *OtherInst : InstructionsToHoist) {
+ const GetElementPtrInst *OtherGep;
+ if (auto *OtherLd = dyn_cast<LoadInst>(OtherInst))
+ OtherGep = cast<GetElementPtrInst>(OtherLd->getPointerOperand());
+ else
+ OtherGep = cast<GetElementPtrInst>(
+ cast<StoreInst>(OtherInst)->getPointerOperand());
+ ClonedGep->andIRFlags(OtherGep);
+ }
- // Move the instruction at the end of HoistPt.
- Instruction *Last = DestBB->getTerminator();
- MD->removeInstruction(Repl);
- Repl->moveBefore(Last);
+ // Replace uses of Gep with ClonedGep in Repl.
+ Repl->replaceUsesOfWith(Gep, ClonedGep);
+}
- DFSNumber[Repl] = DFSNumber[Last]++;
+void GVNHoist::updateAlignment(Instruction *I, Instruction *Repl) {
+ if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
+ ReplacementLoad->setAlignment(
+ std::min(ReplacementLoad->getAlign(), cast<LoadInst>(I)->getAlign()));
+ ++NumLoadsRemoved;
+ } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
+ ReplacementStore->setAlignment(
+ std::min(ReplacementStore->getAlign(), cast<StoreInst>(I)->getAlign()));
+ ++NumStoresRemoved;
+ } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
+ ReplacementAlloca->setAlignment(std::max(ReplacementAlloca->getAlign(),
+ cast<AllocaInst>(I)->getAlign()));
+ } else if (isa<CallInst>(Repl)) {
+ ++NumCallsRemoved;
+ }
+}
+
+unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl,
+ MemoryUseOrDef *NewMemAcc) {
+ unsigned NR = 0;
+ for (Instruction *I : Candidates) {
+ if (I != Repl) {
+ ++NR;
+ updateAlignment(I, Repl);
+ if (NewMemAcc) {
+ // Update the uses of the old MSSA access with NewMemAcc.
+ MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
+ OldMA->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMA);
}
- NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess);
+ Repl->andIRFlags(I);
+ combineKnownMetadata(Repl, I);
+ I->replaceAllUsesWith(Repl);
+ // Also invalidate the Alias Analysis cache.
+ MD->removeInstruction(I);
+ I->eraseFromParent();
+ }
+ }
+ return NR;
+}
- if (isa<LoadInst>(Repl))
- ++NL;
- else if (isa<StoreInst>(Repl))
- ++NS;
- else if (isa<CallInst>(Repl))
- ++NC;
- else // Scalar
- ++NI;
+void GVNHoist::raMPHIuw(MemoryUseOrDef *NewMemAcc) {
+ SmallPtrSet<MemoryPhi *, 4> UsePhis;
+ for (User *U : NewMemAcc->users())
+ if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
+ UsePhis.insert(Phi);
+
+ for (MemoryPhi *Phi : UsePhis) {
+ auto In = Phi->incoming_values();
+ if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
+ Phi->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(Phi);
}
+ }
+}
+
+unsigned GVNHoist::removeAndReplace(const SmallVecInsn &Candidates,
+ Instruction *Repl, BasicBlock *DestBB,
+ bool MoveAccess) {
+ MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl);
+ if (MoveAccess && NewMemAcc) {
+ // The definition of this ld/st will not change: ld/st hoisting is
+ // legal when the ld/st is not moved past its current definition.
+ MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::BeforeTerminator);
+ }
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
+ // Replace all other instructions with Repl with memory access NewMemAcc.
+ unsigned NR = rauw(Candidates, Repl, NewMemAcc);
- NumHoisted += NL + NS + NC + NI;
- NumRemoved += NR;
- NumLoadsHoisted += NL;
- NumStoresHoisted += NS;
- NumCallsHoisted += NC;
- return {NI, NL + NC + NS};
+ // Remove MemorySSA phi nodes with the same arguments.
+ if (NewMemAcc)
+ raMPHIuw(NewMemAcc);
+ return NR;
+}
+
+bool GVNHoist::makeGepOperandsAvailable(
+ Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist) const {
+ // Check whether the GEP of a ld/st can be synthesized at HoistPt.
+ GetElementPtrInst *Gep = nullptr;
+ Instruction *Val = nullptr;
+ if (auto *Ld = dyn_cast<LoadInst>(Repl)) {
+ Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand());
+ } else if (auto *St = dyn_cast<StoreInst>(Repl)) {
+ Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand());
+ Val = dyn_cast<Instruction>(St->getValueOperand());
+ // Check that the stored value is available.
+ if (Val) {
+ if (isa<GetElementPtrInst>(Val)) {
+ // Check whether we can compute the GEP at HoistPt.
+ if (!allGepOperandsAvailable(Val, HoistPt))
+ return false;
+ } else if (!DT->dominates(Val->getParent(), HoistPt))
+ return false;
+ }
}
- // Hoist all expressions. Returns Number of scalars hoisted
- // and number of non-scalars hoisted.
- std::pair<unsigned, unsigned> hoistExpressions(Function &F) {
- InsnInfo II;
- LoadInfo LI;
- StoreInfo SI;
- CallInfo CI;
- for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
- int InstructionNb = 0;
- for (Instruction &I1 : *BB) {
- // If I1 cannot guarantee progress, subsequent instructions
- // in BB cannot be hoisted anyways.
- if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
- HoistBarrier.insert(BB);
- break;
- }
- // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
- // deeper may increase the register pressure and compilation time.
- if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
- break;
+ // Check whether we can compute the Gep at HoistPt.
+ if (!Gep || !allGepOperandsAvailable(Gep, HoistPt))
+ return false;
- // Do not value number terminator instructions.
- if (I1.isTerminator())
- break;
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);
- if (auto *Load = dyn_cast<LoadInst>(&I1))
- LI.insert(Load, VN);
- else if (auto *Store = dyn_cast<StoreInst>(&I1))
- SI.insert(Store, VN);
- else if (auto *Call = dyn_cast<CallInst>(&I1)) {
- if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
- if (isa<DbgInfoIntrinsic>(Intr) ||
- Intr->getIntrinsicID() == Intrinsic::assume ||
- Intr->getIntrinsicID() == Intrinsic::sideeffect)
- continue;
- }
- if (Call->mayHaveSideEffects())
- break;
-
- if (Call->isConvergent())
- break;
-
- CI.insert(Call, VN);
- } else if (HoistingGeps || !isa<GetElementPtrInst>(&I1))
- // Do not hoist scalars past calls that may write to memory because
- // that could result in spills later. geps are handled separately.
- // TODO: We can relax this for targets like AArch64 as they have more
- // registers than X86.
- II.insert(&I1, VN);
+ if (Val && isa<GetElementPtrInst>(Val))
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);
+
+ return true;
+}
+
+std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
+ unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
+ for (const HoistingPointInfo &HP : HPL) {
+ // Find out whether we already have one of the instructions in HoistPt,
+ // in which case we do not have to move it.
+ BasicBlock *DestBB = HP.first;
+ const SmallVecInsn &InstructionsToHoist = HP.second;
+ Instruction *Repl = nullptr;
+ for (Instruction *I : InstructionsToHoist)
+ if (I->getParent() == DestBB)
+ // If there are two instructions in HoistPt to be hoisted in place:
+ // update Repl to be the first one, such that we can rename the uses
+ // of the second based on the first.
+ if (!Repl || firstInBB(I, Repl))
+ Repl = I;
+
+ // Keep track of whether we moved the instruction so we know whether we
+ // should move the MemoryAccess.
+ bool MoveAccess = true;
+ if (Repl) {
+ // Repl is already in HoistPt: it remains in place.
+ assert(allOperandsAvailable(Repl, DestBB) &&
+ "instruction depends on operands that are not available");
+ MoveAccess = false;
+ } else {
+ // When we do not find Repl in HoistPt, select the first in the list
+ // and move it to HoistPt.
+ Repl = InstructionsToHoist.front();
+
+ // We can move Repl in HoistPt only when all operands are available.
+ // The order in which hoistings are done may influence the availability
+ // of operands.
+ if (!allOperandsAvailable(Repl, DestBB)) {
+ // When HoistingGeps there is nothing more we can do to make the
+ // operands available: just continue.
+ if (HoistingGeps)
+ continue;
+
+ // When not HoistingGeps we need to copy the GEPs.
+ if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist))
+ continue;
}
+
+ // Move the instruction at the end of HoistPt.
+ Instruction *Last = DestBB->getTerminator();
+ MD->removeInstruction(Repl);
+ Repl->moveBefore(Last);
+
+ DFSNumber[Repl] = DFSNumber[Last]++;
}
- HoistingPointList HPL;
- computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
- computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
- computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store);
- computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar);
- computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load);
- computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store);
- return hoist(HPL);
+ NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess);
+
+ if (isa<LoadInst>(Repl))
+ ++NL;
+ else if (isa<StoreInst>(Repl))
+ ++NS;
+ else if (isa<CallInst>(Repl))
+ ++NC;
+ else // Scalar
+ ++NI;
}
-};
-class GVNHoistLegacyPass : public FunctionPass {
-public:
- static char ID;
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
- GVNHoistLegacyPass() : FunctionPass(ID) {
- initializeGVNHoistLegacyPassPass(*PassRegistry::getPassRegistry());
- }
+ NumHoisted += NL + NS + NC + NI;
+ NumRemoved += NR;
+ NumLoadsHoisted += NL;
+ NumStoresHoisted += NS;
+ NumCallsHoisted += NC;
+ return {NI, NL + NC + NS};
+}
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
- auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
+ InsnInfo II;
+ LoadInfo LI;
+ StoreInfo SI;
+ CallInfo CI;
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ int InstructionNb = 0;
+ for (Instruction &I1 : *BB) {
+ // If I1 cannot guarantee progress, subsequent instructions
+ // in BB cannot be hoisted anyway.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+ HoistBarrier.insert(BB);
+ break;
+ }
+ // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
+ // deeper may increase the register pressure and compilation time.
+ if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
+ break;
+
+ // Do not value number terminator instructions.
+ if (I1.isTerminator())
+ break;
+
+ if (auto *Load = dyn_cast<LoadInst>(&I1))
+ LI.insert(Load, VN);
+ else if (auto *Store = dyn_cast<StoreInst>(&I1))
+ SI.insert(Store, VN);
+ else if (auto *Call = dyn_cast<CallInst>(&I1)) {
+ if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
+ if (isa<DbgInfoIntrinsic>(Intr) ||
+ Intr->getIntrinsicID() == Intrinsic::assume ||
+ Intr->getIntrinsicID() == Intrinsic::sideeffect)
+ continue;
+ }
+ if (Call->mayHaveSideEffects())
+ break;
- GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
- return G.run(F);
- }
+ if (Call->isConvergent())
+ break;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
+ CI.insert(Call, VN);
+ } else if (HoistingGeps || !isa<GetElementPtrInst>(&I1))
+ // Do not hoist scalars past calls that may write to memory because
+ // that could result in spills later. GEPs are handled separately.
+ // TODO: We can relax this for targets like AArch64 as they have more
+ // registers than X86.
+ II.insert(&I1, VN);
+ }
}
-};
+
+ HoistingPointList HPL;
+ computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store);
+ computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store);
+ return hoist(HPL);
+}
} // end namespace llvm
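
A minimal standalone sketch of the per-block scan performed by hoistExpressions above, using an illustrative Instr struct in place of llvm::Instruction (the real code also records the block in HoistBarrier and routes loads, stores, calls and scalars into separate value-number tables):

#include <cstdio>
#include <vector>

// Illustrative stand-ins only: the real pass works on llvm::Instruction and
// queries isGuaranteedToTransferExecutionToSuccessor(), isTerminator(), etc.
struct Instr {
  const char *Name;
  bool MayNotTransferExecution; // e.g. a call that may throw or never return
  bool IsTerminator;
};

// Sketch of the per-block scan: collect value-numbering candidates until a
// hoist barrier, the depth limit, or the block terminator is reached.
static std::vector<const Instr *>
collectHoistCandidates(const std::vector<Instr> &BB, int MaxDepthInBB) {
  std::vector<const Instr *> Candidates;
  int InstructionNb = 0;
  for (const Instr &I : BB) {
    if (I.MayNotTransferExecution) // barrier: later instructions must stay put
      break;
    if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
      break;                       // bound register pressure and compile time
    if (I.IsTerminator)            // terminators are never value numbered
      break;
    Candidates.push_back(&I);
  }
  return Candidates;
}

int main() {
  std::vector<Instr> BB = {{"a", false, false},
                           {"maythrow.call", true, false}, // hoist barrier
                           {"b", false, false},
                           {"br", false, true}};
  for (const Instr *I : collectHoistCandidates(BB, /*MaxDepthInBB=*/100))
    std::printf("candidate: %s\n", I->Name); // prints only "a"
  return 0;
}
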
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index dfb4b7e038ba..aef927ab6558 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -158,8 +158,7 @@ public:
void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) {
for (auto II = Insts.begin(); II != Insts.end();) {
- if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) ==
- Blocks.end()) {
+ if (!llvm::is_contained(Blocks, (*II)->getParent())) {
ActiveBlocks.remove((*II)->getParent());
II = Insts.erase(II);
} else {
@@ -277,8 +276,7 @@ public:
auto VI = Values.begin();
while (BI != Blocks.end()) {
assert(VI != Values.end());
- if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) ==
- NewBlocks.end()) {
+ if (!llvm::is_contained(NewBlocks, *BI)) {
BI = Blocks.erase(BI);
VI = Values.erase(VI);
} else {
@@ -694,10 +692,8 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
ModelledPHI NewPHI(NewInsts, ActivePreds);
// Does sinking this instruction render previous PHIs redundant?
- if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) {
- NeededPHIs.erase(NewPHI);
+ if (NeededPHIs.erase(NewPHI))
RecomputePHIContents = true;
- }
if (RecomputePHIContents) {
// The needed PHIs have changed, so recompute the set of all needed
@@ -758,8 +754,7 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
Cand.NumMemoryInsts = MemoryInstNum;
Cand.NumBlocks = ActivePreds.size();
Cand.NumPHIs = NeededPHIs.size();
- for (auto *C : ActivePreds)
- Cand.Blocks.push_back(C);
+ append_range(Cand.Blocks, ActivePreds);
return Cand;
}
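
The GVNSink hunks above are mechanical cleanups onto LLVM's range helpers. A rough standard-library rendering of the idioms involved (hand-written approximations, not the llvm/ADT/STLExtras.h definitions):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <set>
#include <vector>

// Approximation of llvm::is_contained.
template <typename Range, typename T>
bool is_contained(const Range &R, const T &Val) {
  return std::find(std::begin(R), std::end(R), Val) != std::end(R);
}

// Approximation of llvm::append_range.
template <typename Container, typename Range>
void append_range(Container &C, const Range &R) {
  C.insert(C.end(), std::begin(R), std::end(R));
}

int main() {
  std::vector<int> Blocks = {1, 2, 3};
  assert(is_contained(Blocks, 2) && !is_contained(Blocks, 7));

  // "if (NeededPHIs.erase(NewPHI)) ..." works because set::erase returns the
  // number of elements removed, folding the find-then-erase pair into one call.
  std::set<int> NeededPHIs = {10, 20};
  bool Recompute = NeededPHIs.erase(20) != 0;
  assert(Recompute && NeededPHIs.size() == 1);

  // append_range replaces the explicit push_back loop over ActivePreds.
  std::vector<int> CandBlocks;
  append_range(CandBlocks, Blocks);
  assert(CandBlocks == Blocks);
  return 0;
}
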
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index a3eba27a4d90..61eb4ce0ed46 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -347,9 +347,8 @@ bool GuardWideningImpl::eliminateInstrViaWidening(
const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
auto I = GuardsInCurBB.begin();
- auto E = Instr->getParent() == CurBB
- ? std::find(GuardsInCurBB.begin(), GuardsInCurBB.end(), Instr)
- : GuardsInCurBB.end();
+ auto E = Instr->getParent() == CurBB ? find(GuardsInCurBB, Instr)
+ : GuardsInCurBB.end();
#ifndef NDEBUG
{
@@ -666,13 +665,12 @@ bool GuardWideningImpl::combineRangeChecks(
};
copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
- Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
+ erase_if(Checks, IsCurrentCheck);
assert(CurrentChecks.size() != 0 && "We know we have at least one!");
if (CurrentChecks.size() < 3) {
- RangeChecksOut.insert(RangeChecksOut.end(), CurrentChecks.begin(),
- CurrentChecks.end());
+ llvm::append_range(RangeChecksOut, CurrentChecks);
continue;
}
@@ -700,9 +698,7 @@ bool GuardWideningImpl::combineRangeChecks(
return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
};
- if (MaxDiff.isMinValue() ||
- !std::all_of(std::next(CurrentChecks.begin()), CurrentChecks.end(),
- OffsetOK))
+ if (MaxDiff.isMinValue() || !all_of(drop_begin(CurrentChecks), OffsetOK))
return false;
// We have a series of f+1 checks as:
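
Likewise for the GuardWidening hunks above: erase_if collapses the erase(remove_if(...)) pattern, and all_of over drop_begin is equivalent to the std::all_of(std::next(begin), end, ...) form it replaces. A small standalone check of both equivalences (plain C++, not the LLVM helpers themselves):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

// Approximation of llvm::erase_if for sequence containers.
template <typename Container, typename Pred>
void erase_if(Container &C, Pred P) {
  C.erase(std::remove_if(C.begin(), C.end(), P), C.end());
}

int main() {
  // erase_if(Checks, IsCurrentCheck) replaces the erase(remove_if(...)) dance.
  std::vector<int> Checks = {1, 2, 3, 4, 5, 6};
  erase_if(Checks, [](int C) { return C % 2 == 0; });
  assert((Checks == std::vector<int>{1, 3, 5}));

  // all_of(drop_begin(CurrentChecks), OffsetOK) skips the first element and
  // tests the rest, exactly like the std::next-based form being replaced.
  std::vector<int> CurrentChecks = {100, 4, 6, 8};
  bool OffsetsOK = std::all_of(std::next(CurrentChecks.begin()),
                               CurrentChecks.end(),
                               [](int C) { return C < 10; });
  assert(OffsetsOK);
  return 0;
}
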
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 0f36c3f772e6..ae1fff0fa844 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -131,6 +131,10 @@ static cl::opt<bool>
LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
cl::desc("Predicate conditions in read only loops"));
+static cl::opt<bool>
+AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
+ cl::desc("Allow widening of indvars to eliminate s/zext"));
+
namespace {
struct RewritePhi;
@@ -145,6 +149,7 @@ class IndVarSimplify {
std::unique_ptr<MemorySSAUpdater> MSSAU;
SmallVector<WeakTrackingVH, 16> DeadInsts;
+ bool WidenIndVars;
bool handleFloatingPointIV(Loop *L, PHINode *PH);
bool rewriteNonIntegerIVs(Loop *L);
@@ -167,8 +172,9 @@ class IndVarSimplify {
public:
IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
const DataLayout &DL, TargetLibraryInfo *TLI,
- TargetTransformInfo *TTI, MemorySSA *MSSA)
- : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI) {
+ TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars)
+ : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI),
+ WidenIndVars(WidenIndVars) {
if (MSSA)
MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
}
@@ -178,57 +184,6 @@ public:
} // end anonymous namespace
-/// Determine the insertion point for this user. By default, insert immediately
-/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
-/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
-/// common dominator for the incoming blocks. A nullptr can be returned if no
-/// viable location is found: it may happen if User is a PHI and Def only comes
-/// to this PHI from unreachable blocks.
-static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
- DominatorTree *DT, LoopInfo *LI) {
- PHINode *PHI = dyn_cast<PHINode>(User);
- if (!PHI)
- return User;
-
- Instruction *InsertPt = nullptr;
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) {
- if (PHI->getIncomingValue(i) != Def)
- continue;
-
- BasicBlock *InsertBB = PHI->getIncomingBlock(i);
-
- if (!DT->isReachableFromEntry(InsertBB))
- continue;
-
- if (!InsertPt) {
- InsertPt = InsertBB->getTerminator();
- continue;
- }
- InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB);
- InsertPt = InsertBB->getTerminator();
- }
-
- // If we have skipped all inputs, it means that Def only comes to Phi from
- // unreachable blocks.
- if (!InsertPt)
- return nullptr;
-
- auto *DefI = dyn_cast<Instruction>(Def);
- if (!DefI)
- return InsertPt;
-
- assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses");
-
- auto *L = LI->getLoopFor(DefI->getParent());
- assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent())));
-
- for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom())
- if (LI->getLoopFor(DTN->getBlock()) == L)
- return DTN->getBlock()->getTerminator();
-
- llvm_unreachable("DefI dominates InsertPt!");
-}
-
//===----------------------------------------------------------------------===//
// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
//===----------------------------------------------------------------------===//
@@ -550,27 +505,11 @@ bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
// IV Widening - Extend the width of an IV to cover its widest uses.
//===----------------------------------------------------------------------===//
-namespace {
-
-// Collect information about induction variables that are used by sign/zero
-// extend operations. This information is recorded by CollectExtend and provides
-// the input to WidenIV.
-struct WideIVInfo {
- PHINode *NarrowIV = nullptr;
-
- // Widest integer type created [sz]ext
- Type *WidestNativeType = nullptr;
-
- // Was a sext user seen before a zext?
- bool IsSigned = false;
-};
-
-} // end anonymous namespace
-
/// Update information about the induction variable that is extended by this
/// sign or zero extend operation. This is used to determine the final width of
/// the IV before actually widening it.
-static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
+static void visitIVCast(CastInst *Cast, WideIVInfo &WI,
+ ScalarEvolution *SE,
const TargetTransformInfo *TTI) {
bool IsSigned = Cast->getOpcode() == Instruction::SExt;
if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
@@ -616,982 +555,6 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
}
-namespace {
-
-/// Record a link in the Narrow IV def-use chain along with the WideIV that
-/// computes the same value as the Narrow IV def. This avoids caching Use*
-/// pointers.
-struct NarrowIVDefUse {
- Instruction *NarrowDef = nullptr;
- Instruction *NarrowUse = nullptr;
- Instruction *WideDef = nullptr;
-
- // True if the narrow def is never negative. Tracking this information lets
- // us use a sign extension instead of a zero extension or vice versa, when
- // profitable and legal.
- bool NeverNegative = false;
-
- NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD,
- bool NeverNegative)
- : NarrowDef(ND), NarrowUse(NU), WideDef(WD),
- NeverNegative(NeverNegative) {}
-};
-
-/// The goal of this transform is to remove sign and zero extends without
-/// creating any new induction variables. To do this, it creates a new phi of
-/// the wider type and redirects all users, either removing extends or inserting
-/// truncs whenever we stop propagating the type.
-class WidenIV {
- // Parameters
- PHINode *OrigPhi;
- Type *WideType;
-
- // Context
- LoopInfo *LI;
- Loop *L;
- ScalarEvolution *SE;
- DominatorTree *DT;
-
- // Does the module have any calls to the llvm.experimental.guard intrinsic
- // at all? If not we can avoid scanning instructions looking for guards.
- bool HasGuards;
-
- // Result
- PHINode *WidePhi = nullptr;
- Instruction *WideInc = nullptr;
- const SCEV *WideIncExpr = nullptr;
- SmallVectorImpl<WeakTrackingVH> &DeadInsts;
-
- SmallPtrSet<Instruction *,16> Widened;
- SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
-
- enum ExtendKind { ZeroExtended, SignExtended, Unknown };
-
- // A map tracking the kind of extension used to widen each narrow IV
- // and narrow IV user.
- // Key: pointer to a narrow IV or IV user.
- // Value: the kind of extension used to widen this Instruction.
- DenseMap<AssertingVH<Instruction>, ExtendKind> ExtendKindMap;
-
- using DefUserPair = std::pair<AssertingVH<Value>, AssertingVH<Instruction>>;
-
- // A map with control-dependent ranges for post increment IV uses. The key is
- // a pair of IV def and a use of this def denoting the context. The value is
- // a ConstantRange representing possible values of the def at the given
- // context.
- DenseMap<DefUserPair, ConstantRange> PostIncRangeInfos;
-
- Optional<ConstantRange> getPostIncRangeInfo(Value *Def,
- Instruction *UseI) {
- DefUserPair Key(Def, UseI);
- auto It = PostIncRangeInfos.find(Key);
- return It == PostIncRangeInfos.end()
- ? Optional<ConstantRange>(None)
- : Optional<ConstantRange>(It->second);
- }
-
- void calculatePostIncRanges(PHINode *OrigPhi);
- void calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser);
-
- void updatePostIncRangeInfo(Value *Def, Instruction *UseI, ConstantRange R) {
- DefUserPair Key(Def, UseI);
- auto It = PostIncRangeInfos.find(Key);
- if (It == PostIncRangeInfos.end())
- PostIncRangeInfos.insert({Key, R});
- else
- It->second = R.intersectWith(It->second);
- }
-
-public:
- WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv,
- DominatorTree *DTree, SmallVectorImpl<WeakTrackingVH> &DI,
- bool HasGuards)
- : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo),
- L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree),
- HasGuards(HasGuards), DeadInsts(DI) {
- assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
- ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
- }
-
- PHINode *createWideIV(SCEVExpander &Rewriter);
-
-protected:
- Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned,
- Instruction *Use);
-
- Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR);
- Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU,
- const SCEVAddRecExpr *WideAR);
- Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU);
-
- ExtendKind getExtendKind(Instruction *I);
-
- using WidenedRecTy = std::pair<const SCEVAddRecExpr *, ExtendKind>;
-
- WidenedRecTy getWideRecurrence(NarrowIVDefUse DU);
-
- WidenedRecTy getExtendedOperandRecurrence(NarrowIVDefUse DU);
-
- const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
- unsigned OpCode) const;
-
- Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
-
- bool widenLoopCompare(NarrowIVDefUse DU);
- bool widenWithVariantUse(NarrowIVDefUse DU);
- void widenWithVariantUseCodegen(NarrowIVDefUse DU);
-
- void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
-};
-
-} // end anonymous namespace
-
-Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
- bool IsSigned, Instruction *Use) {
- // Set the debug location and conservative insertion point.
- IRBuilder<> Builder(Use);
- // Hoist the insertion point into loop preheaders as far as possible.
- for (const Loop *L = LI->getLoopFor(Use->getParent());
- L && L->getLoopPreheader() && L->isLoopInvariant(NarrowOper);
- L = L->getParentLoop())
- Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
-
- return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
- Builder.CreateZExt(NarrowOper, WideType);
-}
-
-/// Instantiate a wide operation to replace a narrow operation. This only needs
-/// to handle operations that can evaluation to SCEVAddRec. It can safely return
-/// 0 for any operation we decide not to clone.
-Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU,
- const SCEVAddRecExpr *WideAR) {
- unsigned Opcode = DU.NarrowUse->getOpcode();
- switch (Opcode) {
- default:
- return nullptr;
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::UDiv:
- case Instruction::Sub:
- return cloneArithmeticIVUser(DU, WideAR);
-
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- return cloneBitwiseIVUser(DU);
- }
-}
-
-Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
- Instruction *NarrowUse = DU.NarrowUse;
- Instruction *NarrowDef = DU.NarrowDef;
- Instruction *WideDef = DU.WideDef;
-
- LLVM_DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
-
- // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
- // about the narrow operand yet so must insert a [sz]ext. It is probably loop
- // invariant and will be folded or hoisted. If it actually comes from a
- // widened IV, it should be removed during a future call to widenIVUse.
- bool IsSigned = getExtendKind(NarrowDef) == SignExtended;
- Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(0), WideType,
- IsSigned, NarrowUse);
- Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(1), WideType,
- IsSigned, NarrowUse);
-
- auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
- auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
- NarrowBO->getName());
- IRBuilder<> Builder(NarrowUse);
- Builder.Insert(WideBO);
- WideBO->copyIRFlags(NarrowBO);
- return WideBO;
-}
-
-Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
- const SCEVAddRecExpr *WideAR) {
- Instruction *NarrowUse = DU.NarrowUse;
- Instruction *NarrowDef = DU.NarrowDef;
- Instruction *WideDef = DU.WideDef;
-
- LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
-
- unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1;
-
- // We're trying to find X such that
- //
- // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X
- //
- // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef),
- // and check using SCEV if any of them are correct.
-
- // Returns true if extending NonIVNarrowDef according to `SignExt` is a
- // correct solution to X.
- auto GuessNonIVOperand = [&](bool SignExt) {
- const SCEV *WideLHS;
- const SCEV *WideRHS;
-
- auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) {
- if (SignExt)
- return SE->getSignExtendExpr(S, Ty);
- return SE->getZeroExtendExpr(S, Ty);
- };
-
- if (IVOpIdx == 0) {
- WideLHS = SE->getSCEV(WideDef);
- const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1));
- WideRHS = GetExtend(NarrowRHS, WideType);
- } else {
- const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0));
- WideLHS = GetExtend(NarrowLHS, WideType);
- WideRHS = SE->getSCEV(WideDef);
- }
-
- // WideUse is "WideDef `op.wide` X" as described in the comment.
- const SCEV *WideUse = nullptr;
-
- switch (NarrowUse->getOpcode()) {
- default:
- llvm_unreachable("No other possibility!");
-
- case Instruction::Add:
- WideUse = SE->getAddExpr(WideLHS, WideRHS);
- break;
-
- case Instruction::Mul:
- WideUse = SE->getMulExpr(WideLHS, WideRHS);
- break;
-
- case Instruction::UDiv:
- WideUse = SE->getUDivExpr(WideLHS, WideRHS);
- break;
-
- case Instruction::Sub:
- WideUse = SE->getMinusSCEV(WideLHS, WideRHS);
- break;
- }
-
- return WideUse == WideAR;
- };
-
- bool SignExtend = getExtendKind(NarrowDef) == SignExtended;
- if (!GuessNonIVOperand(SignExtend)) {
- SignExtend = !SignExtend;
- if (!GuessNonIVOperand(SignExtend))
- return nullptr;
- }
-
- Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(0), WideType,
- SignExtend, NarrowUse);
- Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(1), WideType,
- SignExtend, NarrowUse);
-
- auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
- auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
- NarrowBO->getName());
-
- IRBuilder<> Builder(NarrowUse);
- Builder.Insert(WideBO);
- WideBO->copyIRFlags(NarrowBO);
- return WideBO;
-}
-
-WidenIV::ExtendKind WidenIV::getExtendKind(Instruction *I) {
- auto It = ExtendKindMap.find(I);
- assert(It != ExtendKindMap.end() && "Instruction not yet extended!");
- return It->second;
-}
-
-const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
- unsigned OpCode) const {
- if (OpCode == Instruction::Add)
- return SE->getAddExpr(LHS, RHS);
- if (OpCode == Instruction::Sub)
- return SE->getMinusSCEV(LHS, RHS);
- if (OpCode == Instruction::Mul)
- return SE->getMulExpr(LHS, RHS);
-
- llvm_unreachable("Unsupported opcode.");
-}
-
-/// No-wrap operations can transfer sign extension of their result to their
-/// operands. Generate the SCEV value for the widened operation without
-/// actually modifying the IR yet. If the expression after extending the
-/// operands is an AddRec for this loop, return the AddRec and the kind of
-/// extension used.
-WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
- // Handle the common case of add<nsw/nuw>
- const unsigned OpCode = DU.NarrowUse->getOpcode();
- // Only Add/Sub/Mul instructions supported yet.
- if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
- OpCode != Instruction::Mul)
- return {nullptr, Unknown};
-
- // One operand (NarrowDef) has already been extended to WideDef. Now determine
- // if extending the other will lead to a recurrence.
- const unsigned ExtendOperIdx =
- DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
- assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
-
- const SCEV *ExtendOperExpr = nullptr;
- const OverflowingBinaryOperator *OBO =
- cast<OverflowingBinaryOperator>(DU.NarrowUse);
- ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
- if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
- ExtendOperExpr = SE->getSignExtendExpr(
- SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
- else if(ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
- ExtendOperExpr = SE->getZeroExtendExpr(
- SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
- else
- return {nullptr, Unknown};
-
- // When creating this SCEV expr, don't apply the current operations NSW or NUW
- // flags. This instruction may be guarded by control flow that the no-wrap
- // behavior depends on. Non-control-equivalent instructions can be mapped to
- // the same SCEV expression, and it would be incorrect to transfer NSW/NUW
- // semantics to those operations.
- const SCEV *lhs = SE->getSCEV(DU.WideDef);
- const SCEV *rhs = ExtendOperExpr;
-
- // Let's swap operands to the initial order for the case of non-commutative
- // operations, like SUB. See PR21014.
- if (ExtendOperIdx == 0)
- std::swap(lhs, rhs);
- const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
-
- if (!AddRec || AddRec->getLoop() != L)
- return {nullptr, Unknown};
-
- return {AddRec, ExtKind};
-}
-
-/// Is this instruction potentially interesting for further simplification after
-/// widening it's type? In other words, can the extend be safely hoisted out of
-/// the loop with SCEV reducing the value to a recurrence on the same loop. If
-/// so, return the extended recurrence and the kind of extension used. Otherwise
-/// return {nullptr, Unknown}.
-WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) {
- if (!SE->isSCEVable(DU.NarrowUse->getType()))
- return {nullptr, Unknown};
-
- const SCEV *NarrowExpr = SE->getSCEV(DU.NarrowUse);
- if (SE->getTypeSizeInBits(NarrowExpr->getType()) >=
- SE->getTypeSizeInBits(WideType)) {
- // NarrowUse implicitly widens its operand. e.g. a gep with a narrow
- // index. So don't follow this use.
- return {nullptr, Unknown};
- }
-
- const SCEV *WideExpr;
- ExtendKind ExtKind;
- if (DU.NeverNegative) {
- WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
- if (isa<SCEVAddRecExpr>(WideExpr))
- ExtKind = SignExtended;
- else {
- WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
- ExtKind = ZeroExtended;
- }
- } else if (getExtendKind(DU.NarrowDef) == SignExtended) {
- WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
- ExtKind = SignExtended;
- } else {
- WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
- ExtKind = ZeroExtended;
- }
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
- if (!AddRec || AddRec->getLoop() != L)
- return {nullptr, Unknown};
- return {AddRec, ExtKind};
-}
-
-/// This IV user cannot be widened. Replace this use of the original narrow IV
-/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
-static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
- auto *InsertPt = getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI);
- if (!InsertPt)
- return;
- LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user "
- << *DU.NarrowUse << "\n");
- IRBuilder<> Builder(InsertPt);
- Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
- DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
-}
-
-/// If the narrow use is a compare instruction, then widen the compare
-// (and possibly the other operand). The extend operation is hoisted into the
-// loop preheader as far as possible.
-bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
- ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
- if (!Cmp)
- return false;
-
- // We can legally widen the comparison in the following two cases:
- //
- // - The signedness of the IV extension and comparison match
- //
- // - The narrow IV is always positive (and thus its sign extension is equal
- // to its zero extension). For instance, let's say we're zero extending
- // %narrow for the following use
- //
- // icmp slt i32 %narrow, %val ... (A)
- //
- // and %narrow is always positive. Then
- //
- // (A) == icmp slt i32 sext(%narrow), sext(%val)
- // == icmp slt i32 zext(%narrow), sext(%val)
- bool IsSigned = getExtendKind(DU.NarrowDef) == SignExtended;
- if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
- return false;
-
- Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
- unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
- unsigned IVWidth = SE->getTypeSizeInBits(WideType);
- assert(CastWidth <= IVWidth && "Unexpected width while widening compare.");
-
- // Widen the compare instruction.
- auto *InsertPt = getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI);
- if (!InsertPt)
- return false;
- IRBuilder<> Builder(InsertPt);
- DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
-
- // Widen the other operand of the compare, if necessary.
- if (CastWidth < IVWidth) {
- Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp);
- DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
- }
- return true;
-}
-
-// The widenIVUse avoids generating trunc by evaluating the use as AddRec, this
-// will not work when:
-// 1) SCEV traces back to an instruction inside the loop that SCEV can not
-// expand, eg. add %indvar, (load %addr)
-// 2) SCEV finds a loop variant, eg. add %indvar, %loopvariant
-// While SCEV fails to avoid trunc, we can still try to use instruction
-// combining approach to prove trunc is not required. This can be further
-// extended with other instruction combining checks, but for now we handle the
-// following case (sub can be "add" and "mul", "nsw + sext" can be "nus + zext")
-//
-// Src:
-// %c = sub nsw %b, %indvar
-// %d = sext %c to i64
-// Dst:
-// %indvar.ext1 = sext %indvar to i64
-// %m = sext %b to i64
-// %d = sub nsw i64 %m, %indvar.ext1
-// Therefore, as long as the result of add/sub/mul is extended to wide type, no
-// trunc is required regardless of how %b is generated. This pattern is common
-// when calculating address in 64 bit architecture
-bool WidenIV::widenWithVariantUse(NarrowIVDefUse DU) {
- Instruction *NarrowUse = DU.NarrowUse;
- Instruction *NarrowDef = DU.NarrowDef;
- Instruction *WideDef = DU.WideDef;
-
- // Handle the common case of add<nsw/nuw>
- const unsigned OpCode = NarrowUse->getOpcode();
- // Only Add/Sub/Mul instructions are supported.
- if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
- OpCode != Instruction::Mul)
- return false;
-
- // The operand that is not defined by NarrowDef of DU. Let's call it the
- // other operand.
- unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == NarrowDef ? 1 : 0;
- assert(DU.NarrowUse->getOperand(1 - ExtendOperIdx) == DU.NarrowDef &&
- "bad DU");
-
- const SCEV *ExtendOperExpr = nullptr;
- const OverflowingBinaryOperator *OBO =
- cast<OverflowingBinaryOperator>(NarrowUse);
- ExtendKind ExtKind = getExtendKind(NarrowDef);
- if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
- ExtendOperExpr = SE->getSignExtendExpr(
- SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
- else if (ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
- ExtendOperExpr = SE->getZeroExtendExpr(
- SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
- else
- return false;
-
- // Verifying that Defining operand is an AddRec
- const SCEV *Op1 = SE->getSCEV(WideDef);
- const SCEVAddRecExpr *AddRecOp1 = dyn_cast<SCEVAddRecExpr>(Op1);
- if (!AddRecOp1 || AddRecOp1->getLoop() != L)
- return false;
- // Verifying that other operand is an Extend.
- if (ExtKind == SignExtended) {
- if (!isa<SCEVSignExtendExpr>(ExtendOperExpr))
- return false;
- } else {
- if (!isa<SCEVZeroExtendExpr>(ExtendOperExpr))
- return false;
- }
-
- if (ExtKind == SignExtended) {
- for (Use &U : NarrowUse->uses()) {
- SExtInst *User = dyn_cast<SExtInst>(U.getUser());
- if (!User || User->getType() != WideType)
- return false;
- }
- } else { // ExtKind == ZeroExtended
- for (Use &U : NarrowUse->uses()) {
- ZExtInst *User = dyn_cast<ZExtInst>(U.getUser());
- if (!User || User->getType() != WideType)
- return false;
- }
- }
-
- return true;
-}
-
-/// Special Case for widening with loop variant (see
-/// WidenIV::widenWithVariant). This is the code generation part.
-void WidenIV::widenWithVariantUseCodegen(NarrowIVDefUse DU) {
- Instruction *NarrowUse = DU.NarrowUse;
- Instruction *NarrowDef = DU.NarrowDef;
- Instruction *WideDef = DU.WideDef;
-
- ExtendKind ExtKind = getExtendKind(NarrowDef);
-
- LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
-
- // Generating a widening use instruction.
- Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(0), WideType,
- ExtKind, NarrowUse);
- Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(1), WideType,
- ExtKind, NarrowUse);
-
- auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
- auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
- NarrowBO->getName());
- IRBuilder<> Builder(NarrowUse);
- Builder.Insert(WideBO);
- WideBO->copyIRFlags(NarrowBO);
-
- assert(ExtKind != Unknown && "Unknown ExtKind not handled");
-
- ExtendKindMap[NarrowUse] = ExtKind;
-
- for (Use &U : NarrowUse->uses()) {
- Instruction *User = nullptr;
- if (ExtKind == SignExtended)
- User = dyn_cast<SExtInst>(U.getUser());
- else
- User = dyn_cast<ZExtInst>(U.getUser());
- if (User && User->getType() == WideType) {
- LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User << " replaced by "
- << *WideBO << "\n");
- ++NumElimExt;
- User->replaceAllUsesWith(WideBO);
- DeadInsts.emplace_back(User);
- }
- }
-}
-
-/// Determine whether an individual user of the narrow IV can be widened. If so,
-/// return the wide clone of the user.
-Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
- assert(ExtendKindMap.count(DU.NarrowDef) &&
- "Should already know the kind of extension used to widen NarrowDef");
-
- // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
- if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
- if (LI->getLoopFor(UsePhi->getParent()) != L) {
- // For LCSSA phis, sink the truncate outside the loop.
- // After SimplifyCFG most loop exit targets have a single predecessor.
- // Otherwise fall back to a truncate within the loop.
- if (UsePhi->getNumOperands() != 1)
- truncateIVUse(DU, DT, LI);
- else {
- // Widening the PHI requires us to insert a trunc. The logical place
- // for this trunc is in the same BB as the PHI. This is not possible if
- // the BB is terminated by a catchswitch.
- if (isa<CatchSwitchInst>(UsePhi->getParent()->getTerminator()))
- return nullptr;
-
- PHINode *WidePhi =
- PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
- UsePhi);
- WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
- IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt());
- Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
- UsePhi->replaceAllUsesWith(Trunc);
- DeadInsts.emplace_back(UsePhi);
- LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to "
- << *WidePhi << "\n");
- }
- return nullptr;
- }
- }
-
- // This narrow use can be widened by a sext if it's non-negative or its narrow
- // def was widended by a sext. Same for zext.
- auto canWidenBySExt = [&]() {
- return DU.NeverNegative || getExtendKind(DU.NarrowDef) == SignExtended;
- };
- auto canWidenByZExt = [&]() {
- return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ZeroExtended;
- };
-
- // Our raison d'etre! Eliminate sign and zero extension.
- if ((isa<SExtInst>(DU.NarrowUse) && canWidenBySExt()) ||
- (isa<ZExtInst>(DU.NarrowUse) && canWidenByZExt())) {
- Value *NewDef = DU.WideDef;
- if (DU.NarrowUse->getType() != WideType) {
- unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
- unsigned IVWidth = SE->getTypeSizeInBits(WideType);
- if (CastWidth < IVWidth) {
- // The cast isn't as wide as the IV, so insert a Trunc.
- IRBuilder<> Builder(DU.NarrowUse);
- NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
- }
- else {
- // A wider extend was hidden behind a narrower one. This may induce
- // another round of IV widening in which the intermediate IV becomes
- // dead. It should be very rare.
- LLVM_DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
- << " not wide enough to subsume " << *DU.NarrowUse
- << "\n");
- DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
- NewDef = DU.NarrowUse;
- }
- }
- if (NewDef != DU.NarrowUse) {
- LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
- << " replaced by " << *DU.WideDef << "\n");
- ++NumElimExt;
- DU.NarrowUse->replaceAllUsesWith(NewDef);
- DeadInsts.emplace_back(DU.NarrowUse);
- }
- // Now that the extend is gone, we want to expose it's uses for potential
- // further simplification. We don't need to directly inform SimplifyIVUsers
- // of the new users, because their parent IV will be processed later as a
- // new loop phi. If we preserved IVUsers analysis, we would also want to
- // push the uses of WideDef here.
-
- // No further widening is needed. The deceased [sz]ext had done it for us.
- return nullptr;
- }
-
- // Does this user itself evaluate to a recurrence after widening?
- WidenedRecTy WideAddRec = getExtendedOperandRecurrence(DU);
- if (!WideAddRec.first)
- WideAddRec = getWideRecurrence(DU);
-
- assert((WideAddRec.first == nullptr) == (WideAddRec.second == Unknown));
- if (!WideAddRec.first) {
- // If use is a loop condition, try to promote the condition instead of
- // truncating the IV first.
- if (widenLoopCompare(DU))
- return nullptr;
-
- // We are here about to generate a truncate instruction that may hurt
- // performance because the scalar evolution expression computed earlier
- // in WideAddRec.first does not indicate a polynomial induction expression.
- // In that case, look at the operands of the use instruction to determine
- // if we can still widen the use instead of truncating its operand.
- if (widenWithVariantUse(DU)) {
- widenWithVariantUseCodegen(DU);
- return nullptr;
- }
-
- // This user does not evaluate to a recurrence after widening, so don't
- // follow it. Instead insert a Trunc to kill off the original use,
- // eventually isolating the original narrow IV so it can be removed.
- truncateIVUse(DU, DT, LI);
- return nullptr;
- }
- // Assume block terminators cannot evaluate to a recurrence. We can't to
- // insert a Trunc after a terminator if there happens to be a critical edge.
- assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
- "SCEV is not expected to evaluate a block terminator");
-
- // Reuse the IV increment that SCEVExpander created as long as it dominates
- // NarrowUse.
- Instruction *WideUse = nullptr;
- if (WideAddRec.first == WideIncExpr &&
- Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
- WideUse = WideInc;
- else {
- WideUse = cloneIVUser(DU, WideAddRec.first);
- if (!WideUse)
- return nullptr;
- }
- // Evaluation of WideAddRec ensured that the narrow expression could be
- // extended outside the loop without overflow. This suggests that the wide use
- // evaluates to the same expression as the extended narrow use, but doesn't
- // absolutely guarantee it. Hence the following failsafe check. In rare cases
- // where it fails, we simply throw away the newly created wide use.
- if (WideAddRec.first != SE->getSCEV(WideUse)) {
- LLVM_DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": "
- << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first
- << "\n");
- DeadInsts.emplace_back(WideUse);
- return nullptr;
- }
-
- // if we reached this point then we are going to replace
- // DU.NarrowUse with WideUse. Reattach DbgValue then.
- replaceAllDbgUsesWith(*DU.NarrowUse, *WideUse, *WideUse, *DT);
-
- ExtendKindMap[DU.NarrowUse] = WideAddRec.second;
- // Returning WideUse pushes it on the worklist.
- return WideUse;
-}
-
-/// Add eligible users of NarrowDef to NarrowIVUsers.
-void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
- const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef);
- bool NonNegativeDef =
- SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV,
- SE->getConstant(NarrowSCEV->getType(), 0));
- for (User *U : NarrowDef->users()) {
- Instruction *NarrowUser = cast<Instruction>(U);
-
- // Handle data flow merges and bizarre phi cycles.
- if (!Widened.insert(NarrowUser).second)
- continue;
-
- bool NonNegativeUse = false;
- if (!NonNegativeDef) {
- // We might have a control-dependent range information for this context.
- if (auto RangeInfo = getPostIncRangeInfo(NarrowDef, NarrowUser))
- NonNegativeUse = RangeInfo->getSignedMin().isNonNegative();
- }
-
- NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef,
- NonNegativeDef || NonNegativeUse);
- }
-}
-
-/// Process a single induction variable. First use the SCEVExpander to create a
-/// wide induction variable that evaluates to the same recurrence as the
-/// original narrow IV. Then use a worklist to forward traverse the narrow IV's
-/// def-use chain. After widenIVUse has processed all interesting IV users, the
-/// narrow IV will be isolated for removal by DeleteDeadPHIs.
-///
-/// It would be simpler to delete uses as they are processed, but we must avoid
-/// invalidating SCEV expressions.
-PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
- // Is this phi an induction variable?
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
- if (!AddRec)
- return nullptr;
-
- // Widen the induction variable expression.
- const SCEV *WideIVExpr = getExtendKind(OrigPhi) == SignExtended
- ? SE->getSignExtendExpr(AddRec, WideType)
- : SE->getZeroExtendExpr(AddRec, WideType);
-
- assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
- "Expect the new IV expression to preserve its type");
-
- // Can the IV be extended outside the loop without overflow?
- AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr);
- if (!AddRec || AddRec->getLoop() != L)
- return nullptr;
-
- // An AddRec must have loop-invariant operands. Since this AddRec is
- // materialized by a loop header phi, the expression cannot have any post-loop
- // operands, so they must dominate the loop header.
- assert(
- SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
- SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) &&
- "Loop header phi recurrence inputs do not dominate the loop");
-
- // Iterate over IV uses (including transitive ones) looking for IV increments
- // of the form 'add nsw %iv, <const>'. For each increment and each use of
- // the increment calculate control-dependent range information basing on
- // dominating conditions inside of the loop (e.g. a range check inside of the
- // loop). Calculated ranges are stored in PostIncRangeInfos map.
- //
- // Control-dependent range information is later used to prove that a narrow
- // definition is not negative (see pushNarrowIVUsers). It's difficult to do
- // this on demand because when pushNarrowIVUsers needs this information some
- // of the dominating conditions might be already widened.
- if (UsePostIncrementRanges)
- calculatePostIncRanges(OrigPhi);
-
- // The rewriter provides a value for the desired IV expression. This may
- // either find an existing phi or materialize a new one. Either way, we
- // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
- // of the phi-SCC dominates the loop entry.
- Instruction *InsertPt = &L->getHeader()->front();
- WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
-
- // Remembering the WideIV increment generated by SCEVExpander allows
- // widenIVUse to reuse it when widening the narrow IV's increment. We don't
- // employ a general reuse mechanism because the call above is the only call to
- // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
- if (BasicBlock *LatchBlock = L->getLoopLatch()) {
- WideInc =
- cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
- WideIncExpr = SE->getSCEV(WideInc);
- // Propagate the debug location associated with the original loop increment
- // to the new (widened) increment.
- auto *OrigInc =
- cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock));
- WideInc->setDebugLoc(OrigInc->getDebugLoc());
- }
-
- LLVM_DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
- ++NumWidened;
-
- // Traverse the def-use chain using a worklist starting at the original IV.
- assert(Widened.empty() && NarrowIVUsers.empty() && "expect initial state" );
-
- Widened.insert(OrigPhi);
- pushNarrowIVUsers(OrigPhi, WidePhi);
-
- while (!NarrowIVUsers.empty()) {
- NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
-
- // Process a def-use edge. This may replace the use, so don't hold a
- // use_iterator across it.
- Instruction *WideUse = widenIVUse(DU, Rewriter);
-
- // Follow all def-use edges from the previous narrow use.
- if (WideUse)
- pushNarrowIVUsers(DU.NarrowUse, WideUse);
-
- // widenIVUse may have removed the def-use edge.
- if (DU.NarrowDef->use_empty())
- DeadInsts.emplace_back(DU.NarrowDef);
- }
-
- // Attach any debug information to the new PHI.
- replaceAllDbgUsesWith(*OrigPhi, *WidePhi, *WidePhi, *DT);
-
- return WidePhi;
-}
-
-/// Calculates control-dependent range for the given def at the given context
-/// by looking at dominating conditions inside of the loop
-void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
- Instruction *NarrowUser) {
- using namespace llvm::PatternMatch;
-
- Value *NarrowDefLHS;
- const APInt *NarrowDefRHS;
- if (!match(NarrowDef, m_NSWAdd(m_Value(NarrowDefLHS),
- m_APInt(NarrowDefRHS))) ||
- !NarrowDefRHS->isNonNegative())
- return;
-
- auto UpdateRangeFromCondition = [&] (Value *Condition,
- bool TrueDest) {
- CmpInst::Predicate Pred;
- Value *CmpRHS;
- if (!match(Condition, m_ICmp(Pred, m_Specific(NarrowDefLHS),
- m_Value(CmpRHS))))
- return;
-
- CmpInst::Predicate P =
- TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
-
- auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
- auto CmpConstrainedLHSRange =
- ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange);
- auto NarrowDefRange = CmpConstrainedLHSRange.addWithNoWrap(
- *NarrowDefRHS, OverflowingBinaryOperator::NoSignedWrap);
-
- updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange);
- };
-
- auto UpdateRangeFromGuards = [&](Instruction *Ctx) {
- if (!HasGuards)
- return;
-
- for (Instruction &I : make_range(Ctx->getIterator().getReverse(),
- Ctx->getParent()->rend())) {
- Value *C = nullptr;
- if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>(m_Value(C))))
- UpdateRangeFromCondition(C, /*TrueDest=*/true);
- }
- };
-
- UpdateRangeFromGuards(NarrowUser);
-
- BasicBlock *NarrowUserBB = NarrowUser->getParent();
- // If NarrowUserBB is statically unreachable asking dominator queries may
- // yield surprising results. (e.g. the block may not have a dom tree node)
- if (!DT->isReachableFromEntry(NarrowUserBB))
- return;
-
- for (auto *DTB = (*DT)[NarrowUserBB]->getIDom();
- L->contains(DTB->getBlock());
- DTB = DTB->getIDom()) {
- auto *BB = DTB->getBlock();
- auto *TI = BB->getTerminator();
- UpdateRangeFromGuards(TI);
-
- auto *BI = dyn_cast<BranchInst>(TI);
- if (!BI || !BI->isConditional())
- continue;
-
- auto *TrueSuccessor = BI->getSuccessor(0);
- auto *FalseSuccessor = BI->getSuccessor(1);
-
- auto DominatesNarrowUser = [this, NarrowUser] (BasicBlockEdge BBE) {
- return BBE.isSingleEdge() &&
- DT->dominates(BBE, NarrowUser->getParent());
- };
-
- if (DominatesNarrowUser(BasicBlockEdge(BB, TrueSuccessor)))
- UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/true);
-
- if (DominatesNarrowUser(BasicBlockEdge(BB, FalseSuccessor)))
- UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/false);
- }
-}
-
-/// Calculates PostIncRangeInfos map for the given IV
-void WidenIV::calculatePostIncRanges(PHINode *OrigPhi) {
- SmallPtrSet<Instruction *, 16> Visited;
- SmallVector<Instruction *, 6> Worklist;
- Worklist.push_back(OrigPhi);
- Visited.insert(OrigPhi);
-
- while (!Worklist.empty()) {
- Instruction *NarrowDef = Worklist.pop_back_val();
-
- for (Use &U : NarrowDef->uses()) {
- auto *NarrowUser = cast<Instruction>(U.getUser());
-
- // Don't go looking outside the current loop.
- auto *NarrowUserLoop = (*LI)[NarrowUser->getParent()];
- if (!NarrowUserLoop || !L->contains(NarrowUserLoop))
- continue;
-
- if (!Visited.insert(NarrowUser).second)
- continue;
-
- Worklist.push_back(NarrowUser);
-
- calculatePostIncRange(NarrowDef, NarrowUser);
- }
- }
-}
-
//===----------------------------------------------------------------------===//
// Live IV Reduction - Minimize IVs live across the loop.
//===----------------------------------------------------------------------===//
@@ -1668,9 +631,18 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L,
}
} while(!LoopPhis.empty());
+ // Continue if we disallowed widening.
+ if (!WidenIndVars)
+ continue;
+
for (; !WideIVs.empty(); WideIVs.pop_back()) {
- WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts, HasGuards);
- if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) {
+ unsigned ElimExt;
+ unsigned Widened;
+ if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter,
+ DT, DeadInsts, ElimExt, Widened,
+ HasGuards, UsePostIncrementRanges)) {
+ NumElimExt += ElimExt;
+ NumWidened += Widened;
Changed = true;
LoopPhis.push_back(WidePhi);
}
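
For context on the hunk above, which now gates widening behind WidenIndVars and calls a free createWideIV helper: IV widening replaces a narrow induction variable with a wider one so that per-iteration sign/zero extends feeding wide arithmetic or addressing disappear. A hand-written, source-level illustration of the intended effect (the pass itself operates on IR, and this is not compiler output):

#include <cstdint>
#include <cstdio>

// Before widening: a 32-bit induction variable must be sign-extended to form
// the 64-bit index on every iteration ("sext i32 %i to i64" in the IR).
static int64_t sumNarrowIV(const int64_t *A, int32_t N) {
  int64_t Sum = 0;
  for (int32_t I = 0; I < N; ++I)
    Sum += A[static_cast<int64_t>(I)];
  return Sum;
}

// After widening: the induction variable is 64 bits wide from the start, so
// the per-iteration extend disappears; only the bound is extended, once.
static int64_t sumWideIV(const int64_t *A, int32_t N) {
  int64_t Sum = 0;
  for (int64_t I = 0, E = static_cast<int64_t>(N); I < E; ++I)
    Sum += A[I];
  return Sum;
}

int main() {
  const int64_t A[] = {1, 2, 3, 4};
  std::printf("%lld %lld\n", static_cast<long long>(sumNarrowIV(A, 4)),
              static_cast<long long>(sumWideIV(A, 4)));
  return 0;
}
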
@@ -1813,7 +785,7 @@ static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
// If we can't analyze propagation through this instruction, just skip it
// and transitive users. Safe as false is a conservative result.
- if (!propagatesPoison(I) && I != Root)
+ if (!propagatesPoison(cast<Operator>(I)) && I != Root)
continue;
if (KnownPoison.insert(I).second)
@@ -2318,42 +1290,116 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
return MadeAnyChanges;
}
-/// Return a symbolic upper bound for the backedge taken count of the loop.
-/// This is more general than getConstantMaxBackedgeTakenCount as it returns
-/// an arbitrary expression as opposed to only constants.
-/// TODO: Move into the ScalarEvolution class.
-static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE,
- DominatorTree &DT, Loop *L) {
- SmallVector<BasicBlock*, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
+static void replaceExitCond(BranchInst *BI, Value *NewCond,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ auto *OldCond = BI->getCondition();
+ BI->setCondition(NewCond);
+ if (OldCond->use_empty())
+ DeadInsts.emplace_back(OldCond);
+}
- // Form an expression for the maximum exit count possible for this loop. We
- // merge the max and exact information to approximate a version of
- // getConstantMaxBackedgeTakenCount which isn't restricted to just constants.
- SmallVector<const SCEV*, 4> ExitCounts;
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE.getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount))
- ExitCount = SE.getExitCount(L, ExitingBB,
- ScalarEvolution::ConstantMaximum);
- if (!isa<SCEVCouldNotCompute>(ExitCount)) {
- assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
- "We should only have known counts for exiting blocks that "
- "dominate latch!");
- ExitCounts.push_back(ExitCount);
- }
+static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
+ auto *OldCond = BI->getCondition();
+ auto *NewCond =
+ ConstantInt::get(OldCond->getType(), IsTaken ? ExitIfTrue : !ExitIfTrue);
+ replaceExitCond(BI, NewCond, DeadInsts);
+}
+
+static void replaceWithInvariantCond(
+ const Loop *L, BasicBlock *ExitingBB, ICmpInst::Predicate InvariantPred,
+ const SCEV *InvariantLHS, const SCEV *InvariantRHS, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ Rewriter.setInsertPoint(BI);
+ auto *LHSV = Rewriter.expandCodeFor(InvariantLHS);
+ auto *RHSV = Rewriter.expandCodeFor(InvariantRHS);
+ bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
+ if (ExitIfTrue)
+ InvariantPred = ICmpInst::getInversePredicate(InvariantPred);
+ IRBuilder<> Builder(BI);
+ auto *NewCond = Builder.CreateICmp(InvariantPred, LHSV, RHSV,
+ BI->getCondition()->getName());
+ replaceExitCond(BI, NewCond, DeadInsts);
+}
+
+static bool optimizeLoopExitWithUnknownExitCount(
+ const Loop *L, BranchInst *BI, BasicBlock *ExitingBB,
+ const SCEV *MaxIter, bool Inverted, bool SkipLastIter,
+ ScalarEvolution *SE, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ ICmpInst::Predicate Pred;
+ Value *LHS, *RHS;
+ using namespace PatternMatch;
+ BasicBlock *TrueSucc, *FalseSucc;
+ if (!match(BI, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
+ m_BasicBlock(TrueSucc), m_BasicBlock(FalseSucc))))
+ return false;
+
+ assert((L->contains(TrueSucc) != L->contains(FalseSucc)) &&
+ "Not a loop exit!");
+
+ // 'LHS pred RHS' should now mean that we stay in the loop.
+ if (L->contains(FalseSucc))
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ // If we are proving loop exit, invert the predicate.
+ if (Inverted)
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ const SCEV *LHSS = SE->getSCEVAtScope(LHS, L);
+ const SCEV *RHSS = SE->getSCEVAtScope(RHS, L);
+ // Can we prove it to be trivially true?
+ if (SE->isKnownPredicateAt(Pred, LHSS, RHSS, BI)) {
+ foldExit(L, ExitingBB, Inverted, DeadInsts);
+ return true;
+ }
+ // Further logic works for non-inverted condition only.
+ if (Inverted)
+ return false;
+
+ auto *ARTy = LHSS->getType();
+ auto *MaxIterTy = MaxIter->getType();
+ // If possible, adjust types.
+ if (SE->getTypeSizeInBits(ARTy) > SE->getTypeSizeInBits(MaxIterTy))
+ MaxIter = SE->getZeroExtendExpr(MaxIter, ARTy);
+ else if (SE->getTypeSizeInBits(ARTy) < SE->getTypeSizeInBits(MaxIterTy)) {
+ const SCEV *MinusOne = SE->getMinusOne(ARTy);
+ auto *MaxAllowedIter = SE->getZeroExtendExpr(MinusOne, MaxIterTy);
+ if (SE->isKnownPredicateAt(ICmpInst::ICMP_ULE, MaxIter, MaxAllowedIter, BI))
+ MaxIter = SE->getTruncateExpr(MaxIter, ARTy);
+ }
+
+ if (SkipLastIter) {
+ const SCEV *One = SE->getOne(MaxIter->getType());
+ MaxIter = SE->getMinusSCEV(MaxIter, One);
}
- if (ExitCounts.empty())
- return SE.getCouldNotCompute();
- return SE.getUMinFromMismatchedTypes(ExitCounts);
+
+ // Check if there is a loop-invariant predicate equivalent to our check.
+ auto LIP = SE->getLoopInvariantExitCondDuringFirstIterations(Pred, LHSS, RHSS,
+ L, BI, MaxIter);
+ if (!LIP)
+ return false;
+
+ // Can we prove it to be trivially true?
+ if (SE->isKnownPredicateAt(LIP->Pred, LIP->LHS, LIP->RHS, BI))
+ foldExit(L, ExitingBB, Inverted, DeadInsts);
+ else
+ replaceWithInvariantCond(L, ExitingBB, LIP->Pred, LIP->LHS, LIP->RHS,
+ Rewriter, DeadInsts);
+
+ return true;
}
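
The polarity bookkeeping in foldExit above is compact enough to misread, so here is a standalone sketch that checks it exhaustively (plain C++ with illustrative helper names; ExitIfTrue means the branch leaves the loop on its true successor):

#include <cassert>

// Same expression as in foldExit: the constant the old condition is folded to.
static bool newExitCondition(bool ExitIfTrue, bool IsTaken) {
  return IsTaken ? ExitIfTrue : !ExitIfTrue;
}

// With a given condition value and exit polarity, does the branch actually
// take the exit edge?
static bool exitTaken(bool CondValue, bool ExitIfTrue) {
  return CondValue == ExitIfTrue;
}

int main() {
  for (bool ExitIfTrue : {false, true})
    for (bool IsTaken : {false, true})
      assert(exitTaken(newExitCondition(ExitIfTrue, IsTaken), ExitIfTrue) ==
             IsTaken);
  return 0;
}
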
bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- // Remove all exits which aren't both rewriteable and analyzeable.
- auto NewEnd = llvm::remove_if(ExitingBlocks, [&](BasicBlock *ExitingBB) {
+ // Remove all exits which aren't both rewriteable and executed on every
+ // iteration.
+ llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) {
// If our exiting block exits multiple loops, we can only rewrite the
// innermost one. Otherwise, we're changing how many times the innermost
// loop runs before it exits.
@@ -2369,56 +1415,85 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
if (isa<Constant>(BI->getCondition()))
return true;
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount))
+ // Likewise, the loop latch must be dominated by the exiting BB.
+ if (!DT->dominates(ExitingBB, L->getLoopLatch()))
return true;
+
return false;
});
- ExitingBlocks.erase(NewEnd, ExitingBlocks.end());
if (ExitingBlocks.empty())
return false;
// Get a symbolic upper bound on the loop backedge taken count.
- const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L);
+ const SCEV *MaxExitCount = SE->getSymbolicMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(MaxExitCount))
return false;
- // Visit our exit blocks in order of dominance. We know from the fact that
- // all exits (left) are analyzeable that the must be a total dominance order
- // between them as each must dominate the latch. The visit order only
- // matters for the provably equal case.
- llvm::sort(ExitingBlocks,
- [&](BasicBlock *A, BasicBlock *B) {
+ // Visit our exit blocks in order of dominance. Because every remaining exit
+ // must dominate the latch, there is a total dominance order between them.
+ llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) {
// std::sort sorts in ascending order, so we want the inverse of
// the normal dominance relation.
if (A == B) return false;
- if (DT->properlyDominates(A, B)) return true;
- if (DT->properlyDominates(B, A)) return false;
- llvm_unreachable("expected total dominance order!");
- });
+ if (DT->properlyDominates(A, B))
+ return true;
+ else {
+ assert(DT->properlyDominates(B, A) &&
+ "expected total dominance order!");
+ return false;
+ }
+ });
#ifdef ASSERT
for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
}
#endif
- auto FoldExit = [&](BasicBlock *ExitingBB, bool IsTaken) {
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
- auto *OldCond = BI->getCondition();
- auto *NewCond = ConstantInt::get(OldCond->getType(),
- IsTaken ? ExitIfTrue : !ExitIfTrue);
- BI->setCondition(NewCond);
- if (OldCond->use_empty())
- DeadInsts.emplace_back(OldCond);
- };
-
bool Changed = false;
+ bool SkipLastIter = false;
SmallSet<const SCEV*, 8> DominatingExitCounts;
for (BasicBlock *ExitingBB : ExitingBlocks) {
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- assert(!isa<SCEVCouldNotCompute>(ExitCount) && "checked above");
+ if (isa<SCEVCouldNotCompute>(ExitCount)) {
+ // Okay, we do not know the exit count here. Can we at least prove that it
+ // will remain the same within iteration space?
+ auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ auto OptimizeCond = [&](bool Inverted, bool SkipLastIter) {
+ return optimizeLoopExitWithUnknownExitCount(
+ L, BI, ExitingBB, MaxExitCount, Inverted, SkipLastIter, SE,
+ Rewriter, DeadInsts);
+ };
+
+ // TODO: We might have proved that we can skip the last iteration for
+ // this check. In this case, we only want to check the condition on the
+ // pre-last iteration (MaxExitCount - 1). However, there is a nasty
+ // corner case:
+ //
+ // for (i = len; i != 0; i--) { ... check (i ult X) ... }
+ //
+ // If we could not prove that len != 0, then we also could not prove that
+ // (len - 1) is not UINT_MAX. If we simply query (len - 1), then
+ // OptimizeCond will likely not prove anything for it, even if it could
+ // prove the same fact for len.
+ //
+ // As a temporary solution, we query both the last and pre-last iterations in
+ // the hope that we will be able to prove triviality for at least one of
+ // them. We can stop querying MaxExitCount for this case once SCEV
+ // understands that (MaxExitCount - 1) will not overflow here.
+ if (OptimizeCond(false, false) || OptimizeCond(true, false))
+ Changed = true;
+ else if (SkipLastIter)
+ if (OptimizeCond(false, true) || OptimizeCond(true, true))
+ Changed = true;
+ continue;
+ }
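The corner case described in the comment above is plain unsigned wrap-around: when the symbolic trip count is len and len may be zero, the pre-last iteration count (len - 1) may be UINT_MAX, so querying it proves nothing. A minimal standalone illustration in C++ (hypothetical values, not LLVM code):

#include <cassert>
#include <climits>

int main() {
  unsigned len = 0;            // hypothetical trip count that may be zero
  unsigned preLast = len - 1;  // wraps: 0u - 1u == UINT_MAX
  assert(preLast == UINT_MAX); // facts proved for len need not hold for len - 1
  return 0;
}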
+
+ if (MaxExitCount == ExitCount)
+ // If the loop has more than 1 iteration, all further checks will be
+ // executed 1 iteration less.
+ SkipLastIter = true;
// If we know we'd exit on the first iteration, rewrite the exit to
// reflect this. This does not imply the loop must exit through this
@@ -2426,7 +1501,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// TODO: Given we know the backedge can't be taken, we should go ahead
// and break it. Or at least, kill all the header phis and simplify.
if (ExitCount->isZero()) {
- FoldExit(ExitingBB, true);
+ foldExit(L, ExitingBB, true, DeadInsts);
Changed = true;
continue;
}
@@ -2448,7 +1523,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// one?
if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
MaxExitCount, ExitCount)) {
- FoldExit(ExitingBB, false);
+ foldExit(L, ExitingBB, false, DeadInsts);
Changed = true;
continue;
}
@@ -2458,7 +1533,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// exiting iteration, but (from the visit order) strictly follows another
// which does the same and is thus dead.
if (!DominatingExitCounts.insert(ExitCount).second) {
- FoldExit(ExitingBB, false);
+ foldExit(L, ExitingBB, false, DeadInsts);
Changed = true;
continue;
}
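The fold just above handles a second exit whose exit count is provably equal to that of a dominating exit: the dominating exit always fires first on that iteration, so the later one can be rewritten to "not taken". A source-level sketch of such a loop (hypothetical C++ function, not LLVM code):

// Exit A and exit B both have symbolic exit count n, and A dominates B, so B
// can never be the exit actually taken; it is folded to "not taken".
unsigned firstMatch(const unsigned *a, unsigned n) {
  unsigned i = 0;
  while (true) {
    if (i == n)       // exit A: exit count n
      return n;
    if (i >= n)       // exit B: exit count n as well, dominated by A -> dead
      return n;
    if (a[i] == 42u)  // data-dependent exit with an unknown exit count
      return i;
    ++i;
  }
}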
@@ -2714,7 +1789,9 @@ bool IndVarSimplify::run(Loop *L) {
if (optimizeLoopExits(L, Rewriter)) {
Changed = true;
// Given we've changed exit counts, notify SCEV
- SE->forgetLoop(L);
+ // Some nested loops may share the same folded exit basic block,
+ // thus we need to notify the topmost loop.
+ SE->forgetTopmostLoop(L);
}
// Try to form loop invariant tests for loop exits by changing how many
@@ -2791,11 +1868,15 @@ bool IndVarSimplify::run(Loop *L) {
// Now that we're done iterating through lists, clean up any instructions
// which are now dead.
- while (!DeadInsts.empty())
- if (Instruction *Inst =
- dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()))
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+
+ if (PHINode *PHI = dyn_cast_or_null<PHINode>(V))
+ Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get());
+ else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
Changed |=
RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get());
+ }
// The Rewriter may not be used from this point on.
@@ -2845,7 +1926,8 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
Function *F = L.getHeader()->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
- IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA);
+ IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA,
+ WidenIndVars && AllowIVWidening);
if (!IVS.run(&L))
return PreservedAnalyses::all();
@@ -2882,7 +1964,7 @@ struct IndVarSimplifyLegacyPass : public LoopPass {
if (MSSAAnalysis)
MSSA = &MSSAAnalysis->getMSSA();
- IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI, MSSA);
+ IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI, MSSA, AllowIVWidening);
return IVS.run(L);
}
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 30e4822b6769..6e09dec198c2 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -52,6 +52,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -109,12 +110,12 @@ static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
cl::init(false));
-static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
- cl::Hidden, cl::init(10));
-
static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
cl::Hidden, cl::init(false));
+static cl::opt<unsigned> MinRuntimeIterations("irce-min-runtime-iterations",
+ cl::Hidden, cl::init(10));
+
static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
cl::Hidden, cl::init(true));
@@ -145,7 +146,6 @@ class InductiveRangeCheck {
const SCEV *Step = nullptr;
const SCEV *End = nullptr;
Use *CheckUse = nullptr;
- bool IsSigned = true;
static bool parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE,
Value *&Index, Value *&Length,
@@ -160,7 +160,6 @@ public:
const SCEV *getBegin() const { return Begin; }
const SCEV *getStep() const { return Step; }
const SCEV *getEnd() const { return End; }
- bool isSigned() const { return IsSigned; }
void print(raw_ostream &OS) const {
OS << "InductiveRangeCheck:\n";
@@ -229,17 +228,27 @@ public:
SmallVectorImpl<InductiveRangeCheck> &Checks);
};
+struct LoopStructure;
+
class InductiveRangeCheckElimination {
ScalarEvolution &SE;
BranchProbabilityInfo *BPI;
DominatorTree &DT;
LoopInfo &LI;
+ using GetBFIFunc =
+ llvm::Optional<llvm::function_ref<llvm::BlockFrequencyInfo &()> >;
+ GetBFIFunc GetBFI;
+
+ // Returns true if it is profitable to do the transform, based on an estimate
+ // of the number of iterations.
+ bool isProfitableToTransform(const Loop &L, LoopStructure &LS);
+
public:
InductiveRangeCheckElimination(ScalarEvolution &SE,
BranchProbabilityInfo *BPI, DominatorTree &DT,
- LoopInfo &LI)
- : SE(SE), BPI(BPI), DT(DT), LI(LI) {}
+ LoopInfo &LI, GetBFIFunc GetBFI = None)
+ : SE(SE), BPI(BPI), DT(DT), LI(LI), GetBFI(GetBFI) {}
bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
};
@@ -394,7 +403,6 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
IRC.Begin = IndexAddRec->getStart();
IRC.Step = IndexAddRec->getStepRecurrence(SE);
IRC.CheckUse = &ConditionUse;
- IRC.IsSigned = IsSigned;
Checks.push_back(IRC);
}
@@ -497,9 +505,8 @@ struct LoopStructure {
return Result;
}
- static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
- BranchProbabilityInfo *BPI,
- Loop &, const char *&);
+ static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &, Loop &,
+ const char *&);
};
/// This class is used to constrain loops to run within a given iteration space.
@@ -743,8 +750,7 @@ static bool isSafeIncreasingBound(const SCEV *Start,
}
Optional<LoopStructure>
-LoopStructure::parseLoopStructure(ScalarEvolution &SE,
- BranchProbabilityInfo *BPI, Loop &L,
+LoopStructure::parseLoopStructure(ScalarEvolution &SE, Loop &L,
const char *&FailureReason) {
if (!L.isLoopSimplifyForm()) {
FailureReason = "loop not in LoopSimplify form";
@@ -779,16 +785,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
- BranchProbability ExitProbability =
- BPI ? BPI->getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx)
- : BranchProbability::getZero();
-
- if (!SkipProfitabilityChecks &&
- ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
- FailureReason = "short running loop, not profitable";
- return None;
- }
-
ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
FailureReason = "latch terminator branch not conditional on integral icmp";
@@ -1772,14 +1768,25 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F);
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
- InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+ // Get the BFI analysis result on demand. Note that modification of the CFG
+ // invalidates this analysis, and we need to handle that.
+ auto getBFI = [&F, &AM]() -> BlockFrequencyInfo & {
+ return AM.getResult<BlockFrequencyAnalysis>(F);
+ };
+ InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI, { getBFI });
bool Changed = false;
+ {
+ bool CFGChanged = false;
+ for (const auto &L : LI) {
+ CFGChanged |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
+ /*PreserveLCSSA=*/false);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+ Changed |= CFGChanged;
- for (const auto &L : LI) {
- Changed |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
- /*PreserveLCSSA=*/false);
- Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ if (CFGChanged && !SkipProfitabilityChecks)
+ AM.invalidate<BlockFrequencyAnalysis>(F);
}
SmallPriorityWorklist<Loop *, 4> Worklist;
@@ -1791,7 +1798,11 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
while (!Worklist.empty()) {
Loop *L = Worklist.pop_back_val();
- Changed |= IRCE.run(L, LPMAddNewLoop);
+ if (IRCE.run(L, LPMAddNewLoop)) {
+ Changed = true;
+ if (!SkipProfitabilityChecks)
+ AM.invalidate<BlockFrequencyAnalysis>(F);
+ }
}
if (!Changed)
@@ -1832,6 +1843,37 @@ bool IRCELegacyPass::runOnFunction(Function &F) {
return Changed;
}
+bool
+InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L,
+ LoopStructure &LS) {
+ if (SkipProfitabilityChecks)
+ return true;
+ if (GetBFI.hasValue()) {
+ BlockFrequencyInfo &BFI = (*GetBFI)();
+ uint64_t hFreq = BFI.getBlockFreq(LS.Header).getFrequency();
+ uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency();
+ if (phFreq != 0 && hFreq != 0 && (hFreq / phFreq < MinRuntimeIterations)) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove profitability: "
+ << "the estimated number of iterations basing on "
+ "frequency info is " << (hFreq / phFreq) << "\n";);
+ return false;
+ }
+ return true;
+ }
+
+ if (!BPI)
+ return true;
+ BranchProbability ExitProbability =
+ BPI->getEdgeProbability(LS.Latch, LS.LatchBrExitIdx);
+ if (ExitProbability > BranchProbability(1, MinRuntimeIterations)) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove profitability: "
+ << "the exit probability is too big " << ExitProbability
+ << "\n";);
+ return false;
+ }
+ return true;
+}
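The heuristic above amounts to two interchangeable trip-count estimates: with BFI, the estimated iteration count is the header frequency divided by the preheader frequency; with only BPI, a latch exit probability of about 1/N corresponds to roughly N expected iterations. A simplified standalone sketch of the BFI path (hypothetical helper and numbers, not the LLVM routine):

#include <cstdint>

// Profitable when the loop is estimated to run at least MinRuntimeIterations
// times per entry; with no frequency data, do not block the transform.
static bool profitableByFrequency(uint64_t HeaderFreq, uint64_t PreheaderFreq,
                                  unsigned MinRuntimeIterations) {
  if (HeaderFreq == 0 || PreheaderFreq == 0)
    return true;
  return HeaderFreq / PreheaderFreq >= MinRuntimeIterations;
}
// Example: HeaderFreq = 1000, PreheaderFreq = 50 gives ~20 iterations per
// entry, which passes the default irce-min-runtime-iterations value of 10.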
+
bool InductiveRangeCheckElimination::run(
Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
if (L->getBlocks().size() >= LoopSizeCutoff) {
@@ -1871,13 +1913,15 @@ bool InductiveRangeCheckElimination::run(
const char *FailureReason = nullptr;
Optional<LoopStructure> MaybeLoopStructure =
- LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
+ LoopStructure::parseLoopStructure(SE, *L, FailureReason);
if (!MaybeLoopStructure.hasValue()) {
LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
<< FailureReason << "\n";);
return false;
}
LoopStructure LS = MaybeLoopStructure.getValue();
+ if (!isProfitableToTransform(*L, LS))
+ return false;
const SCEVAddRecExpr *IndVar =
cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep)));
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index db9cc58bbfc4..332eb10ac16b 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -88,6 +88,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
@@ -108,6 +109,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
@@ -146,13 +148,7 @@ namespace {
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>;
-/// InferAddressSpaces
class InferAddressSpaces : public FunctionPass {
- const TargetTransformInfo *TTI = nullptr;
- const DataLayout *DL = nullptr;
-
- /// Target specific address space which uses of should be replaced if
- /// possible.
unsigned FlatAddrSpace = 0;
public:
@@ -168,8 +164,16 @@ public:
}
bool runOnFunction(Function &F) override;
+};
+
+class InferAddressSpacesImpl {
+ const TargetTransformInfo *TTI = nullptr;
+ const DataLayout *DL = nullptr;
+
+ /// Target-specific address space whose uses should be replaced if possible.
+ unsigned FlatAddrSpace = 0;
-private:
// Returns the new address space of V if updated; otherwise, returns None.
Optional<unsigned>
updateAddressSpace(const Value &V,
@@ -211,6 +215,11 @@ private:
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) const;
unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+
+public:
+ InferAddressSpacesImpl(const TargetTransformInfo *TTI, unsigned FlatAddrSpace)
+ : TTI(TTI), FlatAddrSpace(FlatAddrSpace) {}
+ bool run(Function &F);
};
} // end anonymous namespace
@@ -286,7 +295,8 @@ static bool isAddressExpression(const Value &V, const DataLayout &DL,
case Instruction::IntToPtr:
return isNoopPtrIntCastPair(Op, DL, TTI);
default:
- return false;
+ // That value is an address expression if it has an assumed address space.
+ return TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace;
}
}
@@ -325,9 +335,9 @@ getPointerOperands(const Value &V, const DataLayout &DL,
}
}
-bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
- Value *OldV,
- Value *NewV) const {
+bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
Module *M = II->getParent()->getParent()->getParent();
switch (II->getIntrinsicID()) {
@@ -354,7 +364,7 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
}
}
-void InferAddressSpaces::collectRewritableIntrinsicOperands(
+void InferAddressSpacesImpl::collectRewritableIntrinsicOperands(
IntrinsicInst *II, PostorderStackTy &PostorderStack,
DenseSet<Value *> &Visited) const {
auto IID = II->getIntrinsicID();
@@ -379,7 +389,7 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
// If V is an unvisited flat address expression, appends V to PostorderStack
// and marks it as visited.
-void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
+void InferAddressSpacesImpl::appendsFlatAddressExpressionToPostorderStack(
Value *V, PostorderStackTy &PostorderStack,
DenseSet<Value *> &Visited) const {
assert(V->getType()->isPointerTy());
@@ -394,8 +404,8 @@ void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
return;
}
- if (isAddressExpression(*V, *DL, TTI) &&
- V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
+ isAddressExpression(*V, *DL, TTI)) {
if (Visited.insert(V).second) {
PostorderStack.emplace_back(V, false);
@@ -413,7 +423,7 @@ void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
// Returns all flat address expressions in function F. The elements are
// ordered in postorder.
std::vector<WeakTrackingVH>
-InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
+InferAddressSpacesImpl::collectFlatAddressExpressions(Function &F) const {
// This function implements a non-recursive postorder traversal of a partial
// use-def graph of function F.
PostorderStackTy PostorderStack;
@@ -478,9 +488,12 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
}
// Otherwise, adds its operands to the stack and explores them.
PostorderStack.back().setInt(true);
- for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
- appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
- Visited);
+ // Skip values with an assumed address space.
+ if (TTI->getAssumedAddrSpace(TopVal) == UninitializedAddressSpace) {
+ for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
+ Visited);
+ }
}
}
return Postorder;
@@ -520,7 +533,7 @@ static Value *operandWithNewAddressSpaceOrCreateUndef(
//
// This may also return nullptr in the case the instruction could not be
// rewritten.
-Value *InferAddressSpaces::cloneInstructionWithNewAddressSpace(
+Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
Instruction *I, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
@@ -555,6 +568,16 @@ Value *InferAddressSpaces::cloneInstructionWithNewAddressSpace(
return nullptr;
}
+ unsigned AS = TTI->getAssumedAddrSpace(I);
+ if (AS != UninitializedAddressSpace) {
+ // For the assumed address space, insert an `addrspacecast` to make that
+ // explicit.
+ auto *NewPtrTy = I->getType()->getPointerElementType()->getPointerTo(AS);
+ auto *NewI = new AddrSpaceCastInst(I, NewPtrTy);
+ NewI->insertAfter(I);
+ return NewI;
+ }
+
// Computes the converted pointer operands.
SmallVector<Value *, 4> NewPointerOperands;
for (const Use &OperandUse : I->operands()) {
@@ -583,7 +606,7 @@ Value *InferAddressSpaces::cloneInstructionWithNewAddressSpace(
GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
GEP->getSourceElementType(), NewPointerOperands[0],
- SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+ SmallVector<Value *, 4>(GEP->indices()));
NewGEP->setIsInBounds(GEP->isInBounds());
return NewGEP;
}
@@ -695,13 +718,13 @@ static Value *cloneConstantExprWithNewAddressSpace(
// expression whose address space needs to be modified, in postorder.
//
// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
-Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
- Value *V, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
// All values in Postorder are flat address expressions.
- assert(isAddressExpression(*V, *DL, TTI) &&
- V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+ assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
+ isAddressExpression(*V, *DL, TTI));
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
@@ -721,8 +744,8 @@ Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
// Defines the join operation on the address space lattice (see the file header
// comments).
-unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
- unsigned AS2) const {
+unsigned InferAddressSpacesImpl::joinAddressSpaces(unsigned AS1,
+ unsigned AS2) const {
if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
return FlatAddrSpace;
@@ -735,11 +758,7 @@ unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
return (AS1 == AS2) ? AS1 : FlatAddrSpace;
}
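A standalone sketch of the join on this address-space lattice, using a hypothetical numbering with 0 as the flat address space and ignoring the uninitialized case handled by the lines elided between the two hunks:

// join(flat, X) = flat; join(X, X) = X; join(X, Y) = flat when X != Y.
static unsigned joinSketch(unsigned AS1, unsigned AS2, unsigned Flat = 0) {
  if (AS1 == Flat || AS2 == Flat)
    return Flat;
  return AS1 == AS2 ? AS1 : Flat;
}
// joinSketch(3, 3) == 3; joinSketch(3, 5) == 0; joinSketch(0, 3) == 0.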
-bool InferAddressSpaces::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+bool InferAddressSpacesImpl::run(Function &F) {
DL = &F.getParent()->getDataLayout();
if (AssumeDefaultIsFlatAddressSpace)
@@ -766,7 +785,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
// Constants need to be tracked through RAUW to handle cases with nested
// constant expressions, so wrap values in WeakTrackingVH.
-void InferAddressSpaces::inferAddressSpaces(
+void InferAddressSpacesImpl::inferAddressSpaces(
ArrayRef<WeakTrackingVH> Postorder,
ValueToAddrSpaceMapTy *InferredAddrSpace) const {
SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
@@ -810,7 +829,7 @@ void InferAddressSpaces::inferAddressSpaces(
}
}
-Optional<unsigned> InferAddressSpaces::updateAddressSpace(
+Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
assert(InferredAddrSpace.count(&V));
@@ -848,15 +867,24 @@ Optional<unsigned> InferAddressSpaces::updateAddressSpace(
else
NewAS = joinAddressSpaces(Src0AS, Src1AS);
} else {
- for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
- auto I = InferredAddrSpace.find(PtrOperand);
- unsigned OperandAS = I != InferredAddrSpace.end() ?
- I->second : PtrOperand->getType()->getPointerAddressSpace();
-
- // join(flat, *) = flat. So we can break if NewAS is already flat.
- NewAS = joinAddressSpaces(NewAS, OperandAS);
- if (NewAS == FlatAddrSpace)
- break;
+ unsigned AS = TTI->getAssumedAddrSpace(&V);
+ if (AS != UninitializedAddressSpace) {
+ // Use the assumed address space directly.
+ NewAS = AS;
+ } else {
+ // Otherwise, infer the address space from its pointer operands.
+ for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
+ auto I = InferredAddrSpace.find(PtrOperand);
+ unsigned OperandAS =
+ I != InferredAddrSpace.end()
+ ? I->second
+ : PtrOperand->getType()->getPointerAddressSpace();
+
+ // join(flat, *) = flat. So we can break if NewAS is already flat.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
}
}
@@ -947,7 +975,8 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
// Returns true if it is OK to change the address space of constant \p C with
// a ConstantExpr addrspacecast.
-bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+bool InferAddressSpacesImpl::isSafeToCastConstAddrSpace(Constant *C,
+ unsigned NewAS) const {
assert(NewAS != UninitializedAddressSpace);
unsigned SrcAS = C->getType()->getPointerAddressSpace();
@@ -986,7 +1015,7 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I,
return I;
}
-bool InferAddressSpaces::rewriteWithNewAddressSpaces(
+bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
// For each address expression to be modified, creates a clone of it with its
@@ -997,6 +1026,12 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
SmallVector<const Use *, 32> UndefUsesToFix;
for (Value* V : Postorder) {
unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+
+ // In some degenerate cases (e.g. invalid IR in unreachable code), we may
+ // not even be able to infer an address space for the value at all.
+ if (NewAddrSpace == UninitializedAddressSpace)
+ continue;
+
if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
Value *New = cloneValueWithNewAddressSpace(
V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
@@ -1062,6 +1097,9 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
}
User *CurUser = U.getUser();
+ // Skip if the current user is the new value itself.
+ if (CurUser == NewV)
+ continue;
// Handle more complex cases like intrinsic that need to be remangled.
if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
@@ -1148,6 +1186,34 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
return true;
}
+bool InferAddressSpaces::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ return InferAddressSpacesImpl(
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+ FlatAddrSpace)
+ .run(F);
+}
+
FunctionPass *llvm::createInferAddressSpacesPass(unsigned AddressSpace) {
return new InferAddressSpaces(AddressSpace);
}
+
+InferAddressSpacesPass::InferAddressSpacesPass()
+ : FlatAddrSpace(UninitializedAddressSpace) {}
+InferAddressSpacesPass::InferAddressSpacesPass(unsigned AddressSpace)
+ : FlatAddrSpace(AddressSpace) {}
+
+PreservedAnalyses InferAddressSpacesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed =
+ InferAddressSpacesImpl(&AM.getResult<TargetIRAnalysis>(F), FlatAddrSpace)
+ .run(F);
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
index e87b622ab19f..c11d2e4c1d6b 100644
--- a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
+++ b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -20,8 +20,10 @@
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"
+
using namespace llvm;
#define DEBUG_TYPE "instsimplify"
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 9d0500419a7f..96aef90c1c1a 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -104,6 +105,11 @@ static cl::opt<bool> PrintLVIAfterJumpThreading(
cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
cl::Hidden);
+static cl::opt<bool> JumpThreadingFreezeSelectCond(
+ "jump-threading-freeze-select-cond",
+ cl::desc("Freeze the condition when unfolding select"), cl::init(false),
+ cl::Hidden);
+
static cl::opt<bool> ThreadAcrossLoopHeaders(
"jump-threading-across-loop-headers",
cl::desc("Allow JumpThreading to thread across loop headers, for testing"),
@@ -133,7 +139,8 @@ namespace {
public:
static char ID; // Pass identification
- JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) {
+ JumpThreading(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1)
+ : FunctionPass(ID), Impl(InsertFreezeWhenUnfoldingSelect, T) {
initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
}
@@ -147,6 +154,7 @@ namespace {
AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
void releaseMemory() override { Impl.releaseMemory(); }
@@ -166,11 +174,12 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
// Public interface to the Jump Threading pass
-FunctionPass *llvm::createJumpThreadingPass(int Threshold) {
- return new JumpThreading(Threshold);
+FunctionPass *llvm::createJumpThreadingPass(bool InsertFr, int Threshold) {
+ return new JumpThreading(InsertFr, Threshold);
}
-JumpThreadingPass::JumpThreadingPass(int T) {
+JumpThreadingPass::JumpThreadingPass(bool InsertFr, int T) {
+ InsertFreezeWhenUnfoldingSelect = JumpThreadingFreezeSelectCond | InsertFr;
DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
}
@@ -304,6 +313,10 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ auto TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ // Jump threading makes no sense for targets with divergent control flow.
+ if (TTI->hasBranchDivergence())
+ return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
@@ -328,6 +341,10 @@ bool JumpThreading::runOnFunction(Function &F) {
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ // Jump threading makes no sense for targets with divergent control flow.
+ if (TTI.hasBranchDivergence())
+ return PreservedAnalyses::all();
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
@@ -345,6 +362,11 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
std::move(BFI), std::move(BPI));
+ if (PrintLVIAfterJumpThreading) {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ LVI.printLVI(F, DTU.getDomTree(), dbgs());
+ }
+
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -397,7 +419,7 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
Unreachable.insert(&BB);
if (!ThreadAcrossLoopHeaders)
- FindLoopHeaders(F);
+ findLoopHeaders(F);
bool EverChanged = false;
bool Changed;
@@ -406,7 +428,7 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
for (auto &BB : F) {
if (Unreachable.count(&BB))
continue;
- while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
+ while (processBlock(&BB)) // Thread all of the branches we can over BB.
Changed = true;
// Jump threading may have introduced redundant debug values into BB
@@ -421,7 +443,7 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
continue;
if (pred_empty(&BB)) {
- // When ProcessBlock makes BB unreachable it doesn't bother to fix up
+ // When processBlock makes BB unreachable it doesn't bother to fix up
// the instructions in it. We must remove BB to prevent invalid IR.
LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
<< "' with terminator: " << *BB.getTerminator()
@@ -433,7 +455,7 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
continue;
}
- // ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
+ // processBlock doesn't thread BBs with unconditional TIs. However, if BB
// is "almost empty", we attempt to merge BB with its sole successor.
auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
if (BI && BI->isUnconditional()) {
@@ -467,7 +489,7 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
// at the end of block. RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
-static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
+static void replaceFoldableUses(Instruction *Cond, Value *ToVal) {
assert(Cond->getType() == ToVal->getType());
auto *BB = Cond->getParent();
// We can unconditionally replace all uses in non-local blocks (i.e. uses
@@ -531,10 +553,18 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
// Debugger intrinsics don't incur code size.
if (isa<DbgInfoIntrinsic>(I)) continue;
+ // Pseudo-probes don't incur code size.
+ if (isa<PseudoProbeInst>(I))
+ continue;
+
// If this is a pointer->pointer bitcast, it is free.
if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
continue;
+ // A freeze instruction is free, too.
+ if (isa<FreezeInst>(I))
+ continue;
+
// Bail out if this instruction gives back a token type, it is not possible
// to duplicate it if it is used outside this BB.
if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
@@ -562,7 +592,7 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
return Size > Bonus ? Size - Bonus : 0;
}
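In outline, the cost returned above is the number of instructions that are not free to duplicate (debug intrinsics, pseudo-probes, pointer-to-pointer bitcasts and freezes are free), minus the bonus, clamped at zero. A minimal standalone model of that bookkeeping (hypothetical helper, not the LLVM routine):

#include <vector>

// Each entry records whether the corresponding instruction is free to duplicate.
static unsigned duplicationCostSketch(const std::vector<bool> &IsFree,
                                      unsigned Bonus) {
  unsigned Size = 0;
  for (bool Free : IsFree)
    if (!Free)
      ++Size;  // only non-free instructions count toward the size
  return Size > Bonus ? Size - Bonus : 0;
}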
-/// FindLoopHeaders - We do not want jump threading to turn proper loop
+/// findLoopHeaders - We do not want jump threading to turn proper loop
/// structures into irreducible loops. Doing this breaks up the loop nesting
/// hierarchy and pessimizes later transformations. To prevent this from
/// happening, we first have to find the loop headers. Here we approximate this
@@ -576,7 +606,7 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
/// within the loop (forming a nested loop). This simple analysis is not rich
/// enough to track all of these properties and keep it up-to-date as the CFG
/// mutates, so we don't allow any of these transformations.
-void JumpThreadingPass::FindLoopHeaders(Function &F) {
+void JumpThreadingPass::findLoopHeaders(Function &F) {
SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
FindFunctionBackedges(F, Edges);
@@ -603,13 +633,13 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
return dyn_cast<ConstantInt>(Val);
}
-/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
+/// computeValueKnownInPredecessors - Given a basic block BB and a value V, see
/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
/// in any of our predecessors. If so, return the known list of value and pred
/// BB in the result vector.
///
/// This returns true if there were any known values.
-bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
+bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
Value *V, BasicBlock *BB, PredValueInfo &Result,
ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
Instruction *CxtI) {
@@ -674,13 +704,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
return !Result.empty();
}
- // Handle Cast instructions. Only see through Cast when the source operand is
- // PHI or Cmp to save the compilation time.
+ // Handle Cast instructions.
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Source = CI->getOperand(0);
- if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
- return false;
- ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
+ computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
RecursionSet, CxtI);
if (Result.empty())
return false;
@@ -692,6 +719,18 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
return true;
}
+ if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
+ Value *Source = FI->getOperand(0);
+ computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
+ RecursionSet, CxtI);
+
+ erase_if(Result, [](auto &Pair) {
+ return !isGuaranteedNotToBeUndefOrPoison(Pair.first);
+ });
+
+ return !Result.empty();
+ }
+
// Handle some boolean conditions.
if (I->getType()->getPrimitiveSizeInBits() == 1) {
assert(Preference == WantInteger && "One-bit non-integer type?");
@@ -701,9 +740,9 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
I->getOpcode() == Instruction::And) {
PredValueInfoTy LHSVals, RHSVals;
- ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
+ computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
- ComputeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals,
+ computeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals,
WantInteger, RecursionSet, CxtI);
if (LHSVals.empty() && RHSVals.empty())
@@ -739,7 +778,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
if (I->getOpcode() == Instruction::Xor &&
isa<ConstantInt>(I->getOperand(1)) &&
cast<ConstantInt>(I->getOperand(1))->isOne()) {
- ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result,
+ computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result,
WantInteger, RecursionSet, CxtI);
if (Result.empty())
return false;
@@ -757,7 +796,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
&& "A binary operator creating a block address?");
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
PredValueInfoTy LHSVals;
- ComputeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
+ computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
// Try to use constant folding to simplify the binary operator.
@@ -891,7 +930,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
// Try to find a constant value for the LHS of a comparison,
// and evaluate it statically if we can.
PredValueInfoTy LHSVals;
- ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
+ computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
for (const auto &LHSVal : LHSVals) {
@@ -912,7 +951,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
PredValueInfoTy Conds;
if ((TrueVal || FalseVal) &&
- ComputeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds,
+ computeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds,
WantInteger, RecursionSet, CxtI)) {
for (auto &C : Conds) {
Constant *Cond = C.first;
@@ -940,7 +979,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
}
// If all else fails, see if LVI can figure out a constant value for us.
- Constant *CI = LVI->getConstant(V, BB, CxtI);
+ assert(CxtI->getParent() == BB && "CxtI should be in BB");
+ Constant *CI = LVI->getConstant(V, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
Result.emplace_back(KC, Pred);
@@ -954,7 +994,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
///
/// Since we can pick an arbitrary destination, we pick the successor with the
/// fewest predecessors. This should reduce the in-degree of the others.
-static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
+static unsigned getBestDestForJumpOnUndef(BasicBlock *BB) {
Instruction *BBTerm = BB->getTerminator();
unsigned MinSucc = 0;
BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
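A simplified sketch of the selection described above: among the terminator's successors, pick the one that already has the fewest predecessors, so the more heavily used successors lose this block as a predecessor (hypothetical helper operating on precomputed predecessor counts):

#include <vector>

// Returns the index of the successor with the fewest predecessors.
static unsigned bestSuccSketch(const std::vector<unsigned> &PredCounts) {
  unsigned Best = 0;
  for (unsigned i = 1; i < PredCounts.size(); ++i)
    if (PredCounts[i] < PredCounts[Best])
      Best = i;
  return Best;
}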
@@ -982,9 +1022,9 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
return !BA->use_empty();
}
-/// ProcessBlock - If there are any predecessors whose control can be threaded
+/// processBlock - If there are any predecessors whose control can be threaded
/// through to a successor, transform them now.
-bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
+bool JumpThreadingPass::processBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
if (DTU->isBBPendingDeletion(BB) ||
@@ -995,14 +1035,14 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// successor, merge the blocks. This encourages recursive jump threading
// because now the condition in this block can be threaded through
// predecessors of our predecessor block.
- if (MaybeMergeBasicBlockIntoOnlyPred(BB))
+ if (maybeMergeBasicBlockIntoOnlyPred(BB))
return true;
- if (TryToUnfoldSelectInCurrBB(BB))
+ if (tryToUnfoldSelectInCurrBB(BB))
return true;
// Look if we can propagate guards to predecessors.
- if (HasGuards && ProcessGuards(BB))
+ if (HasGuards && processGuards(BB))
return true;
// What kind of constant we're looking for.
@@ -1027,6 +1067,9 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
return false; // Must be an invoke or callbr.
}
+ // Keep track if we constant folded the condition in this invocation.
+ bool ConstantFolded = false;
+
// Run constant folding to see if we can reduce the condition to a simple
// constant.
if (Instruction *I = dyn_cast<Instruction>(Condition)) {
@@ -1037,13 +1080,16 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (isInstructionTriviallyDead(I, TLI))
I->eraseFromParent();
Condition = SimpleVal;
+ ConstantFolded = true;
}
}
- // If the terminator is branching on an undef, we can pick any of the
- // successors to branch to. Let GetBestDestForJumpOnUndef decide.
- if (isa<UndefValue>(Condition)) {
- unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+ // If the terminator is branching on an undef or a frozen undef, we can pick
+ // any of the successors to branch to. Let getBestDestForJumpOnUndef decide.
+ auto *FI = dyn_cast<FreezeInst>(Condition);
+ if (isa<UndefValue>(Condition) ||
+ (FI && isa<UndefValue>(FI->getOperand(0)) && FI->hasOneUse())) {
+ unsigned BestSucc = getBestDestForJumpOnUndef(BB);
std::vector<DominatorTree::UpdateType> Updates;
// Fold the branch/switch.
@@ -1061,6 +1107,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
BBTerm->eraseFromParent();
DTU->applyUpdatesPermissive(Updates);
+ if (FI)
+ FI->eraseFromParent();
return true;
}
@@ -1073,6 +1121,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
<< '\n');
++NumFolds;
ConstantFoldTerminator(BB, true, nullptr, DTU);
+ if (HasProfileData)
+ BPI->eraseBlock(BB);
return true;
}
@@ -1081,9 +1131,9 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// All the rest of our checks depend on the condition being an instruction.
if (!CondInst) {
// FIXME: Unify this with code below.
- if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
+ if (processThreadableEdges(Condition, BB, Preference, Terminator))
return true;
- return false;
+ return ConstantFolded;
}
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
@@ -1124,22 +1174,24 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
auto *CI = Ret == LazyValueInfo::True ?
ConstantInt::getTrue(CondCmp->getType()) :
ConstantInt::getFalse(CondCmp->getType());
- ReplaceFoldableUses(CondCmp, CI);
+ replaceFoldableUses(CondCmp, CI);
}
DTU->applyUpdatesPermissive(
{{DominatorTree::Delete, BB, ToRemoveSucc}});
+ if (HasProfileData)
+ BPI->eraseBlock(BB);
return true;
}
// We did not manage to simplify this branch, try to see whether
// CondCmp depends on a known phi-select pattern.
- if (TryToUnfoldSelect(CondCmp, BB))
+ if (tryToUnfoldSelect(CondCmp, BB))
return true;
}
}
if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
- if (TryToUnfoldSelect(SI, BB))
+ if (tryToUnfoldSelect(SI, BB))
return true;
// Check for some cases that are worth simplifying. Right now we want to look
@@ -1147,6 +1199,11 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// we see one, check to see if it's partially redundant. If so, insert a PHI
// which can then be used to thread the values.
Value *SimplifyValue = CondInst;
+
+ if (auto *FI = dyn_cast<FreezeInst>(SimplifyValue))
+ // Look through the freeze's operand.
+ SimplifyValue = FI->getOperand(0);
+
if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
if (isa<Constant>(CondCmp->getOperand(1)))
SimplifyValue = CondCmp->getOperand(0);
@@ -1154,7 +1211,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: There are other places where load PRE would be profitable, such as
// more complex comparisons.
if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
- if (SimplifyPartiallyRedundantLoad(LoadI))
+ if (simplifyPartiallyRedundantLoad(LoadI))
return true;
// Before threading, try to propagate profile data backwards:
@@ -1165,29 +1222,32 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
- if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
+ if (processThreadableEdges(CondInst, BB, Preference, Terminator))
return true;
- // If this is an otherwise-unfoldable branch on a phi node in the current
- // block, see if we can simplify.
- if (PHINode *PN = dyn_cast<PHINode>(CondInst))
- if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
- return ProcessBranchOnPHI(PN);
+ // If this is an otherwise-unfoldable branch on a phi node or freeze(phi) in
+ // the current block, see if we can simplify.
+ PHINode *PN = dyn_cast<PHINode>(
+ isa<FreezeInst>(CondInst) ? cast<FreezeInst>(CondInst)->getOperand(0)
+ : CondInst);
+
+ if (PN && PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ return processBranchOnPHI(PN);
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
- return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+ return processBranchOnXOR(cast<BinaryOperator>(CondInst));
// Search for a stronger dominating condition that can be used to simplify a
// conditional branch leaving BB.
- if (ProcessImpliedCondition(BB))
+ if (processImpliedCondition(BB))
return true;
return false;
}
-bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
+bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) {
auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
@@ -1217,6 +1277,8 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
UncondBI->setDebugLoc(BI->getDebugLoc());
BI->eraseFromParent();
DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}});
+ if (HasProfileData)
+ BPI->eraseBlock(BB);
return true;
}
CurrentBB = CurrentPred;
@@ -1234,11 +1296,11 @@ static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
return false;
}
-/// SimplifyPartiallyRedundantLoad - If LoadI is an obviously partially
+/// simplifyPartiallyRedundantLoad - If LoadI is an obviously partially
/// redundant load instruction, eliminate it by replacing it with a PHI node.
/// This is an important optimization that encourages jump threading, and needs
/// to be run interlaced with other jump threading tasks.
-bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
+bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// Don't hack volatile and ordered loads.
if (!LoadI->isUnordered()) return false;
@@ -1408,7 +1470,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
}
// Split them out to their own block.
- UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
+ UnavailablePred = splitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
}
// If the value isn't available in all predecessors, then there will be
@@ -1472,11 +1534,11 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
return true;
}
-/// FindMostPopularDest - The specified list contains multiple possible
+/// findMostPopularDest - The specified list contains multiple possible
/// threadable destinations. Pick the one that occurs the most frequently in
/// the list.
static BasicBlock *
-FindMostPopularDest(BasicBlock *BB,
+findMostPopularDest(BasicBlock *BB,
const SmallVectorImpl<std::pair<BasicBlock *,
BasicBlock *>> &PredToDestList) {
assert(!PredToDestList.empty());
@@ -1511,7 +1573,7 @@ FindMostPopularDest(BasicBlock *BB,
// Try to evaluate the value of V when the control flows from PredPredBB to
// BB->getSinglePredecessor() and then on to BB.
-Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB,
+Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB,
BasicBlock *PredPredBB,
Value *V) {
BasicBlock *PredBB = BB->getSinglePredecessor();
@@ -1538,9 +1600,9 @@ Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB,
if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
if (CondCmp->getParent() == BB) {
Constant *Op0 =
- EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
+ evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
Constant *Op1 =
- EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
+ evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
if (Op0 && Op1) {
return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
}
@@ -1551,7 +1613,7 @@ Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB,
return nullptr;
}
-bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB,
ConstantPreference Preference,
Instruction *CxtI) {
// If threading this would thread across a loop header, don't even try to
@@ -1560,15 +1622,15 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
return false;
PredValueInfoTy PredValues;
- if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference,
+ if (!computeValueKnownInPredecessors(Cond, BB, PredValues, Preference,
CxtI)) {
// We don't have known values in predecessors. See if we can thread through
// BB and its sole predecessor.
- return MaybeThreadThroughTwoBasicBlocks(BB, Cond);
+ return maybethreadThroughTwoBasicBlocks(BB, Cond);
}
assert(!PredValues.empty() &&
- "ComputeValueKnownInPredecessors returned true with no values");
+ "computeValueKnownInPredecessors returned true with no values");
LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
for (const auto &PredValue : PredValues) {
@@ -1660,6 +1722,8 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
BranchInst::Create(OnlyDest, Term);
Term->eraseFromParent();
DTU->applyUpdatesPermissive(Updates);
+ if (HasProfileData)
+ BPI->eraseBlock(BB);
// If the condition is now dead due to the removal of the old terminator,
// erase it.
@@ -1675,7 +1739,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
// guard/assume.
else if (OnlyVal && OnlyVal != MultipleVal &&
CondInst->getParent() == BB)
- ReplaceFoldableUses(CondInst, OnlyVal);
+ replaceFoldableUses(CondInst, OnlyVal);
}
return true;
}
@@ -1688,18 +1752,18 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
BasicBlock *MostPopularDest = OnlyDest;
if (MostPopularDest == MultipleDestSentinel) {
- // Remove any loop headers from the Dest list, ThreadEdge conservatively
+ // Remove any loop headers from the Dest list, threadEdge conservatively
// won't process them, but we might have other destinations that are eligible
// and that we still want to process.
erase_if(PredToDestList,
[&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
- return LoopHeaders.count(PredToDest.second) != 0;
+ return LoopHeaders.contains(PredToDest.second);
});
if (PredToDestList.empty())
return false;
- MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+ MostPopularDest = findMostPopularDest(BB, PredToDestList);
}
// Now that we know what the most popular destination is, factor all
@@ -1721,16 +1785,16 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
// the destination that these predecessors should get to.
if (!MostPopularDest)
MostPopularDest = BB->getTerminator()->
- getSuccessor(GetBestDestForJumpOnUndef(BB));
+ getSuccessor(getBestDestForJumpOnUndef(BB));
// Ok, try to thread it!
- return TryThreadEdge(BB, PredsToFactor, MostPopularDest);
+ return tryThreadEdge(BB, PredsToFactor, MostPopularDest);
}
-/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
-/// a PHI node in the current block. See if there are any simplifications we
-/// can do based on inputs to the phi node.
-bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
+/// processBranchOnPHI - We have an otherwise unthreadable conditional branch on
+/// a PHI node (or freeze PHI) in the current block. See if there are any
+/// simplifications we can do based on inputs to the phi node.
+bool JumpThreadingPass::processBranchOnPHI(PHINode *PN) {
BasicBlock *BB = PN->getParent();
// TODO: We could make use of this to do it once for blocks with common PHI
@@ -1742,13 +1806,16 @@ bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
// *duplicate* the conditional branch into that block in order to further
// encourage jump threading and to eliminate cases where we have branch on a
// phi of an icmp (branch on icmp is much better).
+ // This is still beneficial when a frozen phi is used as the branch condition
+ // because it allows CodeGenPrepare to further canonicalize br(freeze(icmp))
+ // to br(icmp(freeze ...)).
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
if (PredBr->isUnconditional()) {
PredBBs[0] = PredBB;
// Try to duplicate BB into PredBB.
- if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+ if (duplicateCondBranchOnPHIIntoPred(BB, PredBBs))
return true;
}
}
@@ -1756,10 +1823,10 @@ bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
return false;
}
-/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
+/// processBranchOnXOR - We have an otherwise unthreadable conditional branch on
/// a xor instruction in the current block. See if there are any
/// simplifications we can do based on inputs to the xor.
-bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
+bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) {
BasicBlock *BB = BO->getParent();
// If either the LHS or RHS of the xor is a constant, don't do this
@@ -1797,17 +1864,17 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
PredValueInfoTy XorOpValues;
bool isLHS = true;
- if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
+ if (!computeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
WantInteger, BO)) {
assert(XorOpValues.empty());
- if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
+ if (!computeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
WantInteger, BO))
return false;
isLHS = false;
}
assert(!XorOpValues.empty() &&
- "ComputeValueKnownInPredecessors returned true with no values");
+ "computeValueKnownInPredecessors returned true with no values");
// Scan the information to see which is most popular: true or false. The
// predecessors can be of the set true, false, or undef.
@@ -1859,14 +1926,22 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
return true;
}
+ // If any of the predecessors ends with an indirect goto, we can't change its
+ // destination. The same goes for CallBr.
+ if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator()) ||
+ isa<CallBrInst>(Pred->getTerminator());
+ }))
+ return false;
+
// Try to duplicate BB into PredBB.
- return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+ return duplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
}
-/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
+/// addPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
/// NewPred using the entries from OldPred (suitably mapped).
-static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
+static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
BasicBlock *OldPred,
BasicBlock *NewPred,
DenseMap<Instruction*, Value*> &ValueMap) {
@@ -1887,7 +1962,7 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
}
/// Merge basic block BB into its sole predecessor if possible.
-bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
+bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
BasicBlock *SinglePred = BB->getSinglePredecessor();
if (!SinglePred)
return false;
@@ -1938,7 +2013,7 @@ bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
/// Update the SSA form. NewBB contains instructions that are copied from BB.
/// ValueMapping maps old values in BB to new ones in NewBB.
-void JumpThreadingPass::UpdateSSA(
+void JumpThreadingPass::updateSSA(
BasicBlock *BB, BasicBlock *NewBB,
DenseMap<Instruction *, Value *> &ValueMapping) {
// If there were values defined in BB that are used outside the block, then we
@@ -1984,7 +2059,7 @@ void JumpThreadingPass::UpdateSSA(
/// arguments that come from PredBB. Return the map from the variables in the
/// source basic block to the variables in the newly created basic block.
DenseMap<Instruction *, Value *>
-JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI,
+JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
BasicBlock::iterator BE, BasicBlock *NewBB,
BasicBlock *PredBB) {
// We are going to have to map operands from the source basic block to the new
@@ -2023,7 +2098,7 @@ JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI,
}
/// Attempt to thread through two successive basic blocks.
-bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB,
+bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB,
Value *Cond) {
// Consider:
//
@@ -2092,7 +2167,7 @@ bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB,
BasicBlock *OnePred = nullptr;
for (BasicBlock *P : predecessors(PredBB)) {
if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
- EvaluateOnPredecessorEdge(BB, P, Cond))) {
+ evaluateOnPredecessorEdge(BB, P, Cond))) {
if (CI->isZero()) {
ZeroCount++;
ZeroPred = P;
@@ -2123,7 +2198,7 @@ bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB,
}
// If threading this would thread across a loop header, don't thread the edge.
- // See the comments above FindLoopHeaders for justifications and caveats.
+ // See the comments above findLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
LLVM_DEBUG({
bool BBIsHeader = LoopHeaders.count(BB);
@@ -2156,11 +2231,11 @@ bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB,
}
// Now we are ready to duplicate PredBB.
- ThreadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB);
+ threadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB);
return true;
}
-void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
+void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
BasicBlock *PredBB,
BasicBlock *BB,
BasicBlock *SuccBB) {
@@ -2186,7 +2261,11 @@ void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
// copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them
// to account for entry from PredPredBB.
DenseMap<Instruction *, Value *> ValueMapping =
- CloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
+ cloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
+
+ // Copy the edge probabilities from PredBB to NewBB.
+ if (HasProfileData)
+ BPI->copyEdgeProbabilities(PredBB, NewBB);
// Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
// This eliminates predecessors from PredPredBB, which requires us to simplify
@@ -2198,9 +2277,9 @@ void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
PredPredTerm->setSuccessor(i, NewBB);
}
- AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB,
+ addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB,
ValueMapping);
- AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB,
+ addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB,
ValueMapping);
DTU->applyUpdatesPermissive(
@@ -2209,7 +2288,7 @@ void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
{DominatorTree::Insert, PredPredBB, NewBB},
{DominatorTree::Delete, PredPredBB, PredBB}});
- UpdateSSA(PredBB, NewBB, ValueMapping);
+ updateSSA(PredBB, NewBB, ValueMapping);
// Clean up things like PHI nodes with single operands, dead instructions,
// etc.
@@ -2218,11 +2297,11 @@ void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
SmallVector<BasicBlock *, 1> PredsToFactor;
PredsToFactor.push_back(NewBB);
- ThreadEdge(BB, PredsToFactor, SuccBB);
+ threadEdge(BB, PredsToFactor, SuccBB);
}
-/// TryThreadEdge - Thread an edge if it's safe and profitable to do so.
-bool JumpThreadingPass::TryThreadEdge(
+/// tryThreadEdge - Thread an edge if it's safe and profitable to do so.
+bool JumpThreadingPass::tryThreadEdge(
BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
@@ -2233,7 +2312,7 @@ bool JumpThreadingPass::TryThreadEdge(
}
// If threading this would thread across a loop header, don't thread the edge.
- // See the comments above FindLoopHeaders for justifications and caveats.
+ // See the comments above findLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
LLVM_DEBUG({
bool BBIsHeader = LoopHeaders.count(BB);
@@ -2254,14 +2333,14 @@ bool JumpThreadingPass::TryThreadEdge(
return false;
}
- ThreadEdge(BB, PredBBs, SuccBB);
+ threadEdge(BB, PredBBs, SuccBB);
return true;
}
-/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// threadEdge - We have decided that it is safe and profitable to factor the
/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
/// across BB. Transform the IR to reflect this change.
-void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
+void JumpThreadingPass::threadEdge(BasicBlock *BB,
const SmallVectorImpl<BasicBlock *> &PredBBs,
BasicBlock *SuccBB) {
assert(SuccBB != BB && "Don't create an infinite loop");
@@ -2276,7 +2355,7 @@ void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
else {
LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
- PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm");
}
// And finally, do it!
@@ -2300,7 +2379,7 @@ void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// Copy all the instructions from BB to NewBB except the terminator.
DenseMap<Instruction *, Value *> ValueMapping =
- CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
+ cloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
// We didn't copy the terminator from BB over to NewBB, because there is now
// an unconditional jump to SuccBB. Insert the unconditional jump.
@@ -2309,7 +2388,7 @@ void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
// PHI nodes for NewBB now.
- AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+ addPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
// Update the terminator of PredBB to jump to NewBB instead of BB. This
// eliminates predecessors from BB, which requires us to simplify any PHI
@@ -2326,7 +2405,7 @@ void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
{DominatorTree::Insert, PredBB, NewBB},
{DominatorTree::Delete, PredBB, BB}});
- UpdateSSA(BB, NewBB, ValueMapping);
+ updateSSA(BB, NewBB, ValueMapping);
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
@@ -2334,7 +2413,7 @@ void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
SimplifyInstructionsInBlock(NewBB, TLI);
// Update the edge weight from BB to SuccBB, which should be less than before.
- UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
+ updateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
// Threaded an edge!
++NumThreads;
@@ -2343,7 +2422,7 @@ void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
/// Create a new basic block that will be the predecessor of BB and successor of
/// all blocks in Preds. When profile data is available, update the frequency of
/// this new block.
-BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
+BasicBlock *JumpThreadingPass::splitBlockPreds(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix) {
SmallVector<BasicBlock *, 2> NewBBs;
@@ -2404,7 +2483,7 @@ bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
/// Update the block frequency of BB and branch weight and the metadata on the
/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
/// Freq(PredBB->BB) / Freq(BB->SuccBB).
-void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
BasicBlock *BB,
BasicBlock *NewBB,
BasicBlock *SuccBB) {
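As a back-of-the-envelope illustration of the scaling formula described above, the standalone sketch below recomputes a successor weight with invented frequencies; the numbers and names are purely hypothetical and are not taken from the pass.

#include <cstdint>
#include <cstdio>

int main() {
  // Invented profile data: the edge PredBB->BB carried 30 units of frequency
  // and is being redirected to NewBB; BB->SuccBB had frequency 100 and a
  // branch weight of 80 in its metadata.
  uint64_t FreqPredToBB = 30;
  uint64_t FreqBBToSucc = 100;
  uint64_t OldWeight = 80;

  // Scale by 1 - Freq(PredBB->BB) / Freq(BB->SuccBB): the probability mass
  // that used to reach SuccBB through PredBB now flows through NewBB instead.
  double Scale = 1.0 - (double)FreqPredToBB / (double)FreqBBToSucc; // 0.7
  uint64_t NewWeight = (uint64_t)(OldWeight * Scale);               // 56
  std::printf("scaled weight on BB->SuccBB: %llu\n",
              (unsigned long long)NewWeight);
  return 0;
}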
@@ -2496,18 +2575,18 @@ void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
}
}
-/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
+/// duplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
/// If we can duplicate the contents of BB up into PredBB do so now, this
/// improves the odds that the branch will be on an analyzable instruction like
/// a compare.
-bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
+bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
assert(!PredBBs.empty() && "Can't handle an empty set");
// If BB is a loop header, then duplicating this block outside the loop would
// cause us to transform this into an irreducible loop, don't do this.
- // See the comments above FindLoopHeaders for justifications and caveats.
+ // See the comments above findLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB)) {
LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
<< "' into predecessor block '" << PredBBs[0]->getName()
@@ -2531,7 +2610,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
else {
LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
- PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm");
}
Updates.push_back({DominatorTree::Delete, PredBB, BB});
@@ -2603,12 +2682,12 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Check to see if the targets of the branch had PHI nodes. If so, we need to
// add entries to the PHI nodes for branch from PredBB now.
BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
- AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
+ addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
ValueMapping);
- AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
+ addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
ValueMapping);
- UpdateSSA(BB, PredBB, ValueMapping);
+ updateSSA(BB, PredBB, ValueMapping);
// PredBB no longer jumps to BB, remove entries in the PHI node for the edge
// that we nuked.
@@ -2616,6 +2695,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Remove the unconditional branch at the end of the PredBB block.
OldPredBranch->eraseFromParent();
+ if (HasProfileData)
+ BPI->copyEdgeProbabilities(BB, PredBB);
DTU->applyUpdatesPermissive(Updates);
++NumDupes;
@@ -2627,7 +2708,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// a PHI node in BB. SI has no other use.
// A new basic block, NewBB, is created and SI is converted to compare and
// conditional branch. SI is erased from parent.
-void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
+void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
SelectInst *SI, PHINode *SIUse,
unsigned Idx) {
// Expand the select.
@@ -2662,7 +2743,7 @@ void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
}
-bool JumpThreadingPass::TryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
+bool JumpThreadingPass::tryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
PHINode *CondPHI = dyn_cast<PHINode>(SI->getCondition());
if (!CondPHI || CondPHI->getParent() != BB)
@@ -2674,7 +2755,7 @@ bool JumpThreadingPass::TryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
// The second and third condition can be potentially relaxed. Currently
// the conditions help to simplify the code and allow us to reuse existing
- // code, developed for TryToUnfoldSelect(CmpInst *, BasicBlock *)
+ // code, developed for tryToUnfoldSelect(CmpInst *, BasicBlock *)
if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
continue;
@@ -2682,13 +2763,13 @@ bool JumpThreadingPass::TryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
if (!PredTerm || !PredTerm->isUnconditional())
continue;
- UnfoldSelectInstr(Pred, BB, PredSI, CondPHI, I);
+ unfoldSelectInstr(Pred, BB, PredSI, CondPHI, I);
return true;
}
return false;
}
-/// TryToUnfoldSelect - Look for blocks of the form
+/// tryToUnfoldSelect - Look for blocks of the form
/// bb1:
/// %a = select
/// br bb2
@@ -2700,7 +2781,7 @@ bool JumpThreadingPass::TryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
///
/// And expand the select into a branch structure if one of its arms allows %c
/// to be folded. This later enables threading from bb1 over bb2.
-bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
+bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
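The bb1/bb2 shape described in the comment above can be pictured at the source level; the functions below are an illustrative sketch only (fast_path, slow_path and the variable names are made up), not code produced or consumed by the pass.

extern void fast_path();
extern void slow_path();

// Shape the pass looks for: a select in one block feeding a compare and
// conditional branch in its successor.
void before(bool p, int x) {
  int a = p ? 0 : x;   // bb1: %a = select
  if (a == 0)          // bb2: %c = icmp eq %a, 0; br %c, ...
    fast_path();
  else
    slow_path();
}

// After unfolding the select into an explicit branch on p, the a == 0 test is
// trivially true on the p arm, so jump threading can route that edge straight
// to fast_path() without re-evaluating the compare.
void after(bool p, int x) {
  if (p) {
    fast_path();
  } else if (x == 0) {
    fast_path();
  } else {
    slow_path();
  }
}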
@@ -2734,14 +2815,14 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
if ((LHSFolds != LazyValueInfo::Unknown ||
RHSFolds != LazyValueInfo::Unknown) &&
LHSFolds != RHSFolds) {
- UnfoldSelectInstr(Pred, BB, SI, CondLHS, I);
+ unfoldSelectInstr(Pred, BB, SI, CondLHS, I);
return true;
}
}
return false;
}
-/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
+/// tryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
/// same BB in the form
/// bb:
/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
@@ -2761,19 +2842,14 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
/// select if the associated PHI has at least one constant. If the unfolded
/// select is not jump-threaded, it will be folded again in the later
/// optimizations.
-bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
- // This transform can introduce a UB (a conditional branch that depends on a
- // poison value) that was not present in the original program. See
- // @TryToUnfoldSelectInCurrBB test in test/Transforms/JumpThreading/select.ll.
+bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) {
+ // This transform would reduce the quality of msan diagnostics.
// Disable this transform under MemorySanitizer.
- // FIXME: either delete it or replace with a valid transform. This issue is
- // not limited to MemorySanitizer (but has only been observed as an MSan false
- // positive in practice so far).
if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
return false;
// If threading this would thread across a loop header, don't thread the edge.
- // See the comments above FindLoopHeaders for justifications and caveats.
+ // See the comments above findLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB))
return false;
@@ -2816,8 +2892,12 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
if (!SI)
continue;
// Expand the select.
- Instruction *Term =
- SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ Value *Cond = SI->getCondition();
+ if (InsertFreezeWhenUnfoldingSelect &&
+ !isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI,
+ &DTU->getDomTree()))
+ Cond = new FreezeInst(Cond, "cond.fr", SI);
+ Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false);
BasicBlock *SplitBB = SI->getParent();
BasicBlock *NewBB = Term->getParent();
PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
@@ -2861,7 +2941,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
/// And cond either implies condGuard or !condGuard. In this case all the
/// instructions before the guard can be duplicated in both branches, and the
/// guard is then threaded to one of them.
-bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+bool JumpThreadingPass::processGuards(BasicBlock *BB) {
using namespace PatternMatch;
// We only want to deal with two predecessors.
@@ -2886,7 +2966,7 @@ bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
for (auto &I : *BB)
- if (isGuard(&I) && ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+ if (isGuard(&I) && threadGuard(BB, cast<IntrinsicInst>(&I), BI))
return true;
return false;
@@ -2895,7 +2975,7 @@ bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
/// Try to propagate the guard from BB which is the lower block of a diamond
/// to one of its branches, in case if diamond's condition implies guard's
/// condition.
-bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard,
BranchInst *BI) {
assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
assert(BI->isConditional() && "Unconditional branch has 2 successors?");
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 1a22edaf8726..d2b4ba296f41 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -12,6 +12,13 @@
// safe. This pass also promotes must-aliased memory locations in the loop to
// live in registers, thus hoisting and sinking "invariant" loads and stores.
//
+// Hoisting operations out of loops is a canonicalization transform. It
+// enables and simplifies subsequent optimizations in the middle-end.
+// Rematerialization of hoisted instructions to reduce register pressure is the
+// responsibility of the back-end, which has more accurate information about
+// register pressure and also handles optimizations other than LICM that
+// increase live ranges.
+//
// This pass uses alias analysis for two purposes:
//
// 1. Moving loop invariant loads and calls out of loops. If we can determine
@@ -35,10 +42,12 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
@@ -98,6 +107,11 @@ static cl::opt<bool> ControlFlowHoisting(
"licm-control-flow-hoisting", cl::Hidden, cl::init(false),
cl::desc("Enable control flow (and PHI) hoisting in LICM"));
+static cl::opt<unsigned> HoistSinkColdnessThreshold(
+ "licm-coldness-threshold", cl::Hidden, cl::init(4),
+    cl::desc("Relative coldness threshold of the hoisting/sinking destination "
+ "block for LICM to be considered beneficial"));
+
static cl::opt<uint32_t> MaxNumUsesTraversed(
"licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
cl::desc("Max num uses visited for identifying load "
@@ -143,8 +157,9 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
OptimizationRemarkEmitter *ORE);
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
- const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE);
+ BlockFrequencyInfo *BFI, const Loop *CurLoop,
+ ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
+ OptimizationRemarkEmitter *ORE);
static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
const Loop *CurLoop,
@@ -155,8 +170,10 @@ static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
AliasSetTracker *CurAST, Loop *CurLoop,
AAResults *AA);
static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
- Loop *CurLoop,
+ Loop *CurLoop, Instruction &I,
SinkAndHoistLICMFlags &Flags);
+static bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
+ MemoryUse &MU);
static Instruction *cloneInstructionInExitBlock(
Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
@@ -171,8 +188,8 @@ static void moveInstructionBefore(Instruction &I, Instruction &Dest,
namespace {
struct LoopInvariantCodeMotion {
bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
- TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
- ScalarEvolution *SE, MemorySSA *MSSA,
+ BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI,
+ TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA,
OptimizationRemarkEmitter *ORE);
LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
@@ -204,23 +221,30 @@ struct LegacyLICMPass : public LoopPass {
if (skipLoop(L))
return false;
+ LLVM_DEBUG(dbgs() << "Perform LICM on Loop with header at block "
+ << L->getHeader()->getNameOrAsOperand() << "\n");
+
auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
MemorySSA *MSSA = EnableMSSALoopDependency
? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
: nullptr;
+ bool hasProfileData = L->getHeader()->getParent()->hasProfileData();
+ BlockFrequencyInfo *BFI =
+ hasProfileData ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI()
+ : nullptr;
// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
+ // pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
- return LICM.runOnLoop(L,
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
- &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent()),
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent()),
- SE ? &SE->getSE() : nullptr, MSSA, &ORE);
+ return LICM.runOnLoop(
+ L, &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), BFI,
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent()),
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent()),
+ SE ? &SE->getSE() : nullptr, MSSA, &ORE);
}
/// This transformation requires natural loop information & requires that
@@ -236,6 +260,9 @@ struct LegacyLICMPass : public LoopPass {
}
AU.addRequired<TargetTransformInfoWrapperPass>();
getLoopAnalysisUsage(AU);
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ AU.addPreserved<LazyBlockFrequencyInfoPass>();
+ AU.addPreserved<LazyBranchProbabilityInfoPass>();
}
private:
@@ -251,8 +278,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
- if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE,
- AR.MSSA, &ORE))
+ if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI,
+ &AR.SE, AR.MSSA, &ORE))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
@@ -272,6 +299,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBFIPass)
INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
false)
@@ -281,13 +309,42 @@ Pass *llvm::createLICMPass(unsigned LicmMssaOptCap,
return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
}
+llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(bool IsSink, Loop *L,
+ MemorySSA *MSSA)
+ : SinkAndHoistLICMFlags(SetLicmMssaOptCap, SetLicmMssaNoAccForPromotionCap,
+ IsSink, L, MSSA) {}
+
+llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(
+ unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap, bool IsSink,
+ Loop *L, MemorySSA *MSSA)
+ : LicmMssaOptCap(LicmMssaOptCap),
+ LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap),
+ IsSink(IsSink) {
+ assert(((L != nullptr) == (MSSA != nullptr)) &&
+ "Unexpected values for SinkAndHoistLICMFlags");
+ if (!MSSA)
+ return;
+
+ unsigned AccessCapCount = 0;
+ for (auto *BB : L->getBlocks())
+ if (const auto *Accesses = MSSA->getBlockAccesses(BB))
+ for (const auto &MA : *Accesses) {
+ (void)MA;
+ ++AccessCapCount;
+ if (AccessCapCount > LicmMssaNoAccForPromotionCap) {
+ NoOfMemAccTooLarge = true;
+ return;
+ }
+ }
+}
+
/// Hoist expressions out of the specified loop. Note, alias info for inner
/// loop is not preserved so it is not a good idea to run LICM multiple
/// times on one loop.
bool LoopInvariantCodeMotion::runOnLoop(
Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
- TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
- MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) {
+ BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) {
bool Changed = false;
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -300,31 +357,18 @@ bool LoopInvariantCodeMotion::runOnLoop(
std::unique_ptr<AliasSetTracker> CurAST;
std::unique_ptr<MemorySSAUpdater> MSSAU;
- bool NoOfMemAccTooLarge = false;
- unsigned LicmMssaOptCounter = 0;
+ std::unique_ptr<SinkAndHoistLICMFlags> Flags;
if (!MSSA) {
LLVM_DEBUG(dbgs() << "LICM: Using Alias Set Tracker.\n");
CurAST = collectAliasInfoForLoop(L, LI, AA);
+ Flags = std::make_unique<SinkAndHoistLICMFlags>(
+ LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true);
} else {
LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n");
MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
-
- unsigned AccessCapCount = 0;
- for (auto *BB : L->getBlocks()) {
- if (auto *Accesses = MSSA->getBlockAccesses(BB)) {
- for (const auto &MA : *Accesses) {
- (void)MA;
- AccessCapCount++;
- if (AccessCapCount > LicmMssaNoAccForPromotionCap) {
- NoOfMemAccTooLarge = true;
- break;
- }
- }
- }
- if (NoOfMemAccTooLarge)
- break;
- }
+ Flags = std::make_unique<SinkAndHoistLICMFlags>(
+ LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true, L, MSSA);
}
// Get the preheader block to move instructions into...
@@ -343,17 +387,15 @@ bool LoopInvariantCodeMotion::runOnLoop(
// that we are guaranteed to see definitions before we see uses. This allows
// us to sink instructions in one pass, without iteration. After sinking
// instructions, we perform another pass to hoist them out of the loop.
- SinkAndHoistLICMFlags Flags = {NoOfMemAccTooLarge, LicmMssaOptCounter,
- LicmMssaOptCap, LicmMssaNoAccForPromotionCap,
- /*IsSink=*/true};
if (L->hasDedicatedExits())
- Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
- CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE);
- Flags.IsSink = false;
- if (Preheader)
Changed |=
- hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
- CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE);
+ sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L,
+ CurAST.get(), MSSAU.get(), &SafetyInfo, *Flags.get(), ORE);
+ Flags->setIsSink(false);
+ if (Preheader)
+ Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L,
+ CurAST.get(), MSSAU.get(), SE, &SafetyInfo,
+ *Flags.get(), ORE);
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can.
@@ -363,7 +405,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
// preheader for SSA updater, so also avoid sinking when no preheader
// is available.
if (!DisablePromotion && Preheader && L->hasDedicatedExits() &&
- !NoOfMemAccTooLarge) {
+ !Flags->tooManyMemoryAccesses()) {
// Figure out the loop exits and their insertion points
SmallVector<BasicBlock *, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
@@ -432,7 +474,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
// specifically moving instructions across the loop boundary and so it is
// especially in need of sanity checking here.
assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
- assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
+ assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) &&
"Parent loop not left in LCSSA form after LICM!");
if (MSSAU.get() && VerifyMemorySSA)
@@ -449,10 +491,10 @@ bool LoopInvariantCodeMotion::runOnLoop(
/// definitions, allowing us to sink a loop body in one pass without iteration.
///
bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
- DominatorTree *DT, TargetLibraryInfo *TLI,
- TargetTransformInfo *TTI, Loop *CurLoop,
- AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ICFLoopSafetyInfo *SafetyInfo,
+ DominatorTree *DT, BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo,
SinkAndHoistLICMFlags &Flags,
OptimizationRemarkEmitter *ORE) {
@@ -501,7 +543,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
ORE)) {
- if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) {
+ if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) {
if (!FreeInLoop) {
++II;
salvageDebugInfo(I);
@@ -585,7 +627,7 @@ public:
else if (!TrueDestSucc.empty()) {
Function *F = TrueDest->getParent();
auto IsSucc = [&](BasicBlock &BB) { return TrueDestSucc.count(&BB); };
- auto It = std::find_if(F->begin(), F->end(), IsSucc);
+ auto It = llvm::find_if(*F, IsSucc);
assert(It != F->end() && "Could not find successor in function");
CommonSucc = &*It;
}
@@ -653,15 +695,15 @@ public:
return BB != Pair.second && (Pair.first->getSuccessor(0) == BB ||
Pair.first->getSuccessor(1) == BB);
};
- auto It = std::find_if(HoistableBranches.begin(), HoistableBranches.end(),
- HasBBAsSuccessor);
+ auto It = llvm::find_if(HoistableBranches, HasBBAsSuccessor);
// If not involved in a pending branch, hoist to preheader
BasicBlock *InitialPreheader = CurLoop->getLoopPreheader();
if (It == HoistableBranches.end()) {
- LLVM_DEBUG(dbgs() << "LICM using " << InitialPreheader->getName()
- << " as hoist destination for " << BB->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "LICM using "
+ << InitialPreheader->getNameOrAsOperand()
+ << " as hoist destination for "
+ << BB->getNameOrAsOperand() << "\n");
HoistDestinationMap[BB] = InitialPreheader;
return InitialPreheader;
}
@@ -746,13 +788,43 @@ public:
};
} // namespace
+// Hoisting/sinking an instruction out of a loop isn't always beneficial. It's
+// only worthwhile if the destination block is actually colder than the current
+// block.
+static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock,
+ OptimizationRemarkEmitter *ORE,
+ BlockFrequencyInfo *BFI) {
+ // Check block frequency only when runtime profile is available
+ // to avoid pathological cases. With static profile, lean towards
+  // hoisting because it helps canonicalize the loop for the vectorizer.
+ if (!DstBlock->getParent()->hasProfileData())
+ return true;
+
+ if (!HoistSinkColdnessThreshold || !BFI)
+ return true;
+
+ BasicBlock *SrcBlock = I.getParent();
+ if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold >
+ BFI->getBlockFreq(SrcBlock).getFrequency()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I)
+ << "failed to sink or hoist instruction because containing block "
+ "has lower frequency than destination block";
+ });
+ return false;
+ }
+
+ return true;
+}
+
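For concreteness, with the default licm-coldness-threshold of 4 the test above behaves roughly like the standalone sketch below; the frequencies are invented for illustration and the helper name is not part of LICM.

#include <cassert>
#include <cstdint>

// Mirrors the shape of the check in worthSinkOrHoistInst: refuse to move an
// instruction into a destination block that is more than Threshold times
// hotter than the block it currently lives in.
static bool worthMoving(uint64_t SrcFreq, uint64_t DstFreq,
                        unsigned Threshold = 4) {
  if (Threshold == 0)
    return true; // a zero threshold disables the frequency check
  return DstFreq / Threshold <= SrcFreq;
}

int main() {
  // Destination 100 vs. source 10: 100 / 4 = 25 > 10, so not worth moving.
  assert(!worthMoving(/*SrcFreq=*/10, /*DstFreq=*/100));
  // Destination 30 vs. source 10: 30 / 4 = 7 <= 10, so moving is fine.
  assert(worthMoving(/*SrcFreq=*/10, /*DstFreq=*/30));
  return 0;
}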
/// Walk the specified region of the CFG (defined by all blocks dominated by
/// the specified block, and that are in the current loop) in depth first
/// order w.r.t the DominatorTree. This allows us to visit definitions before
/// uses, allowing us to hoist a loop body in one pass without iteration.
///
bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
- DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ DominatorTree *DT, BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI, Loop *CurLoop,
AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
SinkAndHoistLICMFlags &Flags,
@@ -803,13 +875,15 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
// Try hoisting the instruction out to the preheader. We can only do
// this if all of the operands of the instruction are loop invariant and
- // if it is safe to hoist the instruction.
+ // if it is safe to hoist the instruction. We also check block frequency
+    // to make sure the instruction only gets hoisted into colder blocks.
// TODO: It may be safe to hoist if we are hoisting to a conditional block
// and we have accurately duplicated the control flow from the loop header
// to that block.
if (CurLoop->hasLoopInvariantOperands(&I) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
ORE) &&
+ worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) &&
isSafeToExecuteUnconditionally(
I, DT, CurLoop, SafetyInfo, ORE,
CurLoop->getLoopPreheader()->getTerminator())) {
@@ -908,7 +982,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
HoistPoint = Dominator->getTerminator();
}
LLVM_DEBUG(dbgs() << "LICM rehoisting to "
- << HoistPoint->getParent()->getName()
+ << HoistPoint->getParent()->getNameOrAsOperand()
<< ": " << *I << "\n");
moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE);
HoistPoint = I;
@@ -940,7 +1014,19 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
Loop *CurLoop) {
Value *Addr = LI->getOperand(0);
const DataLayout &DL = LI->getModule()->getDataLayout();
- const uint32_t LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+ const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+
+ // It is not currently possible for clang to generate an invariant.start
+ // intrinsic with scalable vector types because we don't support thread local
+ // sizeless types and we don't permit sizeless types in structs or classes.
+ // Furthermore, even if support is added for this in future the intrinsic
+ // itself is defined to have a size of -1 for variable sized objects. This
+ // makes it impossible to verify if the intrinsic envelops our region of
+ // interest. For example, both <vscale x 32 x i8> and <vscale x 16 x i8>
+ // types would have a -1 parameter, but the former is clearly double the size
+ // of the latter.
+ if (LocSizeInBits.isScalable())
+ return false;
// if the type is i8 addrspace(x)*, we know this is the type of
// llvm.invariant.start operand
@@ -970,13 +1056,17 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
!II->use_empty())
continue;
- unsigned InvariantSizeInBits =
- cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+ ConstantInt *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
+ // The intrinsic supports having a -1 argument for variable sized objects
+ // so we should check for that here.
+ if (InvariantSize->isNegative())
+ continue;
+ uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8;
// Confirm the invariant.start location size contains the load operand size
// in bits. Also, the invariant.start should dominate the load, and we
// should not hoist the load out of a loop that contains this dominating
// invariant.start.
- if (LocSizeInBits <= InvariantSizeInBits &&
+ if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits &&
DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
return true;
}
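The containment test above reduces to a small size comparison; here is a toy model with made-up sizes (the helper below is purely illustrative, not LLVM API).

#include <cassert>
#include <cstdint>

// An invariant.start covering InvariantSizeInBytes bytes envelops a load of
// LoadSizeInBits only when the extent is known (not the -1 "variable sized
// object" marker) and at least as large as the loaded value.
static bool envelopsLoad(int64_t InvariantSizeInBytes,
                         uint64_t LoadSizeInBits) {
  if (InvariantSizeInBytes < 0) // -1: variable sized, cannot be verified
    return false;
  return LoadSizeInBits <= (uint64_t)InvariantSizeInBytes * 8;
}

int main() {
  assert(envelopsLoad(16, 32));  // 128 invariant bits cover an i32 load
  assert(!envelopsLoad(2, 32));  // 16 bits do not cover an i32 load
  assert(!envelopsLoad(-1, 32)); // unknown extent: stay conservative
  return 0;
}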
@@ -1041,6 +1131,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
bool TargetExecutesOncePerLoop,
SinkAndHoistLICMFlags *Flags,
OptimizationRemarkEmitter *ORE) {
+ assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
+ "Either AliasSetTracker or MemorySSA should be initialized.");
+
// If we don't understand the instruction, bail early.
if (!isHoistableAndSinkableInst(I))
return false;
@@ -1074,7 +1167,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
CurLoop, AA);
else
Invalidated = pointerInvalidatedByLoopWithMSSA(
- MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)), CurLoop, *Flags);
+ MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)), CurLoop, I, *Flags);
// Check loop-invariant address because this may also be a sinkable load
// whose address is not necessarily loop-invariant.
if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
@@ -1095,6 +1188,13 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (CI->mayThrow())
return false;
+    // The convergent attribute is used on operations that involve inter-thread
+    // communication whose results are implicitly affected by the enclosing
+    // control flow. It is not safe to hoist or sink such operations
+ // across control flow.
+ if (CI->isConvergent())
+ return false;
+
using namespace PatternMatch;
if (match(CI, m_Intrinsic<Intrinsic::assume>()))
// Assumes don't actually alias anything or throw
@@ -1119,11 +1219,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
bool Invalidated;
if (CurAST)
Invalidated = pointerInvalidatedByLoop(
- MemoryLocation(Op, LocationSize::unknown(), AAMDNodes()),
- CurAST, CurLoop, AA);
+ MemoryLocation::getBeforeOrAfter(Op), CurAST, CurLoop, AA);
else
Invalidated = pointerInvalidatedByLoopWithMSSA(
- MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop,
+ MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I,
*Flags);
if (Invalidated)
return false;
@@ -1183,12 +1282,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
} else { // MSSAU
if (isOnlyMemoryAccess(SI, CurLoop, MSSAU))
return true;
- // If there are more accesses than the Promotion cap, give up, we're not
- // walking a list that long.
- if (Flags->NoOfMemAccTooLarge)
- return false;
- // Check store only if there's still "quota" to check clobber.
- if (Flags->LicmMssaOptCounter >= Flags->LicmMssaOptCap)
+ // If there are more accesses than the Promotion cap or no "quota" to
+ // check clobber, then give up as we're not walking a list that long.
+ if (Flags->tooManyMemoryAccesses() || Flags->tooManyClobberingCalls())
return false;
// If there are interfering Uses (i.e. their defining access is in the
// loop), or ordered loads (stored as Defs!), don't move this store.
@@ -1208,7 +1304,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
// Uses may point to an access outside the loop, as getClobbering
// checks the previous iteration when walking the backedge.
// FIXME: More precise: no Uses that alias SI.
- if (!Flags->IsSink && !MSSA->dominates(SIMD, MU))
+ if (!Flags->getIsSink() && !MSSA->dominates(SIMD, MU))
return false;
} else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
@@ -1227,9 +1323,8 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
}
}
}
-
auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI);
- Flags->LicmMssaOptCounter++;
+ Flags->incrementClobberingCalls();
// If there are no clobbering Defs in the loop, store is safe to hoist.
return MSSA->isLiveOnEntryDef(Source) ||
!CurLoop->contains(Source->getBlock());
@@ -1529,8 +1624,9 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
/// position, and may either delete it or move it to outside of the loop.
///
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
- const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) {
+ BlockFrequencyInfo *BFI, const Loop *CurLoop,
+ ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
+ OptimizationRemarkEmitter *ORE) {
LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
@@ -1606,7 +1702,10 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
// If this instruction is only used outside of the loop, then all users are
// PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
// the instruction.
+  // First check whether I is worth sinking for each of its uses; only sink
+  // when it is worthwhile across all uses.
SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+ SmallVector<PHINode *, 8> ExitPNs;
for (auto *UI : Users) {
auto *User = cast<Instruction>(UI);
@@ -1616,6 +1715,15 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
PHINode *PN = cast<PHINode>(User);
assert(ExitBlockSet.count(PN->getParent()) &&
"The LCSSA PHI is not in an exit block!");
+ if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) {
+ return Changed;
+ }
+
+ ExitPNs.push_back(PN);
+ }
+
+ for (auto *PN : ExitPNs) {
+
// The PHI must be trivially replaceable.
Instruction *New = sinkThroughTriviallyReplaceablePHI(
PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU);
@@ -1633,8 +1741,8 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
OptimizationRemarkEmitter *ORE) {
- LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getName() << ": " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getNameOrAsOperand() << ": "
+ << I << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
<< ore::NV("Inst", &I);
@@ -1658,10 +1766,7 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
// Move the new node to the destination block, before its terminator.
moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE);
- // Apply line 0 debug locations when we are moving instructions to different
- // basic blocks because we want to avoid jumpy line tables.
- if (const DebugLoc &DL = I.getDebugLoc())
- I.setDebugLoc(DebugLoc::get(0, 0, DL.getScope(), DL.getInlinedAt()));
+ I.updateLocationAfterHoist();
if (isa<LoadInst>(I))
++NumMovedLoads;
@@ -1707,7 +1812,7 @@ class LoopPromoter : public LoadAndStorePromoter {
SmallVectorImpl<Instruction *> &LoopInsertPts;
SmallVectorImpl<MemoryAccess *> &MSSAInsertPts;
PredIteratorCache &PredCache;
- AliasSetTracker &AST;
+ AliasSetTracker *AST;
MemorySSAUpdater *MSSAU;
LoopInfo &LI;
DebugLoc DL;
@@ -1737,7 +1842,7 @@ public:
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP,
SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
- AliasSetTracker &ast, MemorySSAUpdater *MSSAU, LoopInfo &li,
+ AliasSetTracker *ast, MemorySSAUpdater *MSSAU, LoopInfo &li,
DebugLoc dl, int alignment, bool UnorderedAtomic,
const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
@@ -1794,11 +1899,13 @@ public:
void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
// Update alias analysis.
- AST.copyValue(LI, V);
+ if (AST)
+ AST->copyValue(LI, V);
}
void instructionDeleted(Instruction *I) const override {
SafetyInfo.removeInstruction(I);
- AST.deleteValue(I);
+ if (AST)
+ AST->deleteValue(I);
if (MSSAU)
MSSAU->removeMemoryAccess(I);
}
@@ -1844,7 +1951,7 @@ bool llvm::promoteLoopAccessesToScalars(
ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
// Verify inputs.
assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
- CurAST != nullptr && SafetyInfo != nullptr &&
+ SafetyInfo != nullptr &&
"Unexpected Input to promoteLoopAccessesToScalars");
Value *SomePtr = *PointerMustAliases.begin();
@@ -1909,7 +2016,7 @@ bool llvm::promoteLoopAccessesToScalars(
// we have to prove that the store is dead along the unwind edge. We do
// this by proving that the caller can't have a reference to the object
// after return and thus can't possibly load from the object.
- Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ Value *Object = getUnderlyingObject(SomePtr);
if (!isKnownNonEscaping(Object, TLI))
return false;
// Subtlety: Alloca's aren't visible to callers, but *are* potentially
@@ -2041,7 +2148,7 @@ bool llvm::promoteLoopAccessesToScalars(
if (IsKnownThreadLocalObject)
SafeToInsertStore = true;
else {
- Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ Value *Object = getUnderlyingObject(SomePtr);
SafeToInsertStore =
(isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
!PointerMayBeCaptured(Object, true, true);
@@ -2072,7 +2179,7 @@ bool llvm::promoteLoopAccessesToScalars(
SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, MSSAInsertPts, PIC, *CurAST, MSSAU, *LI, DL,
+ InsertPts, MSSAInsertPts, PIC, CurAST, MSSAU, *LI, DL,
Alignment.value(), SawUnorderedAtomic, AATags,
*SafetyInfo);
@@ -2187,18 +2294,18 @@ static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
return false;
}
-static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
- Loop *CurLoop,
- SinkAndHoistLICMFlags &Flags) {
+bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
+ Loop *CurLoop, Instruction &I,
+ SinkAndHoistLICMFlags &Flags) {
// For hoisting, use the walker to determine safety
- if (!Flags.IsSink) {
+ if (!Flags.getIsSink()) {
MemoryAccess *Source;
// See declaration of SetLicmMssaOptCap for usage details.
- if (Flags.LicmMssaOptCounter >= Flags.LicmMssaOptCap)
+ if (Flags.tooManyClobberingCalls())
Source = MU->getDefiningAccess();
else {
Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(MU);
- Flags.LicmMssaOptCounter++;
+ Flags.incrementClobberingCalls();
}
return !MSSA->isLiveOnEntryDef(Source) &&
CurLoop->contains(Source->getBlock());
@@ -2221,15 +2328,25 @@ static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
// FIXME: Increase precision: Safe to sink if Use post dominates the Def;
// needs PostDominatorTreeAnalysis.
// FIXME: More precise: no Defs that alias this Use.
- if (Flags.NoOfMemAccTooLarge)
+ if (Flags.tooManyMemoryAccesses())
return true;
for (auto *BB : CurLoop->getBlocks())
- if (auto *Accesses = MSSA->getBlockDefs(BB))
- for (const auto &MA : *Accesses)
- if (const auto *MD = dyn_cast<MemoryDef>(&MA))
- if (MU->getBlock() != MD->getBlock() ||
- !MSSA->locallyDominates(MD, MU))
- return true;
+ if (pointerInvalidatedByBlockWithMSSA(*BB, *MSSA, *MU))
+ return true;
+ // When sinking, the source block may not be part of the loop so check it.
+ if (!CurLoop->contains(&I))
+ return pointerInvalidatedByBlockWithMSSA(*I.getParent(), *MSSA, *MU);
+
+ return false;
+}
+
+bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
+ MemoryUse &MU) {
+ if (const auto *Accesses = MSSA.getBlockDefs(&BB))
+ for (const auto &MA : *Accesses)
+ if (const auto *MD = dyn_cast<MemoryDef>(&MA))
+ if (MU.getBlock() != MD->getBlock() || !MSSA.locallyDominates(MD, &MU))
+ return true;
return false;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 687e14d6d7d2..45cdcb2f37dd 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -271,7 +271,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
bool MadeChange = false;
// Only prefetch in the inner-most loop
- if (!L->empty())
+ if (!L->isInnermost())
return MadeChange;
SmallPtrSet<const Value *, 32> EphValues;
diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index be209d34be42..1266c93316fa 100644
--- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -26,6 +26,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+
using namespace llvm;
#define DEBUG_TYPE "loop-delete"
@@ -38,6 +39,14 @@ enum class LoopDeletionResult {
Deleted,
};
+static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B) {
+ if (A == LoopDeletionResult::Deleted || B == LoopDeletionResult::Deleted)
+ return LoopDeletionResult::Deleted;
+ if (A == LoopDeletionResult::Modified || B == LoopDeletionResult::Modified)
+ return LoopDeletionResult::Modified;
+ return LoopDeletionResult::Unmodified;
+}
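The helper above simply takes the stronger of the two results, treating the enum as the ordering Unmodified < Modified < Deleted; a standalone restatement with a couple of checks (the names mirror the enum, but the snippet is illustrative only):

#include <cassert>

enum class Result { Unmodified, Modified, Deleted };

// Same rule as merge() above: Deleted wins over Modified, which wins over
// Unmodified, so combining two analyses never loses information.
static Result mergeResults(Result A, Result B) {
  if (A == Result::Deleted || B == Result::Deleted)
    return Result::Deleted;
  if (A == Result::Modified || B == Result::Modified)
    return Result::Modified;
  return Result::Unmodified;
}

int main() {
  assert(mergeResults(Result::Unmodified, Result::Modified) == Result::Modified);
  assert(mergeResults(Result::Modified, Result::Deleted) == Result::Deleted);
  return 0;
}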
+
/// Determines if a loop is dead.
///
/// This assumes that we've already checked for unique exit and exiting blocks,
@@ -53,26 +62,28 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
// of the loop.
bool AllEntriesInvariant = true;
bool AllOutgoingValuesSame = true;
- for (PHINode &P : ExitBlock->phis()) {
- Value *incoming = P.getIncomingValueForBlock(ExitingBlocks[0]);
-
- // Make sure all exiting blocks produce the same incoming value for the exit
- // block. If there are different incoming values for different exiting
- // blocks, then it is impossible to statically determine which value should
- // be used.
- AllOutgoingValuesSame =
- all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
- return incoming == P.getIncomingValueForBlock(BB);
- });
-
- if (!AllOutgoingValuesSame)
- break;
-
- if (Instruction *I = dyn_cast<Instruction>(incoming))
- if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) {
- AllEntriesInvariant = false;
+ if (!L->hasNoExitBlocks()) {
+ for (PHINode &P : ExitBlock->phis()) {
+ Value *incoming = P.getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // Make sure all exiting blocks produce the same incoming value for the
+      // exit block. If there are different incoming values for different
+ // blocks, then it is impossible to statically determine which value
+ // should be used.
+ AllOutgoingValuesSame =
+ all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
+ return incoming == P.getIncomingValueForBlock(BB);
+ });
+
+ if (!AllOutgoingValuesSame)
break;
- }
+
+ if (Instruction *I = dyn_cast<Instruction>(incoming))
+ if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) {
+ AllEntriesInvariant = false;
+ break;
+ }
+ }
}
if (Changed)
@@ -85,7 +96,9 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
// This includes instructions that could write to memory, and loads that are
// marked volatile.
for (auto &I : L->blocks())
- if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ if (any_of(*I, [](Instruction &I) {
+ return I.mayHaveSideEffects() && !I.isDroppable();
+ }))
return false;
return true;
}
@@ -122,12 +135,33 @@ static bool isLoopNeverExecuted(Loop *L) {
return true;
}
+/// If we can prove the backedge is untaken, remove it. This destroys the
+/// loop, but leaves the (now trivially loop invariant) control flow and
+/// side effects (if any) in place.
+static LoopDeletionResult
+breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, MemorySSA *MSSA,
+ OptimizationRemarkEmitter &ORE) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+
+ if (!L->getLoopLatch())
+ return LoopDeletionResult::Unmodified;
+
+ auto *BTC = SE.getBackedgeTakenCount(L);
+ if (!BTC->isZero())
+ return LoopDeletionResult::Unmodified;
+
+ breakLoopBackedge(L, DT, SE, LI, MSSA);
+ return LoopDeletionResult::Deleted;
+}
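A tiny source-level picture of what this targets (the functions below are illustrative only): when scalar evolution proves the backedge-taken count is zero, the body still runs once, but the branch back to the header never fires, so the backedge can be removed and only straight-line control flow remains.

extern void body(int);

// Backedge-taken count is provably zero: the body executes exactly once.
void runsOnce() {
  for (int i = 0; i < 1; ++i)
    body(i);
}

// Breaking the untaken backedge preserves the side effect and the exit
// control flow; only the loop structure disappears.
void runsOnceBroken() {
  int i = 0;
  body(i);
}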
+
/// Remove a loop if it is dead.
///
-/// A loop is considered dead if it does not impact the observable behavior of
-/// the program other than finite running time. This never removes a loop that
-/// might be infinite (unless it is never executed), as doing so could change
-/// the halting/non-halting nature of a program.
+/// A loop is considered dead either if it does not impact the observable
+/// behavior of the program other than finite running time, or if it is
+/// required to make progress by an attribute such as 'mustprogress' or
+/// 'llvm.loop.mustprogress' and does not make any. This may remove
+/// infinite loops that have been required to make progress.
///
/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
/// order to make various safety checks work.
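As a source-level example of the relaxed rule (a sketch relying on the C++ forward-progress guarantee, which recent clang versions lower to the 'mustprogress' attribute): the loop below has no observable side effects and no computable trip count, yet it may now be deleted because it was required to make progress.

// Side-effect free and possibly infinite; under 'mustprogress' the pass is
// allowed to remove it even though SCEV cannot bound the trip count.
int mayBeDeleted(unsigned n) {
  unsigned x = n;
  while (x != 13)
    x = x * 3 + 1;
  return 0; // the loop's result is never used
}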
@@ -151,18 +185,15 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
<< "Deletion requires Loop with preheader and dedicated exits.\n");
return LoopDeletionResult::Unmodified;
}
- // We can't remove loops that contain subloops. If the subloops were dead,
- // they would already have been removed in earlier executions of this pass.
- if (L->begin() != L->end()) {
- LLVM_DEBUG(dbgs() << "Loop contains subloops.\n");
- return LoopDeletionResult::Unmodified;
- }
-
BasicBlock *ExitBlock = L->getUniqueExitBlock();
if (ExitBlock && isLoopNeverExecuted(L)) {
LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ // We need to forget the loop before setting the incoming values of the exit
+ // phis to undef, so we properly invalidate the SCEV expressions for those
+ // phis.
+ SE.forgetLoop(L);
// Set incoming value to undef for phi nodes in the exit block.
for (PHINode &P : ExitBlock->phis()) {
std::fill(P.incoming_values().begin(), P.incoming_values().end(),
@@ -183,12 +214,12 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
SmallVector<BasicBlock *, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- // We require that the loop only have a single exit block. Otherwise, we'd
- // be in the situation of needing to be able to solve statically which exit
- // block will be branched to, or trying to preserve the branching logic in
- // a loop invariant manner.
- if (!ExitBlock) {
- LLVM_DEBUG(dbgs() << "Deletion requires single exit block\n");
+ // We require that the loop has at most one exit block. Otherwise, we'd be in
+ // the situation of needing to be able to solve statically which exit block
+ // will be branched to, or trying to preserve the branching logic in a loop
+ // invariant manner.
+ if (!ExitBlock && !L->hasNoExitBlocks()) {
+ LLVM_DEBUG(dbgs() << "Deletion requires at most one exit block.\n");
return LoopDeletionResult::Unmodified;
}
// Finally, we have to check that the loop really is dead.
@@ -199,11 +230,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
: LoopDeletionResult::Unmodified;
}
- // Don't remove loops for which we can't solve the trip count.
- // They could be infinite, in which case we'd be changing program behavior.
+ // Don't remove loops for which we can't solve the trip count unless the loop
+ // was required to make progress but has been determined to be dead.
const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(S)) {
- LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
+ if (isa<SCEVCouldNotCompute>(S) &&
+ !L->getHeader()->getParent()->mustProgress() && !hasMustProgress(L)) {
+ LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount and was "
+ "not required to make progress.\n");
return Changed ? LoopDeletionResult::Modified
: LoopDeletionResult::Unmodified;
}
@@ -232,6 +265,14 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE);
+
+ // If we can prove the backedge isn't taken, just break it and be done. This
+ // leaves the loop structure in place which means it can handle dispatching
+ // to the right exit based on whatever loop invariant structure remains.
+ if (Result != LoopDeletionResult::Deleted)
+ Result = merge(Result, breakBackedgeIfNotTaken(&L, AR.DT, AR.SE, AR.LI,
+ AR.MSSA, ORE));
+
if (Result == LoopDeletionResult::Unmodified)
return PreservedAnalyses::all();
@@ -291,6 +332,12 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI, MSSA, ORE);
+ // If we can prove the backedge isn't taken, just break it and be done. This
+ // leaves the loop structure in place which means it can handle dispatching
+ // to the right exit based on whatever loop invariant structure remains.
+ if (Result != LoopDeletionResult::Deleted)
+ Result = merge(Result, breakBackedgeIfNotTaken(L, DT, SE, LI, MSSA, ORE));
+
if (Result == LoopDeletionResult::Deleted)
LPM.markLoopAsDeleted(*L);
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 7867a5468891..1bd2529891b7 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -33,7 +33,6 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
@@ -664,21 +663,23 @@ public:
/// Try to distribute an inner-most loop.
bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
- assert(L->empty() && "Only process inner loops.");
+ assert(L->isInnermost() && "Only process inner loops.");
LLVM_DEBUG(dbgs() << "\nLDist: In \""
<< L->getHeader()->getParent()->getName()
<< "\" checking " << *L << "\n");
+ // Having a single exit block implies there's also one exiting block.
if (!L->getExitBlock())
return fail("MultipleExitBlocks", "multiple exit blocks");
if (!L->isLoopSimplifyForm())
return fail("NotLoopSimplifyForm",
"loop is not in loop-simplify form");
+ if (!L->isRotatedForm())
+ return fail("NotBottomTested", "loop is not bottom tested");
BasicBlock *PH = L->getLoopPreheader();
- // LAA will check that we only have a single exiting block.
LAI = &GetLAA(*L);
// Currently, we only distribute to isolate the part of the loop with
@@ -814,9 +815,7 @@ public:
LLVM_DEBUG(dbgs() << "\nPointers:\n");
LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
- LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
- LVer.setAliasChecks(std::move(Checks));
- LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
+ LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
LVer.versionLoop(DefsUsedOutside);
LVer.annotateLoopWithNoAlias();
@@ -982,7 +981,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
for (Loop *TopLevelLoop : *LI)
for (Loop *L : depth_first(TopLevelLoop))
// We only handle inner-most loops.
- if (L->empty())
+ if (L->isInnermost())
Worklist.push_back(L);
// Now walk the identified inner loops.
@@ -1058,7 +1057,8 @@ PreservedAnalyses LoopDistributePass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
new file mode 100644
index 000000000000..aaff68436c13
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -0,0 +1,728 @@
+//===- LoopFlatten.cpp - Loop flattening pass------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass flattens pairs nested loops into a single loop.
+//
+// The intention is to optimise loop nests like this, which together access an
+// array linearly:
+// for (int i = 0; i < N; ++i)
+// for (int j = 0; j < M; ++j)
+// f(A[i*M+j]);
+// into one loop:
+// for (int i = 0; i < (N*M); ++i)
+// f(A[i]);
+//
+// It can also flatten loops where the induction variables are not used in the
+// loop. This is only worth doing if the induction variables are only used in an
+// expression like i*M+j. If they had any other uses, we would have to insert a
+// div/mod to reconstruct the original values, so this wouldn't be profitable.
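// For illustration only, a minimal sketch (hypothetical code, not taken from
// any test) of a nest that is rejected for exactly this reason -- the inner
// induction variable has a second use, and recovering it in the flattened
// loop would need a div/mod:
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       f(A[i*M+j], j); // the extra use of 'j' blocks flattening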
+//
+// We also need to prove that N*M will not overflow.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopFlatten.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+
+#define DEBUG_TYPE "loop-flatten"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<unsigned> RepeatedInstructionThreshold(
+ "loop-flatten-cost-threshold", cl::Hidden, cl::init(2),
+ cl::desc("Limit on the cost of instructions that can be repeated due to "
+ "loop flattening"));
+
+static cl::opt<bool>
+ AssumeNoOverflow("loop-flatten-assume-no-overflow", cl::Hidden,
+ cl::init(false),
+ cl::desc("Assume that the product of the two iteration "
+ "limits will never overflow"));
+
+static cl::opt<bool>
+ WidenIV("loop-flatten-widen-iv", cl::Hidden,
+ cl::init(true),
+ cl::desc("Widen the loop induction variables, if possible, so "
+ "overflow checks won't reject flattening"));
+
+struct FlattenInfo {
+ Loop *OuterLoop = nullptr;
+ Loop *InnerLoop = nullptr;
+ PHINode *InnerInductionPHI = nullptr;
+ PHINode *OuterInductionPHI = nullptr;
+ Value *InnerLimit = nullptr;
+ Value *OuterLimit = nullptr;
+ BinaryOperator *InnerIncrement = nullptr;
+ BinaryOperator *OuterIncrement = nullptr;
+ BranchInst *InnerBranch = nullptr;
+ BranchInst *OuterBranch = nullptr;
+ SmallPtrSet<Value *, 4> LinearIVUses;
+ SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
+
+ // Whether this holds the flatten info before or after widening.
+ bool Widened = false;
+
+ FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
+};
+
+// Finds the induction variable, increment and limit for a simple loop that we
+// can flatten.
+static bool findLoopComponents(
+ Loop *L, SmallPtrSetImpl<Instruction *> &IterationInstructions,
+ PHINode *&InductionPHI, Value *&Limit, BinaryOperator *&Increment,
+ BranchInst *&BackBranch, ScalarEvolution *SE) {
+ LLVM_DEBUG(dbgs() << "Finding components of loop: " << L->getName() << "\n");
+
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Loop is not in normal form\n");
+ return false;
+ }
+
+  // There must be exactly one exiting block, and it must be the same as the
+  // latch.
+ BasicBlock *Latch = L->getLoopLatch();
+ if (L->getExitingBlock() != Latch) {
+ LLVM_DEBUG(dbgs() << "Exiting and latch block are different\n");
+ return false;
+ }
+ // Latch block must end in a conditional branch.
+ BackBranch = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!BackBranch || !BackBranch->isConditional()) {
+ LLVM_DEBUG(dbgs() << "Could not find back-branch\n");
+ return false;
+ }
+ IterationInstructions.insert(BackBranch);
+ LLVM_DEBUG(dbgs() << "Found back branch: "; BackBranch->dump());
+ bool ContinueOnTrue = L->contains(BackBranch->getSuccessor(0));
+
+ // Find the induction PHI. If there is no induction PHI, we can't do the
+ // transformation. TODO: could other variables trigger this? Do we have to
+ // search for the best one?
+ InductionPHI = nullptr;
+ for (PHINode &PHI : L->getHeader()->phis()) {
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) {
+ InductionPHI = &PHI;
+ LLVM_DEBUG(dbgs() << "Found induction PHI: "; InductionPHI->dump());
+ break;
+ }
+ }
+ if (!InductionPHI) {
+ LLVM_DEBUG(dbgs() << "Could not find induction PHI\n");
+ return false;
+ }
+
+ auto IsValidPredicate = [&](ICmpInst::Predicate Pred) {
+ if (ContinueOnTrue)
+ return Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT;
+ else
+ return Pred == CmpInst::ICMP_EQ;
+ };
+
+ // Find Compare and make sure it is valid
+ ICmpInst *Compare = dyn_cast<ICmpInst>(BackBranch->getCondition());
+ if (!Compare || !IsValidPredicate(Compare->getUnsignedPredicate()) ||
+ Compare->hasNUsesOrMore(2)) {
+ LLVM_DEBUG(dbgs() << "Could not find valid comparison\n");
+ return false;
+ }
+ IterationInstructions.insert(Compare);
+ LLVM_DEBUG(dbgs() << "Found comparison: "; Compare->dump());
+
+ // Find increment and limit from the compare
+ Increment = nullptr;
+ if (match(Compare->getOperand(0),
+ m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
+ Increment = dyn_cast<BinaryOperator>(Compare->getOperand(0));
+ Limit = Compare->getOperand(1);
+ } else if (Compare->getUnsignedPredicate() == CmpInst::ICMP_NE &&
+ match(Compare->getOperand(1),
+ m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
+ Increment = dyn_cast<BinaryOperator>(Compare->getOperand(1));
+ Limit = Compare->getOperand(0);
+ }
+ if (!Increment || Increment->hasNUsesOrMore(3)) {
+ LLVM_DEBUG(dbgs() << "Cound not find valid increment\n");
+ return false;
+ }
+ IterationInstructions.insert(Increment);
+ LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump());
+ LLVM_DEBUG(dbgs() << "Found limit: "; Limit->dump());
+
+ assert(InductionPHI->getNumIncomingValues() == 2);
+ assert(InductionPHI->getIncomingValueForBlock(Latch) == Increment &&
+ "PHI value is not increment inst");
+
+ auto *CI = dyn_cast<ConstantInt>(
+ InductionPHI->getIncomingValueForBlock(L->getLoopPreheader()));
+ if (!CI || !CI->isZero()) {
+ LLVM_DEBUG(dbgs() << "PHI value is not zero: "; CI->dump());
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Successfully found all loop components\n");
+ return true;
+}
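// For illustration, a minimal sketch (hypothetical source) of the only loop
// shape findLoopComponents accepts: counting up from zero in steps of one,
// with the latch comparing the incremented value against a loop-invariant
// limit:
//   for (unsigned i = 0; i != Limit; ++i) // or i < Limit, exiting on false
//     body(i);
// A non-zero start value, a step other than one, or extra users of the
// increment or compare all take one of the early-return paths above.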
+
+static bool checkPHIs(struct FlattenInfo &FI,
+ const TargetTransformInfo *TTI) {
+ // All PHIs in the inner and outer headers must either be:
+ // - The induction PHI, which we are going to rewrite as one induction in
+ // the new loop. This is already checked by findLoopComponents.
+ // - An outer header PHI with all incoming values from outside the loop.
+ // LoopSimplify guarantees we have a pre-header, so we don't need to
+ // worry about that here.
+ // - Pairs of PHIs in the inner and outer headers, which implement a
+ // loop-carried dependency that will still be valid in the new loop. To
+ // be valid, this variable must be modified only in the inner loop.
+
+ // The set of PHI nodes in the outer loop header that we know will still be
+ // valid after the transformation. These will not need to be modified (with
+ // the exception of the induction variable), but we do need to check that
+ // there are no unsafe PHI nodes.
+ SmallPtrSet<PHINode *, 4> SafeOuterPHIs;
+ SafeOuterPHIs.insert(FI.OuterInductionPHI);
+
+ // Check that all PHI nodes in the inner loop header match one of the valid
+ // patterns.
+ for (PHINode &InnerPHI : FI.InnerLoop->getHeader()->phis()) {
+ // The induction PHIs break these rules, and that's OK because we treat
+ // them specially when doing the transformation.
+ if (&InnerPHI == FI.InnerInductionPHI)
+ continue;
+
+ // Each inner loop PHI node must have two incoming values/blocks - one
+ // from the pre-header, and one from the latch.
+ assert(InnerPHI.getNumIncomingValues() == 2);
+ Value *PreHeaderValue =
+ InnerPHI.getIncomingValueForBlock(FI.InnerLoop->getLoopPreheader());
+ Value *LatchValue =
+ InnerPHI.getIncomingValueForBlock(FI.InnerLoop->getLoopLatch());
+
+ // The incoming value from the outer loop must be the PHI node in the
+ // outer loop header, with no modifications made in the top of the outer
+ // loop.
+ PHINode *OuterPHI = dyn_cast<PHINode>(PreHeaderValue);
+ if (!OuterPHI || OuterPHI->getParent() != FI.OuterLoop->getHeader()) {
+ LLVM_DEBUG(dbgs() << "value modified in top of outer loop\n");
+ return false;
+ }
+
+ // The other incoming value must come from the inner loop, without any
+ // modifications in the tail end of the outer loop. We are in LCSSA form,
+ // so this will actually be a PHI in the inner loop's exit block, which
+ // only uses values from inside the inner loop.
+ PHINode *LCSSAPHI = dyn_cast<PHINode>(
+ OuterPHI->getIncomingValueForBlock(FI.OuterLoop->getLoopLatch()));
+ if (!LCSSAPHI) {
+ LLVM_DEBUG(dbgs() << "could not find LCSSA PHI\n");
+ return false;
+ }
+
+ // The value used by the LCSSA PHI must be the same one that the inner
+ // loop's PHI uses.
+ if (LCSSAPHI->hasConstantValue() != LatchValue) {
+ LLVM_DEBUG(
+ dbgs() << "LCSSA PHI incoming value does not match latch value\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "PHI pair is safe:\n");
+ LLVM_DEBUG(dbgs() << " Inner: "; InnerPHI.dump());
+ LLVM_DEBUG(dbgs() << " Outer: "; OuterPHI->dump());
+ SafeOuterPHIs.insert(OuterPHI);
+ FI.InnerPHIsToTransform.insert(&InnerPHI);
+ }
+
+ for (PHINode &OuterPHI : FI.OuterLoop->getHeader()->phis()) {
+ if (!SafeOuterPHIs.count(&OuterPHI)) {
+ LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump());
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "checkPHIs: OK\n");
+ return true;
+}
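// For illustration, a minimal sketch (hypothetical; 'sum' is an invented
// name) of a loop-carried value that checkPHIs accepts, because it is only
// modified inside the inner loop:
//   int sum = 0;
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       sum += A[i*M+j];
// Updating 'sum' between the two loop headers instead would leave an outer
// header PHI with no matching inner/LCSSA pair, and the check bails out.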
+
+static bool
+checkOuterLoopInsts(struct FlattenInfo &FI,
+ SmallPtrSetImpl<Instruction *> &IterationInstructions,
+ const TargetTransformInfo *TTI) {
+ // Check for instructions in the outer but not inner loop. If any of these
+ // have side-effects then this transformation is not legal, and if there is
+  // a significant amount of code here which can't be optimised out then it's
+ // not profitable (as these instructions would get executed for each
+ // iteration of the inner loop).
+ unsigned RepeatedInstrCost = 0;
+ for (auto *B : FI.OuterLoop->getBlocks()) {
+ if (FI.InnerLoop->contains(B))
+ continue;
+
+ for (auto &I : *B) {
+ if (!isa<PHINode>(&I) && !I.isTerminator() &&
+ !isSafeToSpeculativelyExecute(&I)) {
+ LLVM_DEBUG(dbgs() << "Cannot flatten because instruction may have "
+ "side effects: ";
+ I.dump());
+ return false;
+ }
+ // The execution count of the outer loop's iteration instructions
+ // (increment, compare and branch) will be increased, but the
+ // equivalent instructions will be removed from the inner loop, so
+ // they make a net difference of zero.
+ if (IterationInstructions.count(&I))
+ continue;
+      // The unconditional branch to the inner loop's header will turn into
+      // a fall-through, so it adds no cost.
+ BranchInst *Br = dyn_cast<BranchInst>(&I);
+ if (Br && Br->isUnconditional() &&
+ Br->getSuccessor(0) == FI.InnerLoop->getHeader())
+ continue;
+ // Multiplies of the outer iteration variable and inner iteration
+ // count will be optimised out.
+ if (match(&I, m_c_Mul(m_Specific(FI.OuterInductionPHI),
+ m_Specific(FI.InnerLimit))))
+ continue;
+ int Cost = TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+ LLVM_DEBUG(dbgs() << "Cost " << Cost << ": "; I.dump());
+ RepeatedInstrCost += Cost;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Cost of instructions that will be repeated: "
+ << RepeatedInstrCost << "\n");
+ // Bail out if flattening the loops would cause instructions in the outer
+ // loop but not in the inner loop to be executed extra times.
+ if (RepeatedInstrCost > RepeatedInstructionThreshold) {
+ LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: not profitable, bailing.\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: OK\n");
+ return true;
+}
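// For illustration, a minimal sketch (hypothetical) of the cost this check
// guards against -- outer-only work that would run once per inner iteration
// after flattening, i.e. N*M times instead of N times:
//   for (int i = 0; i < N; ++i) {
//     int t = (i * 7) ^ Mask; // outer-only, repeated after flattening
//     for (int j = 0; j < M; ++j)
//       f(A[i*M+j] + t);
//   }
// Anything with side effects in that position is rejected as illegal rather
// than merely unprofitable.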
+
+static bool checkIVUsers(struct FlattenInfo &FI) {
+ // We require all uses of both induction variables to match this pattern:
+ //
+ // (OuterPHI * InnerLimit) + InnerPHI
+ //
+ // Any uses of the induction variables not matching that pattern would
+ // require a div/mod to reconstruct in the flattened loop, so the
+ // transformation wouldn't be profitable.
+
+ Value *InnerLimit = FI.InnerLimit;
+ if (FI.Widened &&
+ (isa<SExtInst>(InnerLimit) || isa<ZExtInst>(InnerLimit)))
+ InnerLimit = cast<Instruction>(InnerLimit)->getOperand(0);
+
+ // Check that all uses of the inner loop's induction variable match the
+ // expected pattern, recording the uses of the outer IV.
+ SmallPtrSet<Value *, 4> ValidOuterPHIUses;
+ for (User *U : FI.InnerInductionPHI->users()) {
+ if (U == FI.InnerIncrement)
+ continue;
+
+ // After widening the IVs, a trunc instruction might have been introduced, so
+ // look through truncs.
+ if (isa<TruncInst>(U)) {
+ if (!U->hasOneUse())
+ return false;
+ U = *U->user_begin();
+ }
+
+ LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
+
+ Value *MatchedMul;
+ Value *MatchedItCount;
+ bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
+ m_Value(MatchedMul))) &&
+ match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI),
+ m_Value(MatchedItCount)));
+
+ // Matches the same pattern as above, except it also looks for truncs
+ // on the phi, which can be the result of widening the induction variables.
+ bool IsAddTrunc = match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
+ m_Value(MatchedMul))) &&
+ match(MatchedMul,
+ m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
+ m_Value(MatchedItCount)));
+
+ if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerLimit) {
+ LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+ ValidOuterPHIUses.insert(MatchedMul);
+ FI.LinearIVUses.insert(U);
+ } else {
+ LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+ return false;
+ }
+ }
+
+ // Check that there are no uses of the outer IV other than the ones found
+ // as part of the pattern above.
+ for (User *U : FI.OuterInductionPHI->users()) {
+ if (U == FI.OuterIncrement)
+ continue;
+
+ auto IsValidOuterPHIUses = [&] (User *U) -> bool {
+ LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
+ if (!ValidOuterPHIUses.count(U)) {
+ LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+ return true;
+ };
+
+ if (auto *V = dyn_cast<TruncInst>(U)) {
+ for (auto *K : V->users()) {
+ if (!IsValidOuterPHIUses(K))
+ return false;
+ }
+ continue;
+ }
+
+ if (!IsValidOuterPHIUses(U))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n";
+ dbgs() << "Found " << FI.LinearIVUses.size()
+ << " value(s) that can be replaced:\n";
+ for (Value *V : FI.LinearIVUses) {
+ dbgs() << " ";
+ V->dump();
+ });
+ return true;
+}
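// For illustration (hypothetical names): the only IV uses that survive this
// check are instances of the linear form itself,
//   ... A[i*M + j] ...      // (OuterPHI * InnerLimit) + InnerPHI
// whereas any independent use of either variable, such as
//   ... B[i] ... or ... C[j] ...
// is rejected, because reconstructing i or j from the single flattened
// counter would need a div/mod.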
+
+// Return an OverflowResult indicating whether the multiplication of
+// InnerLimit and OuterLimit can be assumed not to overflow.
+static OverflowResult checkOverflow(struct FlattenInfo &FI,
+ DominatorTree *DT, AssumptionCache *AC) {
+ Function *F = FI.OuterLoop->getHeader()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // For debugging/testing.
+ if (AssumeNoOverflow)
+ return OverflowResult::NeverOverflows;
+
+ // Check if the multiply could not overflow due to known ranges of the
+ // input values.
+ OverflowResult OR = computeOverflowForUnsignedMul(
+ FI.InnerLimit, FI.OuterLimit, DL, AC,
+ FI.OuterLoop->getLoopPreheader()->getTerminator(), DT);
+ if (OR != OverflowResult::MayOverflow)
+ return OR;
+
+ for (Value *V : FI.LinearIVUses) {
+ for (Value *U : V->users()) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ // The IV is used as the operand of a GEP, and the IV is at least as
+ // wide as the address space of the GEP. In this case, the GEP would
+ // wrap around the address space before the IV increment wraps, which
+ // would be UB.
+ if (GEP->isInBounds() &&
+ V->getType()->getIntegerBitWidth() >=
+ DL.getPointerTypeSizeInBits(GEP->getType())) {
+ LLVM_DEBUG(
+ dbgs() << "use of linear IV would be UB if overflow occurred: ";
+ GEP->dump());
+ return OverflowResult::NeverOverflows;
+ }
+ }
+ }
+ }
+
+ return OverflowResult::MayOverflow;
+}
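// For illustration, the GEP reasoning above in source terms (a hypothetical
// example, assuming 64-bit pointers): in
//   int64_t k = (int64_t)i * M + j;
//   use(p[k]); // lowered to an 'inbounds' GEP indexed by k
// the index is at least as wide as the address space, so if the linearised
// value wrapped, the access would already be undefined behaviour; the pass
// may therefore treat the multiply as non-overflowing without a runtime
// check.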
+
+static bool CanFlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ AssumptionCache *AC, const TargetTransformInfo *TTI) {
+ SmallPtrSet<Instruction *, 8> IterationInstructions;
+ if (!findLoopComponents(FI.InnerLoop, IterationInstructions, FI.InnerInductionPHI,
+ FI.InnerLimit, FI.InnerIncrement, FI.InnerBranch, SE))
+ return false;
+ if (!findLoopComponents(FI.OuterLoop, IterationInstructions, FI.OuterInductionPHI,
+ FI.OuterLimit, FI.OuterIncrement, FI.OuterBranch, SE))
+ return false;
+
+ // Both of the loop limit values must be invariant in the outer loop
+ // (non-instructions are all inherently invariant).
+ if (!FI.OuterLoop->isLoopInvariant(FI.InnerLimit)) {
+ LLVM_DEBUG(dbgs() << "inner loop limit not invariant\n");
+ return false;
+ }
+ if (!FI.OuterLoop->isLoopInvariant(FI.OuterLimit)) {
+ LLVM_DEBUG(dbgs() << "outer loop limit not invariant\n");
+ return false;
+ }
+
+ if (!checkPHIs(FI, TTI))
+ return false;
+
+ // FIXME: it should be possible to handle different types correctly.
+ if (FI.InnerInductionPHI->getType() != FI.OuterInductionPHI->getType())
+ return false;
+
+ if (!checkOuterLoopInsts(FI, IterationInstructions, TTI))
+ return false;
+
+ // Find the values in the loop that can be replaced with the linearized
+ // induction variable, and check that there are no other uses of the inner
+ // or outer induction variable. If there were, we could still do this
+ // transformation, but we'd have to insert a div/mod to calculate the
+ // original IVs, so it wouldn't be profitable.
+ if (!checkIVUsers(FI))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "CanFlattenLoopPair: OK\n");
+ return true;
+}
+
+static bool DoFlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ AssumptionCache *AC,
+ const TargetTransformInfo *TTI) {
+ Function *F = FI.OuterLoop->getHeader()->getParent();
+ LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
+ {
+ using namespace ore;
+ OptimizationRemark Remark(DEBUG_TYPE, "Flattened", FI.InnerLoop->getStartLoc(),
+ FI.InnerLoop->getHeader());
+ OptimizationRemarkEmitter ORE(F);
+ Remark << "Flattened into outer loop";
+ ORE.emit(Remark);
+ }
+
+ Value *NewTripCount =
+ BinaryOperator::CreateMul(FI.InnerLimit, FI.OuterLimit, "flatten.tripcount",
+ FI.OuterLoop->getLoopPreheader()->getTerminator());
+ LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
+ NewTripCount->dump());
+
+ // Fix up PHI nodes that take values from the inner loop back-edge, which
+ // we are about to remove.
+ FI.InnerInductionPHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
+
+  // The old PHIs will be optimised away later, but for now we can't leave
+  // them in an invalid state, so we update them here too.
+ for (PHINode *PHI : FI.InnerPHIsToTransform)
+ PHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
+
+ // Modify the trip count of the outer loop to be the product of the two
+ // trip counts.
+ cast<User>(FI.OuterBranch->getCondition())->setOperand(1, NewTripCount);
+
+ // Replace the inner loop backedge with an unconditional branch to the exit.
+ BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock();
+ BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock();
+ InnerExitingBlock->getTerminator()->eraseFromParent();
+ BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+ DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
+
+ // Replace all uses of the polynomial calculated from the two induction
+ // variables with the one new one.
+ IRBuilder<> Builder(FI.OuterInductionPHI->getParent()->getTerminator());
+ for (Value *V : FI.LinearIVUses) {
+ Value *OuterValue = FI.OuterInductionPHI;
+ if (FI.Widened)
+ OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(),
+ "flatten.trunciv");
+
+ LLVM_DEBUG(dbgs() << "Replacing: "; V->dump();
+ dbgs() << "with: "; OuterValue->dump());
+ V->replaceAllUsesWith(OuterValue);
+ }
+
+  // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
+  // deleted, and that any information they have about the outer loop is
+  // invalidated.
+ SE->forgetLoop(FI.OuterLoop);
+ SE->forgetLoop(FI.InnerLoop);
+ LI->erase(FI.InnerLoop);
+ return true;
+}
+
+static bool CanWidenIV(struct FlattenInfo &FI, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ AssumptionCache *AC, const TargetTransformInfo *TTI) {
+ if (!WidenIV) {
+ LLVM_DEBUG(dbgs() << "Widening the IVs is disabled\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Try widening the IVs\n");
+ Module *M = FI.InnerLoop->getHeader()->getParent()->getParent();
+ auto &DL = M->getDataLayout();
+ auto *InnerType = FI.InnerInductionPHI->getType();
+ auto *OuterType = FI.OuterInductionPHI->getType();
+ unsigned MaxLegalSize = DL.getLargestLegalIntTypeSizeInBits();
+ auto *MaxLegalType = DL.getLargestLegalIntType(M->getContext());
+
+ // If both induction types are less than the maximum legal integer width,
+ // promote both to the widest type available so we know calculating
+ // (OuterLimit * InnerLimit) as the new trip count is safe.
+ if (InnerType != OuterType ||
+ InnerType->getScalarSizeInBits() >= MaxLegalSize ||
+ MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) {
+ LLVM_DEBUG(dbgs() << "Can't widen the IV\n");
+ return false;
+ }
+
+ SCEVExpander Rewriter(*SE, DL, "loopflatten");
+ SmallVector<WideIVInfo, 2> WideIVs;
+ SmallVector<WeakTrackingVH, 4> DeadInsts;
+  WideIVs.push_back({FI.InnerInductionPHI, MaxLegalType, false});
+  WideIVs.push_back({FI.OuterInductionPHI, MaxLegalType, false});
+ unsigned ElimExt;
+ unsigned Widened;
+
+ for (unsigned i = 0; i < WideIVs.size(); i++) {
+ PHINode *WidePhi = createWideIV(WideIVs[i], LI, SE, Rewriter, DT, DeadInsts,
+ ElimExt, Widened, true /* HasGuards */,
+ true /* UsePostIncrementRanges */);
+ if (!WidePhi)
+ return false;
+ LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
+ LLVM_DEBUG(dbgs() << "Deleting old phi: "; WideIVs[i].NarrowIV->dump());
+ RecursivelyDeleteDeadPHINode(WideIVs[i].NarrowIV);
+ }
+ // After widening, rediscover all the loop components.
+ assert(Widened && "Widenend IV expected");
+ FI.Widened = true;
+ return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+}
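// For illustration (hypothetical types, assuming a target with legal i64):
// two i32 induction variables are rewritten as i64 before flattening, so the
// new trip count N*M is computed in 64 bits and cannot wrap:
//   for (long i = 0; i < (long)N * M; ++i) // was two 32-bit counters
//     f(A[i]);
// Truncs are inserted where the original 32-bit values are still needed,
// which is why checkIVUsers also matches through trunc instructions.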
+
+static bool FlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ AssumptionCache *AC,
+ const TargetTransformInfo *TTI) {
+ LLVM_DEBUG(
+ dbgs() << "Loop flattening running on outer loop "
+ << FI.OuterLoop->getHeader()->getName() << " and inner loop "
+ << FI.InnerLoop->getHeader()->getName() << " in "
+ << FI.OuterLoop->getHeader()->getParent()->getName() << "\n");
+
+ if (!CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI))
+ return false;
+
+ // Check if we can widen the induction variables to avoid overflow checks.
+ if (CanWidenIV(FI, DT, LI, SE, AC, TTI))
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+
+ // Check if the new iteration variable might overflow. In this case, we
+ // need to version the loop, and select the original version at runtime if
+ // the iteration space is too large.
+ // TODO: We currently don't version the loop.
+ OverflowResult OR = checkOverflow(FI, DT, AC);
+ if (OR == OverflowResult::AlwaysOverflowsHigh ||
+ OR == OverflowResult::AlwaysOverflowsLow) {
+ LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
+ return false;
+ } else if (OR == OverflowResult::MayOverflow) {
+ LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+}
+
+bool Flatten(DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE,
+ AssumptionCache *AC, TargetTransformInfo *TTI) {
+ bool Changed = false;
+ for (auto *InnerLoop : LI->getLoopsInPreorder()) {
+ auto *OuterLoop = InnerLoop->getParentLoop();
+ if (!OuterLoop)
+ continue;
+ struct FlattenInfo FI(OuterLoop, InnerLoop);
+ Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+ }
+ return Changed;
+}
+
+PreservedAnalyses LoopFlattenPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ if (!Flatten(DT, LI, SE, AC, TTI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+class LoopFlattenLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopFlattenLegacyPass() : FunctionPass(ID) {
+ initializeLoopFlattenLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Possibly flatten loop L into its child.
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addPreserved<AssumptionCacheTracker>();
+ }
+};
+} // namespace
+
+char LoopFlattenLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
+ false, false)
+
+FunctionPass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); }
+
+bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>();
+ auto *TTI = &TTIP.getTTI(F);
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ return Flatten(DT, LI, SE, AC, TTI);
+}
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 20edc8699d79..b5f8dfa9aafb 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -46,6 +46,7 @@
#include "llvm/Transforms/Scalar/LoopFuse.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -53,6 +54,7 @@
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
@@ -64,6 +66,7 @@
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
using namespace llvm;
@@ -114,6 +117,11 @@ static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
"Use all available analyses")),
cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore);
+static cl::opt<unsigned> FusionPeelMaxCount(
+ "loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
+ cl::desc("Max number of iterations to be peeled from a loop, such that "
+ "fusion can take place"));
+
#ifndef NDEBUG
static cl::opt<bool>
VerboseFusionDebugging("loop-fusion-verbose-debug",
@@ -157,6 +165,12 @@ struct FusionCandidate {
bool Valid;
/// Guard branch of the loop, if it exists
BranchInst *GuardBranch;
+  /// Peeling parameters of the loop.
+  TTI::PeelingPreferences PP;
+  /// Whether this loop can be peeled.
+  bool AbleToPeel;
+  /// Whether this loop has been peeled.
+  bool Peeled;
/// Dominator and PostDominator trees are needed for the
/// FusionCandidateCompare function, required by FusionCandidateSet to
@@ -168,11 +182,13 @@ struct FusionCandidate {
OptimizationRemarkEmitter &ORE;
FusionCandidate(Loop *L, const DominatorTree *DT,
- const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)
+ const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
+ TTI::PeelingPreferences PP)
: Preheader(L->getLoopPreheader()), Header(L->getHeader()),
ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
Latch(L->getLoopLatch()), L(L), Valid(true),
- GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) {
+ GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
+ Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
// Walk over all blocks in the loop and check for conditions that may
// prevent fusion. For each block, walk over all instructions and collect
@@ -243,6 +259,17 @@ struct FusionCandidate {
return Preheader;
}
+  /// After peeling, the loop is modified quite a bit, so all of the cached
+  /// blocks need to be updated accordingly.
+ void updateAfterPeeling() {
+ Preheader = L->getLoopPreheader();
+ Header = L->getHeader();
+ ExitingBlock = L->getExitingBlock();
+ ExitBlock = L->getExitBlock();
+ Latch = L->getLoopLatch();
+ verify();
+ }
+
/// Given a guarded loop, get the successor of the guard that is not in the
/// loop.
///
@@ -254,6 +281,8 @@ struct FusionCandidate {
assert(GuardBranch && "Only valid on guarded loops.");
assert(GuardBranch->isConditional() &&
"Expecting guard to be a conditional branch.");
+ if (Peeled)
+ return GuardBranch->getSuccessor(1);
return (GuardBranch->getSuccessor(0) == Preheader)
? GuardBranch->getSuccessor(1)
: GuardBranch->getSuccessor(0);
@@ -515,13 +544,17 @@ private:
ScalarEvolution &SE;
PostDominatorTree &PDT;
OptimizationRemarkEmitter &ORE;
+ AssumptionCache &AC;
+
+ const TargetTransformInfo &TTI;
public:
LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
ScalarEvolution &SE, PostDominatorTree &PDT,
- OptimizationRemarkEmitter &ORE, const DataLayout &DL)
+ OptimizationRemarkEmitter &ORE, const DataLayout &DL,
+ AssumptionCache &AC, const TargetTransformInfo &TTI)
: LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
- DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {}
+ DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
/// This is the main entry point for loop fusion. It will traverse the
/// specified function and collect candidate loops to fuse, starting at the
@@ -606,7 +639,9 @@ private:
/// Flow Equivalent sets, sorted by dominance.
void collectFusionCandidates(const LoopVector &LV) {
for (Loop *L : LV) {
- FusionCandidate CurrCand(L, &DT, &PDT, ORE);
+ TTI::PeelingPreferences PP =
+ gatherPeelingPreferences(L, SE, TTI, None, None);
+ FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP);
if (!CurrCand.isEligibleForFusion(SE))
continue;
@@ -656,33 +691,133 @@ private:
/// Determine if two fusion candidates have the same trip count (i.e., they
/// execute the same number of iterations).
///
- /// Note that for now this method simply returns a boolean value because there
- /// are no mechanisms in loop fusion to handle different trip counts. In the
- /// future, this behaviour can be extended to adjust one of the loops to make
- /// the trip counts equal (e.g., loop peeling). When this is added, this
- /// interface may need to change to return more information than just a
- /// boolean value.
- bool identicalTripCounts(const FusionCandidate &FC0,
- const FusionCandidate &FC1) const {
+  /// This function returns a pair of values. The first is a boolean stating
+  /// whether or not the two candidates are known at compile time to have the
+  /// same TripCount. The second is the difference in the two TripCounts. This
+  /// information can be used later to determine whether or not peeling can be
+  /// performed on either one of the candidates.
+ std::pair<bool, Optional<unsigned>>
+ haveIdenticalTripCounts(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+
const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
if (isa<SCEVCouldNotCompute>(TripCount0)) {
UncomputableTripCount++;
LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
- return false;
+ return {false, None};
}
const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
if (isa<SCEVCouldNotCompute>(TripCount1)) {
UncomputableTripCount++;
LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
- return false;
+ return {false, None};
}
+
LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
<< *TripCount1 << " are "
<< (TripCount0 == TripCount1 ? "identical" : "different")
<< "\n");
- return (TripCount0 == TripCount1);
+ if (TripCount0 == TripCount1)
+ return {true, 0};
+
+ LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
+ "determining the difference between trip counts\n");
+
+ // Currently only considering loops with a single exit point
+ // and a non-constant trip count.
+ const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
+ const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
+
+    // If any of the tripcounts are zero that means the loop(s) do not have
+ // a single exit or a constant tripcount.
+ if (TC0 == 0 || TC1 == 0) {
+ LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
+ "have a constant number of iterations. Peeling "
+ "is not benefical\n");
+ return {false, None};
+ }
+
+ Optional<unsigned> Difference = None;
+ int Diff = TC0 - TC1;
+
+ if (Diff > 0)
+ Difference = Diff;
+ else {
+ LLVM_DEBUG(
+ dbgs() << "Difference is less than 0. FC1 (second loop) has more "
+ "iterations than the first one. Currently not supported\n");
+ }
+
+ LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
+ << "\n");
+
+ return {false, Difference};
+ }
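  // For illustration (made-up trip counts): if FC0 runs 100 iterations and
  // FC1 runs 98, this returns {false, 2}; the caller may then peel two
  // iterations off FC0 (subject to -loop-fusion-peel-max-count) so that both
  // loops match. If either count is unknown, or FC1 is the longer loop, it
  // returns {false, None} and no peeling is attempted.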
+
+ void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
+ unsigned PeelCount) {
+ assert(FC0.AbleToPeel && "Should be able to peel loop");
+
+ LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
+ << " iterations of the first loop. \n");
+
+ FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true);
+ if (FC0.Peeled) {
+ LLVM_DEBUG(dbgs() << "Done Peeling\n");
+
+#ifndef NDEBUG
+ auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
+
+ assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&
+ "Loops should have identical trip counts after peeling");
+#endif
+
+ FC0.PP.PeelCount += PeelCount;
+
+ // Peeling does not update the PDT
+ PDT.recalculate(*FC0.Preheader->getParent());
+
+ FC0.updateAfterPeeling();
+
+ // In this case the iterations of the loop are constant, so the first
+ // loop will execute completely (will not jump from one of
+ // the peeled blocks to the second loop). Here we are updating the
+ // branch conditions of each of the peeled blocks, such that it will
+ // branch to its successor which is not the preheader of the second loop
+      // in the case of unguarded loops, or the successor of the exit block of
+ // the first loop otherwise. Doing this update will ensure that the entry
+ // block of the first loop dominates the entry block of the second loop.
+ BasicBlock *BB =
+ FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
+ if (BB) {
+ SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+ SmallVector<Instruction *, 8> WorkList;
+ for (BasicBlock *Pred : predecessors(BB)) {
+ if (Pred != FC0.ExitBlock) {
+ WorkList.emplace_back(Pred->getTerminator());
+ TreeUpdates.emplace_back(
+ DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
+ }
+ }
+      // Cannot modify the predecessors inside the above loop as it would
+      // invalidate the iterators, causing memory errors.
+      for (Instruction *CurrentBranch : WorkList) {
+ BasicBlock *Succ = CurrentBranch->getSuccessor(0);
+ if (Succ == BB)
+ Succ = CurrentBranch->getSuccessor(1);
+ ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));
+ }
+
+ DTU.applyUpdates(TreeUpdates);
+ DTU.flush();
+ }
+ LLVM_DEBUG(
+ dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
+ << " iterations from the first loop.\n"
+ "Both Loops have the same number of iterations now.\n");
+ }
}
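  // For illustration (hypothetical loops, a minimal sketch): given
  //   for (i = 0; i < 102; ++i) A[i] = f(i);    // FC0
  //   for (i = 0; i < 100; ++i) B[i] = g(A[i]); // FC1
  // peeling the first two iterations of FC0 leaves two loops of 100
  // iterations each, which can then be fused into a single loop body.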
/// Walk each set of control flow equivalent fusion candidates and attempt to
@@ -716,7 +851,32 @@ private:
FC0->verify();
FC1->verify();
- if (!identicalTripCounts(*FC0, *FC1)) {
+ // Check if the candidates have identical tripcounts (first value of
+ // pair), and if not check the difference in the tripcounts between
+ // the loops (second value of pair). The difference is not equal to
+ // None iff the loops iterate a constant number of times, and have a
+ // single exit.
+ std::pair<bool, Optional<unsigned>> IdenticalTripCountRes =
+ haveIdenticalTripCounts(*FC0, *FC1);
+ bool SameTripCount = IdenticalTripCountRes.first;
+ Optional<unsigned> TCDifference = IdenticalTripCountRes.second;
+
+ // Here we are checking that FC0 (the first loop) can be peeled, and
+ // both loops have different tripcounts.
+ if (FC0->AbleToPeel && !SameTripCount && TCDifference) {
+ if (*TCDifference > FusionPeelMaxCount) {
+ LLVM_DEBUG(dbgs()
+ << "Difference in loop trip counts: " << *TCDifference
+                       << " is greater than maximum peel count specified: "
+ << FusionPeelMaxCount << "\n");
+ } else {
+ // Dependent on peeling being performed on the first loop, and
+ // assuming all other conditions for fusion return true.
+ SameTripCount = true;
+ }
+ }
+
+ if (!SameTripCount) {
LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
"counts. Not fusing.\n");
reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@@ -734,7 +894,7 @@ private:
// Ensure that FC0 and FC1 have identical guards.
// If one (or both) are not guarded, this check is not necessary.
if (FC0->GuardBranch && FC1->GuardBranch &&
- !haveIdenticalGuards(*FC0, *FC1)) {
+ !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
"guards. Not Fusing.\n");
reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@@ -803,13 +963,23 @@ private:
LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
<< *FC1 << "\n");
+ FusionCandidate FC0Copy = *FC0;
+ // Peel the loop after determining that fusion is legal. The Loops
+ // will still be safe to fuse after the peeling is performed.
+ bool Peel = TCDifference && *TCDifference > 0;
+ if (Peel)
+ peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
+
// Report fusion to the Optimization Remarks.
// Note this needs to be done *before* performFusion because
// performFusion will change the original loops, making it not
// possible to identify them after fusion is complete.
- reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
+ reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
+ FuseCounter);
- FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
+ FusionCandidate FusedCand(
+ performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE,
+ FC0Copy.PP);
FusedCand.verify();
assert(FusedCand.isEligibleForFusion(SE) &&
"Fused candidate should be eligible for fusion!");
@@ -1086,16 +1256,17 @@ private:
return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
}
- /// Simplify the condition of the latch branch of \p FC to true, when both of
- /// its successors are the same.
+ /// Modify the latch branch of FC to be unconditional since successors of the
+ /// branch are the same.
void simplifyLatchBranch(const FusionCandidate &FC) const {
BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
if (FCLatchBranch) {
assert(FCLatchBranch->isConditional() &&
FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
"Expecting the two successors of FCLatchBranch to be the same");
- FCLatchBranch->setCondition(
- llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType()));
+ BranchInst *NewBranch =
+ BranchInst::Create(FCLatchBranch->getSuccessor(0));
+ ReplaceInstWithInst(FCLatchBranch, NewBranch);
}
}
@@ -1155,7 +1326,8 @@ private:
if (FC0.GuardBranch)
return fuseGuardedLoops(FC0, FC1);
- assert(FC1.Preheader == FC0.ExitBlock);
+ assert(FC1.Preheader ==
+ (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
assert(FC1.Preheader->size() == 1 &&
FC1.Preheader->getSingleSuccessor() == FC1.Header);
@@ -1197,15 +1369,30 @@ private:
// to FC1.Header? I think this is basically what the three sequences are
// trying to accomplish; however, doing this directly in the CFG may mean
// the DT/PDT becomes invalid
- FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
- FC1.Header);
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+ if (!FC0.Peeled) {
+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
+ FC1.Header);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+ } else {
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
+
+ // Remove the ExitBlock of the first Loop (also not needed)
+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+ FC1.Header);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+ FC0.ExitBlock->getTerminator()->eraseFromParent();
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+ }
// The pre-header of L1 is not necessary anymore.
- assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+ assert(pred_empty(FC1.Preheader));
FC1.Preheader->getTerminator()->eraseFromParent();
new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
TreeUpdates.emplace_back(DominatorTree::UpdateType(
@@ -1246,7 +1433,7 @@ private:
FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
- // Change the condition of FC0 latch branch to true, as both successors of
+ // Modify the latch branch of FC0 to be unconditional as both successors of
// the branch are the same.
simplifyLatchBranch(FC0);
@@ -1268,6 +1455,11 @@ private:
LI.removeBlock(FC1.Preheader);
DTU.deleteBB(FC1.Preheader);
+ if (FC0.Peeled) {
+ LI.removeBlock(FC0.ExitBlock);
+ DTU.deleteBB(FC0.ExitBlock);
+ }
+
DTU.flush();
// Is there a way to keep SE up-to-date so we don't need to forget the loops
@@ -1282,8 +1474,7 @@ private:
mergeLatch(FC0, FC1);
// Merge the loops.
- SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
- FC1.L->block_end());
+ SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
FC0.L->addBlockEntry(BB);
FC1.L->removeBlockFromLoop(BB);
@@ -1291,7 +1482,7 @@ private:
continue;
LI.changeLoopFor(BB, FC0.L);
}
- while (!FC1.L->empty()) {
+ while (!FC1.L->isInnermost()) {
const auto &ChildLoopIt = FC1.L->begin();
Loop *ChildLoop = *ChildLoopIt;
FC1.L->removeChildLoop(ChildLoopIt);
@@ -1364,10 +1555,15 @@ private:
BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+ BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
// Move instructions from the exit block of FC0 to the beginning of the exit
- // block of FC1.
- moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, DI);
+  // block of FC1, in the case that the FC0 loop has not been peeled. In the
+  // case that the FC0 loop is peeled, move the instructions of the successor
+  // of the FC0 exit block to the beginning of the exit block of FC1.
+ moveInstructionsToTheBeginning(
+ (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
+ DT, PDT, DI);
// Move instructions from the guard block of FC1 to the end of the guard
// block of FC0.
@@ -1387,8 +1583,9 @@ private:
// for FC1 (where FC1 guard would have gone if FC1 was not executed).
FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
- FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
- FC1.Header);
+
+ BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
+ BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
// The guard of FC1 is not necessary anymore.
FC1.GuardBranch->eraseFromParent();
@@ -1403,9 +1600,18 @@ private:
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
- assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+ if (FC0.Peeled) {
+ // Remove the Block after the ExitBlock of FC0
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
+ FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
+ FC0ExitBlockSuccessor);
+ }
+
+ assert(pred_empty(FC1GuardBlock) &&
"Expecting guard block to have no predecessors");
- assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+ assert(succ_empty(FC1GuardBlock) &&
"Expecting guard block to have no successors");
// Remember the phi nodes originally in the header of FC0 in order to rewire
@@ -1459,14 +1665,13 @@ private:
    // TODO: In the future, we can handle non-empty exit blocks by merging any
// instructions from FC0 exit block into FC1 exit block prior to removing
// the block.
- assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
- "Expecting exit block to be empty");
+ assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
FC0.ExitBlock->getTerminator()->eraseFromParent();
new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
// Remove FC1 Preheader
// The pre-header of L1 is not necessary anymore.
- assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+ assert(pred_empty(FC1.Preheader));
FC1.Preheader->getTerminator()->eraseFromParent();
new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
TreeUpdates.emplace_back(DominatorTree::UpdateType(
@@ -1509,7 +1714,7 @@ private:
FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
- // Change the condition of FC0 latch branch to true, as both successors of
+ // Modify the latch branch of FC0 to be unconditional as both successors of
// the branch are the same.
simplifyLatchBranch(FC0);
@@ -1529,10 +1734,8 @@ private:
// All done
// Apply the updates to the Dominator Tree and cleanup.
- assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
- "FC1GuardBlock has successors!!");
- assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
- "FC1GuardBlock has predecessors!!");
+ assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
+ assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
// Update DT/PDT
DTU.applyUpdates(TreeUpdates);
@@ -1540,6 +1743,10 @@ private:
LI.removeBlock(FC1GuardBlock);
LI.removeBlock(FC1.Preheader);
LI.removeBlock(FC0.ExitBlock);
+ if (FC0.Peeled) {
+ LI.removeBlock(FC0ExitBlockSuccessor);
+ DTU.deleteBB(FC0ExitBlockSuccessor);
+ }
DTU.deleteBB(FC1GuardBlock);
DTU.deleteBB(FC1.Preheader);
DTU.deleteBB(FC0.ExitBlock);
@@ -1557,8 +1764,7 @@ private:
mergeLatch(FC0, FC1);
// Merge the loops.
- SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
- FC1.L->block_end());
+ SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
FC0.L->addBlockEntry(BB);
FC1.L->removeBlockFromLoop(BB);
@@ -1566,7 +1772,7 @@ private:
continue;
LI.changeLoopFor(BB, FC0.L);
}
- while (!FC1.L->empty()) {
+ while (!FC1.L->isInnermost()) {
const auto &ChildLoopIt = FC1.L->begin();
Loop *ChildLoop = *ChildLoopIt;
FC1.L->removeChildLoop(ChildLoopIt);
@@ -1606,6 +1812,8 @@ struct LoopFuseLegacy : public FunctionPass {
AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addRequired<DependenceAnalysisWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
@@ -1622,9 +1830,12 @@ struct LoopFuseLegacy : public FunctionPass {
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
const DataLayout &DL = F.getParent()->getDataLayout();
- LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
+
+ LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
return LF.fuseLoops(F);
}
};
@@ -1637,9 +1848,11 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
const DataLayout &DL = F.getParent()->getDataLayout();
- LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
+
+ LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
bool Changed = LF.fuseLoops(F);
if (!Changed)
return PreservedAnalyses::all();
@@ -1662,6 +1875,8 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 3cb4df12e9b0..8064c02e2b39 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -47,6 +47,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -79,6 +80,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -106,6 +108,32 @@ using namespace llvm;
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+STATISTIC(
+ NumShiftUntilBitTest,
+    "Number of uncountable loops recognized as 'shift until bittest' idiom");
+
+bool DisableLIRP::All;
+static cl::opt<bool, true>
+ DisableLIRPAll("disable-" DEBUG_TYPE "-all",
+ cl::desc("Options to disable Loop Idiom Recognize Pass."),
+ cl::location(DisableLIRP::All), cl::init(false),
+ cl::ReallyHidden);
+
+bool DisableLIRP::Memset;
+static cl::opt<bool, true>
+ DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
+ cl::desc("Proceed with loop idiom recognize pass, but do "
+ "not convert loop(s) to memset."),
+ cl::location(DisableLIRP::Memset), cl::init(false),
+ cl::ReallyHidden);
+
+bool DisableLIRP::Memcpy;
+static cl::opt<bool, true>
+ DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
+ cl::desc("Proceed with loop idiom recognize pass, but do "
+ "not convert loop(s) to memcpy."),
+ cl::location(DisableLIRP::Memcpy), cl::init(false),
+ cl::ReallyHidden);
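// With this file's DEBUG_TYPE (assumed here to be "loop-idiom", as defined
// earlier in the file), the options above expand to -disable-loop-idiom-all,
// -disable-loop-idiom-memset and -disable-loop-idiom-memcpy. The second
// template argument plus cl::location store the parsed value in the external
// DisableLIRP booleans so other code can query them directly; a minimal
// sketch of the same pattern (hypothetical names):
//   static bool ExternalFlag;
//   static cl::opt<bool, true>
//       DisableFoo("disable-foo", cl::location(ExternalFlag),
//                  cl::init(false), cl::Hidden);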
static cl::opt<bool> UseLIRCodeSizeHeurs(
"use-lir-code-size-heurs",
@@ -204,6 +232,8 @@ private:
const DebugLoc &DL, bool ZeroCheck,
bool IsCntPhiUsedOutsideLoop);
+ bool recognizeShiftUntilBitTest();
+
/// @}
};
@@ -217,6 +247,9 @@ public:
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (DisableLIRP::All)
+ return false;
+
if (skipLoop(L))
return false;
@@ -262,6 +295,9 @@ char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
+ if (DisableLIRP::All)
+ return PreservedAnalyses::all();
+
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
@@ -426,11 +462,6 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
if (!SI->isUnordered())
return LegalStoreKind::None;
- // Don't convert stores of non-integral pointer types to memsets (which stores
- // integers).
- if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
- return LegalStoreKind::None;
-
// Avoid merging nontemporal stores.
if (SI->getMetadata(LLVMContext::MD_nontemporal))
return LegalStoreKind::None;
@@ -438,9 +469,17 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
+  // Don't convert stores of non-integral pointer types to memsets (which store
+ // integers).
+ if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
+ return LegalStoreKind::None;
+
// Reject stores that are so large that they overflow an unsigned.
- uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
- if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
+ // When storing out scalable vectors we bail out for now, since the code
+ // below currently only works for constant strides.
+ TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
+ (SizeInBits.getFixedSize() >> 32) != 0)
return LegalStoreKind::None;
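  // For illustration: a store of <4 x i32> has a fixed size of 128 bits and
  // passes this check, a store of <vscale x 4 x i32> has a scalable size and
  // bails out, and an i19 store is rejected because 19 bits is not a whole
  // number of bytes.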
// See if the pointer expression is an AddRec like {base,+,1} on the current
@@ -469,13 +508,13 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
// If we're allowed to form a memset, and the stored value would be
// acceptable for memset, use it.
- if (!UnorderedAtomic && HasMemset && SplatValue &&
+ if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
// Verify that the stored value is loop invariant. If not, we can't
// promote the memset.
CurLoop->isLoopInvariant(SplatValue)) {
// It looks like we can use SplatValue.
return LegalStoreKind::Memset;
- } else if (!UnorderedAtomic && HasMemsetPattern &&
+ } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
// Don't create memset_pattern16s with address spaces.
StorePtr->getType()->getPointerAddressSpace() == 0 &&
(PatternValue = getMemSetPatternValue(StoredVal, DL))) {
@@ -484,7 +523,7 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
}
// Otherwise, see if the store can be turned into a memcpy.
- if (HasMemcpy) {
+ if (HasMemcpy && !DisableLIRP::Memcpy) {
// Check to see if the stride matches the size of the store. If so, then we
// know that every byte is touched in the loop.
APInt Stride = getStoreStride(StoreEv);
@@ -539,12 +578,12 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
break;
case LegalStoreKind::Memset: {
// Find the base pointer.
- Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
StoreRefsForMemset[Ptr].push_back(SI);
} break;
case LegalStoreKind::MemsetPattern: {
// Find the base pointer.
- Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
StoreRefsForMemsetPattern[Ptr].push_back(SI);
} break;
case LegalStoreKind::Memcpy:
@@ -812,7 +851,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// Get the location that may be stored across the loop. Since the access is
// strided positively through memory, we say that the modified location starts
// at the pointer and has infinite size.
- LocationSize AccessSize = LocationSize::unknown();
+ LocationSize AccessSize = LocationSize::afterPointer();
// If the loop iterates a fixed number of times, we can refine the access size
// to be exactly the size of the memset, which is (BECount+1)*StoreSize
@@ -864,8 +903,8 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
// If we're going to need to zero extend the BE count, check if we can add
// one to it prior to zero extending without overflow. Provided this is safe,
// it allows better simplification of the +1.
- if (DL->getTypeSizeInBits(BECount->getType()) <
- DL->getTypeSizeInBits(IntPtr) &&
+ if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
+ DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
SE->isLoopEntryGuardedByCond(
CurLoop, ICmpInst::ICMP_NE, BECount,
SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
@@ -908,10 +947,12 @@ bool LoopIdiomRecognize::processLoopStridedStore(
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
+ SCEVExpanderCleaner ExpCleaner(Expander, *DT);
Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
+ bool Changed = false;
const SCEV *Start = Ev->getStart();
// Handle negative strided loops.
if (NegStride)
@@ -920,7 +961,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
if (!isSafeToExpand(Start, *SE))
- return false;
+ return Changed;
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
// this into a memset in the loop preheader now if we want. However, this
@@ -929,16 +970,22 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// base pointer and checking the region.
Value *BasePtr =
Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
+
+ // From here on out, conservatively report to the pass manager that we've
+ // changed the IR, even if we later clean up these added instructions. There
+ // may be structural differences e.g. in the order of use lists not accounted
+ // for in just a textual dump of the IR. This is written as a variable, even
+ // though statically all the places this dominates could be replaced with
+ // 'true', with the hope that anyone trying to be clever / "more precise" with
+ // the return value will read this comment, and leave them alone.
+ Changed = true;
+
if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores)) {
- Expander.clear();
- // If we generated new code for the base pointer, clean up.
- RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
- return false;
- }
+ StoreSize, *AA, Stores))
+ return Changed;
if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
- return false;
+ return Changed;
// Okay, everything looks good, insert the memset.
@@ -948,7 +995,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
if (!isSafeToExpand(NumBytesS, *SE))
- return false;
+ return Changed;
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
@@ -1007,32 +1054,10 @@ bool LoopIdiomRecognize::processLoopStridedStore(
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemSet;
+ ExpCleaner.markResultUsed();
return true;
}
-class ExpandedValuesCleaner {
- SCEVExpander &Expander;
- TargetLibraryInfo *TLI;
- SmallVector<Value *, 4> ExpandedValues;
- bool Commit = false;
-
-public:
- ExpandedValuesCleaner(SCEVExpander &Expander, TargetLibraryInfo *TLI)
- : Expander(Expander), TLI(TLI) {}
-
- void add(Value *V) { ExpandedValues.push_back(V); }
-
- void commit() { Commit = true; }
-
- ~ExpandedValuesCleaner() {
- if (!Commit) {
- Expander.clear();
- for (auto *V : ExpandedValues)
- RecursivelyDeleteTriviallyDeadInstructions(V, TLI);
- }
- }
-};
-
/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
/// for (i) A[i] = B[i];
@@ -1063,8 +1088,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
- ExpandedValuesCleaner EVC(Expander, TLI);
+ SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+ bool Changed = false;
const SCEV *StrStart = StoreEv->getStart();
unsigned StrAS = SI->getPointerAddressSpace();
Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
@@ -1081,13 +1107,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// checking everything.
Value *StoreBasePtr = Expander.expandCodeFor(
StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
- EVC.add(StoreBasePtr);
+
+ // From here on out, conservatively report to the pass manager that we've
+ // changed the IR, even if we later clean up these added instructions. There
+ // may be structural differences e.g. in the order of use lists not accounted
+ // for in just a textual dump of the IR. This is written as a variable, even
+ // though statically all the places this dominates could be replaced with
+ // 'true', with the hope that anyone trying to be clever / "more precise" with
+ // the return value will read this comment, and leave them alone.
+ Changed = true;
SmallPtrSet<Instruction *, 1> Stores;
Stores.insert(SI);
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSize, *AA, Stores))
- return false;
+ return Changed;
const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = LI->getPointerAddressSpace();
@@ -1100,14 +1134,13 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// mutated by the loop.
Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
- EVC.add(LoadBasePtr);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
StoreSize, *AA, Stores))
- return false;
+ return Changed;
if (avoidLIRForMultiBlockLoop())
- return false;
+ return Changed;
// Okay, everything is safe, we can transform this!
@@ -1116,7 +1149,6 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
- EVC.add(NumBytes);
CallInst *NewCall = nullptr;
// Check whether to generate an unordered atomic memcpy:
@@ -1131,14 +1163,14 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const Align StoreAlign = SI->getAlign();
const Align LoadAlign = LI->getAlign();
if (StoreAlign < StoreSize || LoadAlign < StoreSize)
- return false;
+ return Changed;
// If the element.atomic memcpy is not lowered into explicit
// loads/stores later, then it will be lowered into an element-size
// specific lib call. If the lib call doesn't exist for our store size, then
// we shouldn't generate the memcpy.
if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
- return false;
+ return Changed;
// Create the call.
// Note that unordered atomic loads/stores are *required* by the spec to
@@ -1176,7 +1208,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemCpy;
- EVC.commit();
+ ExpCleaner.markResultUsed();
return true;
}
@@ -1186,7 +1218,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
bool IsLoopMemset) {
if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
- if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
+ if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
<< " : LIR " << (IsMemset ? "Memset" : "Memcpy")
<< " avoided: multi-block top-level loop\n");
@@ -1203,7 +1235,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
<< "] Noncountable Loop %"
<< CurLoop->getHeader()->getName() << "\n");
- return recognizePopcount() || recognizeAndInsertFFS();
+ return recognizePopcount() || recognizeAndInsertFFS() ||
+ recognizeShiftUntilBitTest();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1450,6 +1483,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
return false;
// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
+ // or cnt.next = cnt + -1.
// TODO: We can skip the step. If loop trip count is known (CTLZ),
// then all uses of "cnt.next" could be optimized to the trip count
// plus "cnt0". Currently it is not optimized.
@@ -1463,7 +1497,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
continue;
ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
- if (!Inc || !Inc->isOne())
+ if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
continue;
PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
@@ -1692,11 +1726,13 @@ void LoopIdiomRecognize::transformLoopToCountable(
// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
IRBuilder<> Builder(PreheaderBr);
Builder.SetCurrentDebugLocation(DL);
- Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
// Count = BitWidth - CTLZ(InitX);
+ // NewCount = Count;
// If there are uses of CntPhi create:
- // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ // NewCount = BitWidth - CTLZ(InitX >> 1);
+ // Count = NewCount + 1;
+ Value *InitXNext;
if (IsCntPhiUsedOutsideLoop) {
if (DefX->getOpcode() == Instruction::AShr)
InitXNext =
@@ -1711,27 +1747,31 @@ void LoopIdiomRecognize::transformLoopToCountable(
llvm_unreachable("Unexpected opcode!");
} else
InitXNext = InitX;
- FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
- Count = Builder.CreateSub(
- ConstantInt::get(FFS->getType(),
- FFS->getType()->getIntegerBitWidth()),
+ Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
+ Value *Count = Builder.CreateSub(
+ ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()),
FFS);
+ Value *NewCount = Count;
if (IsCntPhiUsedOutsideLoop) {
- CountPrev = Count;
- Count = Builder.CreateAdd(
- CountPrev,
- ConstantInt::get(CountPrev->getType(), 1));
+ NewCount = Count;
+ Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1));
}
- NewCount = Builder.CreateZExtOrTrunc(
- IsCntPhiUsedOutsideLoop ? CountPrev : Count,
- cast<IntegerType>(CntInst->getType()));
+ NewCount = Builder.CreateZExtOrTrunc(NewCount,
+ cast<IntegerType>(CntInst->getType()));
- // If the counter's initial value is not zero, insert Add Inst.
Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
- ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
- if (!InitConst || !InitConst->isZero())
- NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
+ // If the counter was being incremented in the loop, add NewCount to the
+ // counter's initial value, but only if the initial value is not zero.
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ } else {
+ // If the count was being decremented in the loop, subtract NewCount from
+ // the counter's initial value.
+ NewCount = Builder.CreateSub(CntInitVal, NewCount);
+ }
// Step 2: Insert new IV and loop condition:
// loop:
@@ -1879,3 +1919,343 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// loop. The loop would otherwise not be deleted even if it becomes empty.
SE->forgetLoop(CurLoop);
}
+
+/// Match loop-invariant value.
+template <typename SubPattern_t> struct match_LoopInvariant {
+ SubPattern_t SubPattern;
+ const Loop *L;
+
+ match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
+ : SubPattern(SP), L(L) {}
+
+ template <typename ITy> bool match(ITy *V) {
+ return L->isLoopInvariant(V) && SubPattern.match(V);
+ }
+};
+
+/// Matches if the value is loop-invariant.
+template <typename Ty>
+inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
+ return match_LoopInvariant<Ty>(M, L);
+}
+
+/// Return true if the idiom is detected in the loop.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// entry:
+/// <...>
+/// %bitmask = shl i32 1, %bitpos
+/// br label %loop
+///
+/// loop:
+/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
+/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
+/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
+/// %x.next = shl i32 %x.curr, 1
+/// <...>
+/// br i1 %x.curr.isbitunset, label %loop, label %end
+///
+/// end:
+/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
+/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
+/// <...>
+/// \endcode
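+///
+/// In C-like pseudocode this corresponds roughly to:
+/// \code
+///   while ((x & (1u << bitpos)) == 0)
+///     x <<= 1;
+/// \endcode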
+static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
+ Value *&BitMask, Value *&BitPos,
+ Value *&CurrX, Instruction *&NextX) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE
+ " Performing shift-until-bittest idiom detection.\n");
+
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
+ return false;
+ }
+
+ BasicBlock *LoopHeaderBB = CurLoop->getHeader();
+ BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
+ assert(LoopPreheaderBB && "There is always a loop preheader.");
+
+ using namespace PatternMatch;
+
+ // Step 1: Check if the loop backedge is in desirable form.
+
+ ICmpInst::Predicate Pred;
+ Value *CmpLHS, *CmpRHS;
+ BasicBlock *TrueBB, *FalseBB;
+ if (!match(LoopHeaderBB->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
+ m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
+ return false;
+ }
+
+ // Step 2: Check if the backedge's condition is in desirable form.
+
+ auto MatchVariableBitMask = [&]() {
+ return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
+ match(CmpLHS,
+ m_c_And(m_Value(CurrX),
+ m_CombineAnd(
+ m_Value(BitMask),
+ m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
+ CurLoop))));
+ };
+ auto MatchConstantBitMask = [&]() {
+ return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
+ match(CmpLHS, m_And(m_Value(CurrX),
+ m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
+ (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
+ };
+ auto MatchDecomposableConstantBitMask = [&]() {
+ APInt Mask;
+ return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
+ ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
+ (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
+ (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
+ };
+
+ if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
+ !MatchDecomposableConstantBitMask()) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
+ return false;
+ }
+
+ // Step 3: Check if the recurrence is in desirable form.
+ auto *CurrXPN = dyn_cast<PHINode>(CurrX);
+ if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
+ return false;
+ }
+
+ BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
+ NextX =
+ dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
+
+ if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
+ // FIXME: support right-shift?
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
+ return false;
+ }
+
+ // Step 4: Check if the backedge's destinations are in desirable form.
+
+ assert(ICmpInst::isEquality(Pred) &&
+ "Should only get equality predicates here.");
+
+ // cmp-br is commutative, so canonicalize to a single variant.
+ if (Pred != ICmpInst::Predicate::ICMP_EQ) {
+ Pred = ICmpInst::getInversePredicate(Pred);
+ std::swap(TrueBB, FalseBB);
+ }
+
+ // We expect to exit loop when comparison yields false,
+ // so when it yields true we should branch back to loop header.
+ if (TrueBB != LoopHeaderBB) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
+ return false;
+ }
+
+ // Okay, idiom checks out.
+ return true;
+}
+
+/// Look for the following loop:
+/// \code
+/// entry:
+/// <...>
+/// %bitmask = shl i32 1, %bitpos
+/// br label %loop
+///
+/// loop:
+/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
+/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
+/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
+/// %x.next = shl i32 %x.curr, 1
+/// <...>
+/// br i1 %x.curr.isbitunset, label %loop, label %end
+///
+/// end:
+/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
+/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
+/// <...>
+/// \endcode
+///
+/// And transform it into:
+/// \code
+/// entry:
+/// %bitmask = shl i32 1, %bitpos
+/// %lowbitmask = add i32 %bitmask, -1
+/// %mask = or i32 %lowbitmask, %bitmask
+/// %x.masked = and i32 %x, %mask
+/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
+/// i1 true)
+/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
+/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
+/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
+/// %tripcount = add i32 %backedgetakencount, 1
+/// %x.curr = shl i32 %x, %backedgetakencount
+/// %x.next = shl i32 %x, %tripcount
+/// br label %loop
+///
+/// loop:
+/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
+/// %loop.iv.next = add nuw i32 %loop.iv, 1
+/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
+/// <...>
+/// br i1 %loop.ivcheck, label %end, label %loop
+///
+/// end:
+/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
+/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
+/// <...>
+/// \endcode
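+///
+/// For example (illustrative values, i32): with %x = 4 and %bitpos = 5,
+/// %bitmask = 32, %mask = 63, %x.masked = 4, %x.masked.numleadingzeros = 29,
+/// %x.masked.numactivebits = 3 and %x.masked.leadingonepos = 2, so
+/// %backedgetakencount = 5 - 2 = 3 and %tripcount = 4. That matches the
+/// original loop, which shifts 4 -> 8 -> 16 -> 32 and exits with
+/// %x.curr = 32 = 4 << 3 and %x.next = 64 = 4 << 4.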
+bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
+ bool MadeChange = false;
+
+ Value *X, *BitMask, *BitPos, *XCurr;
+ Instruction *XNext;
+ if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
+ XNext)) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE
+ " shift-until-bittest idiom detection failed.\n");
+ return MadeChange;
+ }
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
+
+ // Ok, it is the idiom we were looking for, we *could* transform this loop,
+ // but is it profitable to transform?
+
+ BasicBlock *LoopHeaderBB = CurLoop->getHeader();
+ BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
+ assert(LoopPreheaderBB && "There is always a loop preheader.");
+
+ BasicBlock *SuccessorBB = CurLoop->getExitBlock();
+  assert(SuccessorBB && "There is only a single successor.");
+
+ IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
+ Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
+
+ Intrinsic::ID IntrID = Intrinsic::ctlz;
+ Type *Ty = X->getType();
+
+ TargetTransformInfo::TargetCostKind CostKind =
+ TargetTransformInfo::TCK_SizeAndLatency;
+
+  // The rewrite is considered to be unprofitable if and only if the
+  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
+ // making the loop countable, even if nothing else changes.
+ IntrinsicCostAttributes Attrs(
+ IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
+ int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
+ if (Cost > TargetTransformInfo::TCC_Basic) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE
+ " Intrinsic is too costly, not beneficial\n");
+ return MadeChange;
+ }
+ if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
+ TargetTransformInfo::TCC_Basic) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
+ return MadeChange;
+ }
+
+ // Ok, transform appears worthwhile.
+ MadeChange = true;
+
+ // Step 1: Compute the loop trip count.
+
+ Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
+ BitPos->getName() + ".lowbitmask");
+ Value *Mask =
+ Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
+ Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
+ CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
+ IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
+ /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
+ Value *XMaskedNumActiveBits = Builder.CreateSub(
+ ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
+ XMasked->getName() + ".numactivebits");
+ Value *XMaskedLeadingOnePos =
+ Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
+ XMasked->getName() + ".leadingonepos");
+
+ Value *LoopBackedgeTakenCount = Builder.CreateSub(
+ BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount");
+  // We know the loop's backedge-taken count, but what is its trip count?
+  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
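+  // (For instance, in i2 the backedge-taken count is at most 1; adding 1 gives
+  // the bit pattern 0b10, which is 2 unsigned but -2 signed, i.e. a signed
+  // overflow. Hence no NSW for the i2 case, while NUW still holds.)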
+ Value *LoopTripCount =
+ Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
+ CurLoop->getName() + ".tripcount");
+
+ // Step 2: Compute the recurrence's final value without a loop.
+
+ // NewX is always safe to compute, because `LoopBackedgeTakenCount`
+ // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
+ Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
+ NewX->takeName(XCurr);
+ if (auto *I = dyn_cast<Instruction>(NewX))
+ I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
+
+ Value *NewXNext;
+ // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
+ // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
+ // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
+ // that isn't the case, we'll need to emit an alternative, safe IR.
+ if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
+ PatternMatch::match(
+ BitPos, PatternMatch::m_SpecificInt_ICMP(
+ ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
+ Ty->getScalarSizeInBits() - 1))))
+ NewXNext = Builder.CreateShl(X, LoopTripCount);
+ else {
+    // Otherwise, just additionally shift by one. It's the smallest solution;
+ // alternatively, we could check that NewX is INT_MIN (or BitPos is )
+ // and select 0 instead.
+ NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
+ }
+
+ NewXNext->takeName(XNext);
+ if (auto *I = dyn_cast<Instruction>(NewXNext))
+ I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
+
+  // Step 3: Adjust the successor basic block to receive the computed
+ // recurrence's final value instead of the recurrence itself.
+
+ XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
+ XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
+
+ // Step 4: Rewrite the loop into a countable form, with canonical IV.
+
+ // The new canonical induction variable.
+ Builder.SetInsertPoint(&LoopHeaderBB->front());
+ auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
+
+ // The induction itself.
+  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
+ Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
+ auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1),
+ IV->getName() + ".next");
+
+ // The loop trip count check.
+ auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
+ CurLoop->getName() + ".ivcheck");
+ Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ LoopHeaderBB->getTerminator()->eraseFromParent();
+
+ // Populate the IV PHI.
+ IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
+ IV->addIncoming(IVNext, LoopHeaderBB);
+
+ // Step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+
+ SE->forgetLoop(CurLoop);
+
+ // Other passes will take care of actually deleting the loop if possible.
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
+
+ ++NumShiftUntilBitTest;
+ return MadeChange;
+}
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 7787c0bccd4c..d9dbc0deb42a 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopInterchange.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -27,6 +28,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -427,9 +429,7 @@ private:
const LoopInterchangeLegality &LIL;
};
-// Main LoopInterchange Pass.
-struct LoopInterchange : public LoopPass {
- static char ID;
+struct LoopInterchange {
ScalarEvolution *SE = nullptr;
LoopInfo *LI = nullptr;
DependenceInfo *DI = nullptr;
@@ -438,34 +438,21 @@ struct LoopInterchange : public LoopPass {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- LoopInterchange() : LoopPass(ID) {
- initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DependenceAnalysisWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
+ DominatorTree *DT, OptimizationRemarkEmitter *ORE)
+ : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
- getLoopAnalysisUsage(AU);
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L) || L->getParentLoop())
+ bool run(Loop *L) {
+ if (L->getParentLoop())
return false;
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
return processLoopList(populateWorklist(*L));
}
bool isComputableLoopNest(LoopVector LoopList) {
for (Loop *L : LoopList) {
const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
- if (ExitCountOuter == SE->getCouldNotCompute()) {
+ if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
return false;
}
@@ -624,6 +611,13 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
containsUnsafeInstructions(OuterLoopLatch))
return false;
+ // Also make sure the inner loop preheader does not contain any unsafe
+ // instructions. Note that all instructions in the preheader will be moved to
+ // the outer loop header when interchanging.
+ if (InnerLoopPreHeader != OuterLoopHeader &&
+ containsUnsafeInstructions(InnerLoopPreHeader))
+ return false;
+
LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
// We have a perfect loop nest.
return true;
@@ -667,6 +661,10 @@ static Value *followLCSSA(Value *SV) {
// Check V's users to see if it is involved in a reduction in L.
static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+ // Reduction variables cannot be constants.
+ if (isa<Constant>(V))
+ return nullptr;
+
for (Value *User : V->users()) {
if (PHINode *PHI = dyn_cast<PHINode>(User)) {
if (PHI->getNumIncomingValues() == 1)
@@ -707,8 +705,7 @@ bool LoopInterchangeLegality::findInductionAndReductions(
Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
if (!InnerRedPhi ||
- !llvm::any_of(InnerRedPhi->incoming_values(),
- [&PHI](Value *V) { return V == &PHI; })) {
+ !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
LLVM_DEBUG(
dbgs()
<< "Failed to recognize PHI as an induction or reduction.\n");
@@ -1045,6 +1042,10 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
bool FoundInnerInduction = false;
bool FoundOuterInduction = false;
for (unsigned i = 0; i < NumOp; ++i) {
+ // Skip operands that are not SCEV-able.
+ if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
+ continue;
+
const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
if (!AR)
@@ -1189,7 +1190,7 @@ void LoopInterchangeTransform::restructureLoops(
removeChildLoop(NewInner, NewOuter);
LI->changeTopLevelLoop(NewInner, NewOuter);
}
- while (!NewOuter->empty())
+ while (!NewOuter->isInnermost())
NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
NewOuter->addChildLoop(NewInner);
@@ -1305,6 +1306,21 @@ bool LoopInterchangeTransform::transform() {
LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
}
+ // Instructions in the original inner loop preheader may depend on values
+ // defined in the outer loop header. Move them there, because the original
+ // inner loop preheader will become the entry into the interchanged loop nest.
+ // Currently we move all instructions and rely on LICM to move invariant
+ // instructions outside the loop nest.
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ if (InnerLoopPreHeader != OuterLoopHeader) {
+ SmallPtrSet<Instruction *, 4> NeedsMoving;
+ for (Instruction &I :
+ make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
+ std::prev(InnerLoopPreHeader->end()))))
+ I.moveBefore(OuterLoopHeader->getTerminator());
+ }
+
Transformed |= adjustLoopLinks();
if (!Transformed) {
LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
@@ -1521,8 +1537,7 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
// The outer loop header might or might not branch to the outer latch.
// We are guaranteed to branch to the inner loop preheader.
- if (std::find(succ_begin(OuterLoopHeaderBI), succ_end(OuterLoopHeaderBI),
- OuterLoopLatch) != succ_end(OuterLoopHeaderBI))
+ if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch))
updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates,
/*MustUpdateOnce=*/false);
updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
@@ -1569,9 +1584,9 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
// Now update the reduction PHIs in the inner and outer loop headers.
SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
- for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
+ for (PHINode &PHI : drop_begin(InnerLoopHeader->phis()))
InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
- for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
+ for (PHINode &PHI : drop_begin(OuterLoopHeader->phis()))
OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
auto &OuterInnerReductions = LIL.getOuterInnerReductions();
@@ -1595,6 +1610,17 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
+ // Values defined in the outer loop header could be used in the inner loop
+ // latch. In that case, we need to create LCSSA phis for them, because after
+ // interchanging they will be defined in the new inner loop and used in the
+ // new outer loop.
+ IRBuilder<> Builder(OuterLoopHeader->getContext());
+ SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
+ for (Instruction &I :
+ make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
+ MayNeedLCSSAPhis.push_back(&I);
+ formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder);
+
return true;
}
@@ -1612,15 +1638,58 @@ bool LoopInterchangeTransform::adjustLoopLinks() {
return Changed;
}
-char LoopInterchange::ID = 0;
+/// Main LoopInterchange Pass.
+struct LoopInterchangeLegacyPass : public LoopPass {
+ static char ID;
+
+ LoopInterchangeLegacyPass() : LoopPass(ID) {
+ initializeLoopInterchangeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
+ }
+};
-INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
+char LoopInterchangeLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopInterchangeLegacyPass, "loop-interchange",
"Interchanges loops for cache reuse", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
+INITIALIZE_PASS_END(LoopInterchangeLegacyPass, "loop-interchange",
"Interchanges loops for cache reuse", false, false)
-Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
+Pass *llvm::createLoopInterchangePass() {
+ return new LoopInterchangeLegacyPass();
+}
+
+PreservedAnalyses LoopInterchangePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function &F = *L.getHeader()->getParent();
+
+ DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+ OptimizationRemarkEmitter ORE(&F);
+ if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(&L))
+ return PreservedAnalyses::all();
+ return getLoopPassPreservedAnalyses();
+}
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 4412b3079461..058612149a94 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -27,7 +27,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -56,6 +55,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
@@ -308,8 +308,8 @@ public:
/// We need a check if one is a pointer for a candidate load and the other is
/// a pointer for a possibly intervening store.
bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
- const SmallPtrSet<Value *, 4> &PtrsWrittenOnFwdingPath,
- const std::set<Value *> &CandLoadPtrs) {
+ const SmallPtrSetImpl<Value *> &PtrsWrittenOnFwdingPath,
+ const SmallPtrSetImpl<Value *> &CandLoadPtrs) {
Value *Ptr1 =
LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
Value *Ptr2 =
@@ -384,11 +384,9 @@ public:
findPointersWrittenOnForwardingPath(Candidates);
// Collect the pointers of the candidate loads.
- // FIXME: SmallPtrSet does not work with std::inserter.
- std::set<Value *> CandLoadPtrs;
- transform(Candidates,
- std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
- std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+ SmallPtrSet<Value *, 4> CandLoadPtrs;
+ for (const auto &Candidate : Candidates)
+ CandLoadPtrs.insert(Candidate.getLoadPtr());
const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
SmallVector<RuntimePointerCheck, 4> Checks;
@@ -488,7 +486,6 @@ public:
// Filter the candidates further.
SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
- unsigned NumForwarding = 0;
for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
LLVM_DEBUG(dbgs() << "Candidate " << Cand);
@@ -508,12 +505,17 @@ public:
if (!Cand.isDependenceDistanceOfOne(PSE, L))
continue;
- ++NumForwarding;
+ assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
+ "Loading from something other than indvar?");
+ assert(
+ isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Store->getPointerOperand())) &&
+ "Storing to something other than indvar?");
+
+ Candidates.push_back(Cand);
LLVM_DEBUG(
dbgs()
- << NumForwarding
+ << Candidates.size()
<< ". Valid store-to-load forwarding across the loop backedge\n");
- Candidates.push_back(Cand);
}
if (Candidates.empty())
return false;
@@ -561,10 +563,19 @@ public:
// Point of no-return, start the transformation. First, version the loop
// if necessary.
- LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false);
- LV.setAliasChecks(std::move(Checks));
- LV.setSCEVChecks(LAI.getPSE().getUnionPredicate());
+ LoopVersioning LV(LAI, Checks, L, LI, DT, PSE.getSE());
LV.versionLoop();
+
+ // After versioning, some of the candidates' pointers could stop being
+ // SCEVAddRecs. We need to filter them out.
+ auto NoLongerGoodCandidate = [this](
+ const StoreToLoadForwardingCandidate &Cand) {
+ return !isa<SCEVAddRecExpr>(
+ PSE.getSCEV(Cand.Load->getPointerOperand())) ||
+ !isa<SCEVAddRecExpr>(
+ PSE.getSCEV(Cand.Store->getPointerOperand()));
+ };
+ llvm::erase_if(Candidates, NoLongerGoodCandidate);
}
// Next, propagate the value stored by the store to the users of the load.
@@ -573,7 +584,7 @@ public:
"storeforward");
for (const auto &Cand : Candidates)
propagateStoredValueToLoadUsers(Cand, SEE);
- NumLoopLoadEliminted += NumForwarding;
+ NumLoopLoadEliminted += Candidates.size();
return true;
}
@@ -599,6 +610,7 @@ private:
static bool
eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ ScalarEvolution *SE, AssumptionCache *AC,
function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
// Build up a worklist of inner-loops to transform to avoid iterator
// invalidation.
@@ -607,15 +619,21 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
// which merely optimizes the use of loads in a loop.
SmallVector<Loop *, 8> Worklist;
+ bool Changed = false;
+
for (Loop *TopLevelLoop : LI)
- for (Loop *L : depth_first(TopLevelLoop))
+ for (Loop *L : depth_first(TopLevelLoop)) {
+ Changed |= simplifyLoop(L, &DT, &LI, SE, AC, /*MSSAU*/ nullptr, false);
// We only handle inner-most loops.
- if (L->empty())
+ if (L->isInnermost())
Worklist.push_back(L);
+ }
// Now walk the identified inner loops.
- bool Changed = false;
for (Loop *L : Worklist) {
+ // Match historical behavior
+ if (!L->isRotatedForm() || !L->getExitingBlock())
+ continue;
// The actual work is performed by LoadEliminationForLoop.
LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
Changed |= LEL.processLoop();
@@ -649,7 +667,7 @@ public:
// Process each loop nest in the function.
return eliminateLoadsAcrossLoops(
- F, LI, DT, BFI, PSI,
+ F, LI, DT, BFI, PSI, /*SE*/ nullptr, /*AC*/ nullptr,
[&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}
@@ -706,8 +724,9 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
bool Changed = eliminateLoadsAcrossLoops(
- F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
+ F, LI, DT, BFI, PSI, &SE, &AC, [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
});
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 98889a9df116..3fe8e7259114 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -6,74 +6,113 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/TimeProfiler.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/TimeProfiler.h"
using namespace llvm;
-// Explicit template instantiations and specialization defininitions for core
-// template typedefs.
namespace llvm {
-template class PassManager<Loop, LoopAnalysisManager,
- LoopStandardAnalysisResults &, LPMUpdater &>;
/// Explicitly specialize the pass manager's run method to handle loop nest
/// structure updates.
-template <>
PreservedAnalyses
PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U) {
- PreservedAnalyses PA = PreservedAnalyses::all();
if (DebugLogging)
dbgs() << "Starting Loop pass manager run.\n";
+ // Runs loop-nest passes only when the current loop is a top-level one.
+ PreservedAnalyses PA = (L.isOutermost() && !LoopNestPasses.empty())
+ ? runWithLoopNestPasses(L, AM, AR, U)
+ : runWithoutLoopNestPasses(L, AM, AR, U);
+
+ // Invalidation for the current loop should be handled above, and other loop
+ // analysis results shouldn't be impacted by runs over this loop. Therefore,
+ // the remaining analysis results in the AnalysisManager are preserved. We
+ // mark this with a set so that we don't need to inspect each one
+ // individually.
+ // FIXME: This isn't correct! This loop and all nested loops' analyses should
+ // be preserved, but unrolling should invalidate the parent loop's analyses.
+ PA.preserveSet<AllAnalysesOn<Loop>>();
+
+ if (DebugLogging)
+ dbgs() << "Finished Loop pass manager run.\n";
+
+ return PA;
+}
+
+// Run both loop passes and loop-nest passes on top-level loop \p L.
+PreservedAnalyses
+LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ assert(L.isOutermost() &&
+ "Loop-nest passes should only run on top-level loops.");
+ PreservedAnalyses PA = PreservedAnalyses::all();
+
// Request PassInstrumentation from analysis manager, will use it to run
// instrumenting callbacks for the passes later.
PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(L, AR);
- for (auto &Pass : Passes) {
- // Check the PassInstrumentation's BeforePass callbacks before running the
- // pass, skip its execution completely if asked to (callback returns false).
- if (!PI.runBeforePass<Loop>(*Pass, L))
- continue;
- if (DebugLogging)
- dbgs() << "Running pass: " << Pass->name() << " on " << L;
+ unsigned LoopPassIndex = 0, LoopNestPassIndex = 0;
- PreservedAnalyses PassPA;
- {
- TimeTraceScope TimeScope(Pass->name(), L.getName());
- PassPA = Pass->run(L, AM, AR, U);
+ // `LoopNestPtr` points to the `LoopNest` object for the current top-level
+ // loop and `IsLoopNestPtrValid` indicates whether the pointer is still valid.
+ // The `LoopNest` object will have to be re-constructed if the pointer is
+ // invalid when encountering a loop-nest pass.
+ std::unique_ptr<LoopNest> LoopNestPtr;
+ bool IsLoopNestPtrValid = false;
+
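+  // The two pass vectors are interleaved according to `IsLoopNestPass`, which
+  // records the original insertion order: e.g. for the sequence {loop pass,
+  // loop-nest pass, loop pass} it holds {false, true, false}, and the two
+  // indices above walk `LoopPasses` and `LoopNestPasses` respectively.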
+ for (size_t I = 0, E = IsLoopNestPass.size(); I != E; ++I) {
+ Optional<PreservedAnalyses> PassPA;
+ if (!IsLoopNestPass[I]) {
+ // The `I`-th pass is a loop pass.
+ auto &Pass = LoopPasses[LoopPassIndex++];
+ PassPA = runSinglePass(L, Pass, AM, AR, U, PI);
+ } else {
+ // The `I`-th pass is a loop-nest pass.
+ auto &Pass = LoopNestPasses[LoopNestPassIndex++];
+
+ // If the loop-nest object calculated before is no longer valid,
+ // re-calculate it here before running the loop-nest pass.
+ if (!IsLoopNestPtrValid) {
+ LoopNestPtr = LoopNest::getLoopNest(L, AR.SE);
+ IsLoopNestPtrValid = true;
+ }
+ PassPA = runSinglePass(*LoopNestPtr, Pass, AM, AR, U, PI);
}
- // do not pass deleted Loop into the instrumentation
- if (U.skipCurrentLoop())
- PI.runAfterPassInvalidated<Loop>(*Pass);
- else
- PI.runAfterPass<Loop>(*Pass, L);
+    // `PassPA` being `None` means that the before-pass callbacks in
+    // `PassInstrumentation` returned false. The pass does not run in this
+    // case, so we can skip the following procedure.
+ if (!PassPA)
+ continue;
// If the loop was deleted, abort the run and return to the outer walk.
if (U.skipCurrentLoop()) {
- PA.intersect(std::move(PassPA));
+ PA.intersect(std::move(*PassPA));
break;
}
-#ifndef NDEBUG
- // Verify the loop structure and LCSSA form before visiting the loop.
- L.verifyLoop();
- assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) &&
- "Loops must remain in LCSSA form!");
-#endif
-
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
- AM.invalidate(L, PassPA);
+ AM.invalidate(L, *PassPA);
// Finally, we intersect the final preserved analyses to compute the
// aggregate preserved set for this pass manager.
- PA.intersect(std::move(PassPA));
+ PA.intersect(std::move(*PassPA));
+
+ // Check if the current pass preserved the loop-nest object or not.
+ IsLoopNestPtrValid &= PassPA->getChecker<LoopNestAnalysis>().preserved();
// FIXME: Historically, the pass managers all called the LLVM context's
// yield function here. We don't have a generic way to acquire the
@@ -81,21 +120,207 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
// in the new pass manager so it is currently omitted.
// ...getContext().yield();
}
+ return PA;
+}
- // Invalidation for the current loop should be handled above, and other loop
- // analysis results shouldn't be impacted by runs over this loop. Therefore,
- // the remaining analysis results in the AnalysisManager are preserved. We
- // mark this with a set so that we don't need to inspect each one
- // individually.
- // FIXME: This isn't correct! This loop and all nested loops' analyses should
- // be preserved, but unrolling should invalidate the parent loop's analyses.
- PA.preserveSet<AllAnalysesOn<Loop>>();
+// Run all loop passes on loop \p L. Loop-nest passes don't run, either because
+// \p L is not a top-level one or simply because there are no loop-nest passes
+// in the pass manager at all.
+PreservedAnalyses
+LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ PreservedAnalyses PA = PreservedAnalyses::all();
- if (DebugLogging)
- dbgs() << "Finished Loop pass manager run.\n";
+ // Request PassInstrumentation from analysis manager, will use it to run
+ // instrumenting callbacks for the passes later.
+ PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(L, AR);
+ for (auto &Pass : LoopPasses) {
+ Optional<PreservedAnalyses> PassPA = runSinglePass(L, Pass, AM, AR, U, PI);
+
+    // `PassPA` being `None` means that the before-pass callbacks in
+    // `PassInstrumentation` returned false. The pass does not run in this
+    // case, so we can skip the following procedure.
+ if (!PassPA)
+ continue;
+
+ // If the loop was deleted, abort the run and return to the outer walk.
+ if (U.skipCurrentLoop()) {
+ PA.intersect(std::move(*PassPA));
+ break;
+ }
+ // Update the analysis manager as each pass runs and potentially
+ // invalidates analyses.
+ AM.invalidate(L, *PassPA);
+
+ // Finally, we intersect the final preserved analyses to compute the
+ // aggregate preserved set for this pass manager.
+ PA.intersect(std::move(*PassPA));
+
+ // FIXME: Historically, the pass managers all called the LLVM context's
+ // yield function here. We don't have a generic way to acquire the
+ // context and it isn't yet clear what the right pattern is for yielding
+ // in the new pass manager so it is currently omitted.
+ // ...getContext().yield();
+ }
return PA;
}
+} // namespace llvm
+
+PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ // Before we even compute any loop analyses, first run a miniature function
+ // pass pipeline to put loops into their canonical form. Note that we can
+ // directly build up function analyses after this as the function pass
+ // manager handles all the invalidation at that layer.
+ PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(F);
+
+ PreservedAnalyses PA = PreservedAnalyses::all();
+ // Check the PassInstrumentation's BeforePass callbacks before running the
+ // canonicalization pipeline.
+ if (PI.runBeforePass<Function>(LoopCanonicalizationFPM, F)) {
+ PA = LoopCanonicalizationFPM.run(F, AM);
+ PI.runAfterPass<Function>(LoopCanonicalizationFPM, F, PA);
+ }
+
+ // Get the loop structure for this function
+ LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+
+ // If there are no loops, there is nothing to do here.
+ if (LI.empty())
+ return PA;
+
+ // Get the analysis results needed by loop passes.
+ MemorySSA *MSSA =
+ UseMemorySSA ? (&AM.getResult<MemorySSAAnalysis>(F).getMSSA()) : nullptr;
+ BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData()
+ ? (&AM.getResult<BlockFrequencyAnalysis>(F))
+ : nullptr;
+ LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F),
+ AM.getResult<AssumptionAnalysis>(F),
+ AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getResult<LoopAnalysis>(F),
+ AM.getResult<ScalarEvolutionAnalysis>(F),
+ AM.getResult<TargetLibraryAnalysis>(F),
+ AM.getResult<TargetIRAnalysis>(F),
+ BFI,
+ MSSA};
+
+ // Setup the loop analysis manager from its proxy. It is important that
+ // this is only done when there are loops to process and we have built the
+ // LoopStandardAnalysisResults object. The loop analyses cached in this
+ // manager have access to those analysis results and so it must invalidate
+ // itself when they go away.
+ auto &LAMFP = AM.getResult<LoopAnalysisManagerFunctionProxy>(F);
+ if (UseMemorySSA)
+ LAMFP.markMSSAUsed();
+ LoopAnalysisManager &LAM = LAMFP.getManager();
+
+ // A postorder worklist of loops to process.
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+
+ // Register the worklist and loop analysis manager so that loop passes can
+ // update them when they mutate the loop nest structure.
+ LPMUpdater Updater(Worklist, LAM, LoopNestMode);
+
+ // Add the loop nests in the reverse order of LoopInfo. See method
+ // declaration.
+ if (!LoopNestMode) {
+ appendLoopsToWorklist(LI, Worklist);
+ } else {
+ for (Loop *L : LI)
+ Worklist.insert(L);
+ }
+
+#ifndef NDEBUG
+ PI.pushBeforeNonSkippedPassCallback([&LAR, &LI](StringRef PassID, Any IR) {
+ if (isSpecialPass(PassID, {"PassManager"}))
+ return;
+ assert(any_isa<const Loop *>(IR) || any_isa<const LoopNest *>(IR));
+ const Loop *L = any_isa<const Loop *>(IR)
+ ? any_cast<const Loop *>(IR)
+ : &any_cast<const LoopNest *>(IR)->getOutermostLoop();
+ assert(L && "Loop should be valid for printing");
+
+ // Verify the loop structure and LCSSA form before visiting the loop.
+ L->verifyLoop();
+ assert(L->isRecursivelyLCSSAForm(LAR.DT, LI) &&
+ "Loops must remain in LCSSA form!");
+ });
+#endif
+
+ do {
+ Loop *L = Worklist.pop_back_val();
+ assert(!(LoopNestMode && L->getParentLoop()) &&
+ "L should be a top-level loop in loop-nest mode.");
+
+ // Reset the update structure for this loop.
+ Updater.CurrentL = L;
+ Updater.SkipCurrentLoop = false;
+
+#ifndef NDEBUG
+ // Save a parent loop pointer for asserts.
+ Updater.ParentL = L->getParentLoop();
+#endif
+ // Check the PassInstrumentation's BeforePass callbacks before running the
+ // pass, skip its execution completely if asked to (callback returns
+ // false).
+ if (!PI.runBeforePass<Loop>(*Pass, *L))
+ continue;
+
+ PreservedAnalyses PassPA;
+ {
+ TimeTraceScope TimeScope(Pass->name());
+ PassPA = Pass->run(*L, LAM, LAR, Updater);
+ }
+
+ // Do not pass deleted Loop into the instrumentation.
+ if (Updater.skipCurrentLoop())
+ PI.runAfterPassInvalidated<Loop>(*Pass, PassPA);
+ else
+ PI.runAfterPass<Loop>(*Pass, *L, PassPA);
+
+ // FIXME: We should verify the set of analyses relevant to Loop passes
+ // are preserved.
+
+ // If the loop hasn't been deleted, we need to handle invalidation here.
+ if (!Updater.skipCurrentLoop())
+ // We know that the loop pass couldn't have invalidated any other
+ // loop's analyses (that's the contract of a loop pass), so directly
+ // handle the loop analysis manager's invalidation here.
+ LAM.invalidate(*L, PassPA);
+
+ // Then intersect the preserved set so that invalidation of module
+ // analyses will eventually occur when the module pass completes.
+ PA.intersect(std::move(PassPA));
+ } while (!Worklist.empty());
+
+#ifndef NDEBUG
+ PI.popBeforeNonSkippedPassCallback();
+#endif
+
+ // By definition we preserve the proxy. We also preserve all analyses on
+ // Loops. This precludes *any* invalidation of loop analyses by the proxy,
+ // but that's OK because we've taken care to invalidate analyses in the
+ // loop analysis manager incrementally above.
+ PA.preserveSet<AllAnalysesOn<Loop>>();
+ PA.preserve<LoopAnalysisManagerFunctionProxy>();
+ // We also preserve the set of standard analyses.
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ if (UseBlockFrequencyInfo && F.hasProfileData())
+ PA.preserve<BlockFrequencyAnalysis>();
+ if (UseMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ // FIXME: What we really want to do here is preserve an AA category, but
+ // that concept doesn't exist yet.
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ return PA;
}
PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index edde22d6708f..4f97641e2027 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -362,7 +362,7 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
// For the new PM, we also can't use BranchProbabilityInfo as an analysis
// pass. Function analyses need to be preserved across loop transformations
// but BPI is not preserved, hence a newly built one is needed.
- BranchProbabilityInfo BPI(*F, AR.LI, &AR.TLI);
+ BranchProbabilityInfo BPI(*F, AR.LI, &AR.TLI, &AR.DT, nullptr);
LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI);
if (!LP.runOnLoop(&L))
return PreservedAnalyses::all();
@@ -439,8 +439,8 @@ static bool isSafeToTruncateWideIVType(const DataLayout &DL,
Type *RangeCheckType) {
if (!EnableIVTruncation)
return false;
- assert(DL.getTypeSizeInBits(LatchCheck.IV->getType()) >
- DL.getTypeSizeInBits(RangeCheckType) &&
+ assert(DL.getTypeSizeInBits(LatchCheck.IV->getType()).getFixedSize() >
+ DL.getTypeSizeInBits(RangeCheckType).getFixedSize() &&
"Expected latch check IV type to be larger than range check operand "
"type!");
// The start and end values of the IV should be known. This is to guarantee
@@ -454,13 +454,13 @@ static bool isSafeToTruncateWideIVType(const DataLayout &DL,
// LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
// IV wraps around, and the truncation of the IV would lose the range of
// iterations between 2^32 and 2^64.
- bool Increasing;
- if (!SE.isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing))
+ if (!SE.getMonotonicPredicateType(LatchCheck.IV, LatchCheck.Pred))
return false;
// The active bits should be less than the bits in the RangeCheckType. This
// guarantees that truncating the latch check to RangeCheckType is a safe
// operation.
- auto RangeCheckTypeBitSize = DL.getTypeSizeInBits(RangeCheckType);
+ auto RangeCheckTypeBitSize =
+ DL.getTypeSizeInBits(RangeCheckType).getFixedSize();
return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
}
@@ -477,7 +477,8 @@ static Optional<LoopICmp> generateLoopLatchCheck(const DataLayout &DL,
if (RangeCheckType == LatchType)
return LatchCheck;
// For now, bail out if latch type is narrower than range type.
- if (DL.getTypeSizeInBits(LatchType) < DL.getTypeSizeInBits(RangeCheckType))
+ if (DL.getTypeSizeInBits(LatchType).getFixedSize() <
+ DL.getTypeSizeInBits(RangeCheckType).getFixedSize())
return None;
if (!isSafeToTruncateWideIVType(DL, SE, LatchCheck, RangeCheckType))
return None;
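// --- Illustrative sketch, not part of the patch ---
// getTypeSizeInBits() returns a TypeSize rather than a plain integer, so the
// hunks above spell out the fixed-width assumption with getFixedSize(), which
// would assert on a scalable type. The same comparison in isolation, with a
// hypothetical helper name:
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

static bool latchTypeWiderThanRangeCheck(const llvm::DataLayout &DL,
                                         llvm::Type *LatchTy,
                                         llvm::Type *RangeCheckTy) {
  // Integer IV and range-check types have fixed sizes, so getFixedSize() is
  // safe here; comparing raw TypeSize values would be ill-defined for
  // scalable vectors.
  return DL.getTypeSizeInBits(LatchTy).getFixedSize() >
         DL.getTypeSizeInBits(RangeCheckTy).getFixedSize();
}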
diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index 3542d0a4ee73..b3bae47e96de 100644
--- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -50,6 +50,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopReroll.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -161,12 +162,12 @@ namespace {
IL_End
};
- class LoopReroll : public LoopPass {
+ class LoopRerollLegacyPass : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
- LoopReroll() : LoopPass(ID) {
- initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+ LoopRerollLegacyPass() : LoopPass(ID) {
+ initializeLoopRerollLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
@@ -175,6 +176,15 @@ namespace {
AU.addRequired<TargetLibraryInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
+ };
+
+ class LoopReroll {
+ public:
+ LoopReroll(AliasAnalysis *AA, LoopInfo *LI, ScalarEvolution *SE,
+ TargetLibraryInfo *TLI, DominatorTree *DT, bool PreserveLCSSA)
+ : AA(AA), LI(LI), SE(SE), TLI(TLI), DT(DT),
+ PreserveLCSSA(PreserveLCSSA) {}
+ bool runOnLoop(Loop *L);
protected:
AliasAnalysis *AA;
@@ -484,16 +494,16 @@ namespace {
} // end anonymous namespace
-char LoopReroll::ID = 0;
+char LoopRerollLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_PASS_BEGIN(LoopRerollLegacyPass, "loop-reroll", "Reroll loops",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_PASS_END(LoopRerollLegacyPass, "loop-reroll", "Reroll loops", false,
+ false)
-Pass *llvm::createLoopRerollPass() {
- return new LoopReroll;
-}
+Pass *llvm::createLoopRerollPass() { return new LoopRerollLegacyPass; }
// Returns true if the provided instruction is used outside the given loop.
// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
@@ -1086,7 +1096,7 @@ LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
UsesTy::iterator *StartI) {
UsesTy::iterator I = StartI ? *StartI : In.begin();
while (I != In.end() && (I->second.test(Val) == 0 ||
- Exclude.count(I->first) != 0))
+ Exclude.contains(I->first)))
++I;
return I;
}
@@ -1644,18 +1654,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
return true;
}
-bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
- return false;
-
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-
+bool LoopReroll::runOnLoop(Loop *L) {
BasicBlock *Header = L->getHeader();
LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
<< Header->getName() << " (" << L->getNumBlocks()
@@ -1704,3 +1703,26 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
return Changed;
}
+
+bool LoopRerollLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ return LoopReroll(AA, LI, SE, TLI, DT, PreserveLCSSA).runOnLoop(L);
+}
+
+PreservedAnalyses LoopRerollPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ return LoopReroll(&AR.AA, &AR.LI, &AR.SE, &AR.TLI, &AR.DT, true).runOnLoop(&L)
+ ? getLoopPassPreservedAnalyses()
+ : PreservedAnalyses::all();
+}
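// --- Illustrative sketch, not part of the patch ---
// With the rerolling logic factored into a plain LoopReroll class, both pass
// managers become thin wrappers. A hypothetical way to schedule the new-PM
// pass from C++ (roughly what a "-passes=loop(loop-reroll)" pipeline would
// set up, assuming the pass is registered under that name); the helper name
// buildRerollPipeline is an assumption:
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopReroll.h"

static llvm::FunctionPassManager buildRerollPipeline() {
  llvm::LoopPassManager LPM;
  LPM.addPass(llvm::LoopRerollPass());
  llvm::FunctionPassManager FPM;
  // Adapt the loop pipeline so it runs over every loop in a function.
  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM)));
  return FPM;
}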
diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index f92566ba77ce..ad1cfc68ece0 100644
--- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -12,6 +12,7 @@
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
@@ -33,22 +34,35 @@ static cl::opt<unsigned> DefaultRotationThreshold(
"rotation-max-header-size", cl::init(16), cl::Hidden,
cl::desc("The default maximum header size for automatic loop rotation"));
-LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
- : EnableHeaderDuplication(EnableHeaderDuplication) {}
+static cl::opt<bool> PrepareForLTOOption(
+ "rotation-prepare-for-lto", cl::init(false), cl::Hidden,
+ cl::desc("Run loop-rotation in the prepare-for-lto stage. This option "
+ "should be used for testing only."));
+
+LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO)
+ : EnableHeaderDuplication(EnableHeaderDuplication),
+ PrepareForLTO(PrepareForLTO) {}
PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
+ // Vectorization requires loop-rotation. Use default threshold for loops the
+ // user explicitly marked for vectorization, even when header duplication is
+ // disabled.
+ int Threshold = EnableHeaderDuplication ||
+ hasVectorizeTransformation(&L) == TM_ForcedByUser
+ ? DefaultRotationThreshold
+ : 0;
const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
Optional<MemorySSAUpdater> MSSAU;
if (AR.MSSA)
MSSAU = MemorySSAUpdater(AR.MSSA);
- bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
- SQ, false, Threshold, false);
+ bool Changed =
+ LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false,
+ Threshold, false, PrepareForLTO || PrepareForLTOOption);
if (!Changed)
return PreservedAnalyses::all();
@@ -66,10 +80,13 @@ namespace {
class LoopRotateLegacyPass : public LoopPass {
unsigned MaxHeaderSize;
+ bool PrepareForLTO;
public:
static char ID; // Pass ID, replacement for typeid
- LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+ LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1,
+ bool PrepareForLTO = false)
+ : LoopPass(ID), PrepareForLTO(PrepareForLTO) {
initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
if (SpecifiedMaxHeaderSize == -1)
MaxHeaderSize = DefaultRotationThreshold;
@@ -105,9 +122,17 @@ public:
if (MSSAA)
MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
}
+ // Vectorization requires loop-rotation. Use default threshold for loops the
+ // user explicitly marked for vectorization, even when header duplication is
+ // disabled.
+ int Threshold = hasVectorizeTransformation(L) == TM_ForcedByUser
+ ? DefaultRotationThreshold
+ : MaxHeaderSize;
+
return LoopRotation(L, LI, TTI, AC, &DT, &SE,
MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
- false, MaxHeaderSize, false);
+ false, Threshold, false,
+ PrepareForLTO || PrepareForLTOOption);
}
};
} // end namespace
@@ -122,6 +147,6 @@ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
false)
-Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
- return new LoopRotateLegacyPass(MaxHeaderSize);
+Pass *llvm::createLoopRotatePass(int MaxHeaderSize, bool PrepareForLTO) {
+ return new LoopRotateLegacyPass(MaxHeaderSize, PrepareForLTO);
}
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 031e5b9c1d2c..cc6d11220807 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -16,7 +16,6 @@
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/DependenceAnalysis.h"
@@ -366,15 +365,20 @@ private:
unsigned DummyIdx = 1;
for (BasicBlock *BB : DeadExitBlocks) {
- SmallVector<Instruction *, 4> DeadPhis;
+ // Eliminate all Phis and LandingPads from dead exits.
+ // TODO: Consider removing all instructions in this dead block.
+ SmallVector<Instruction *, 4> DeadInstructions;
for (auto &PN : BB->phis())
- DeadPhis.push_back(&PN);
+ DeadInstructions.push_back(&PN);
- // Eliminate all Phis from dead exits.
- for (Instruction *PN : DeadPhis) {
- PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
- PN->eraseFromParent();
+ if (auto *LandingPad = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()))
+ DeadInstructions.emplace_back(LandingPad);
+
+ for (Instruction *I : DeadInstructions) {
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
}
+
assert(DummyIdx != 0 && "Too many dead exits!");
DummySwitch->addCase(Builder.getInt32(DummyIdx++), BB);
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
@@ -410,9 +414,10 @@ private:
FixLCSSALoop = FixLCSSALoop->getParentLoop();
assert(FixLCSSALoop && "Should be a loop!");
// We need all DT updates to be done before forming LCSSA.
- DTU.applyUpdates(DTUpdates);
if (MSSAU)
- MSSAU->applyUpdates(DTUpdates, DT);
+ MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
+ else
+ DTU.applyUpdates(DTUpdates);
DTUpdates.clear();
formLCSSARecursively(*FixLCSSALoop, DT, &LI, &SE);
}
@@ -420,8 +425,7 @@ private:
if (MSSAU) {
// Clear all updates now. Facilitates deletes that follow.
- DTU.applyUpdates(DTUpdates);
- MSSAU->applyUpdates(DTUpdates, DT);
+ MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
DTUpdates.clear();
if (VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
@@ -447,7 +451,7 @@ private:
if (LI.isLoopHeader(BB)) {
assert(LI.getLoopFor(BB) != &L && "Attempt to remove current loop!");
Loop *DL = LI.getLoopFor(BB);
- if (DL->getParentLoop()) {
+ if (!DL->isOutermost()) {
for (auto *PL = DL->getParentLoop(); PL; PL = PL->getParentLoop())
for (auto *BB : DL->getBlocks())
PL->removeBlockFromLoop(BB);
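// --- Illustrative sketch, not part of the patch ---
// The hunks above route CFG updates through the MemorySSAUpdater when
// MemorySSA is live, letting it update the DominatorTree in the same call,
// and fall back to the DomTreeUpdater otherwise. The shape of that pattern,
// as a hypothetical helper:
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/MemorySSAUpdater.h"

static void applyCFGUpdates(llvm::ArrayRef<llvm::DominatorTree::UpdateType> Updates,
                            llvm::DomTreeUpdater &DTU, llvm::DominatorTree &DT,
                            llvm::MemorySSAUpdater *MSSAU) {
  if (MSSAU)
    // Apply the same edge updates to MemorySSA and the DominatorTree together.
    MSSAU->applyUpdates(Updates, DT, /*UpdateDT=*/true);
  else
    DTU.applyUpdates(Updates);
}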
diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp
index 1c03a4bf6c02..47698fdde69f 100644
--- a/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -39,6 +39,8 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/IR/Dominators.h"
@@ -67,6 +69,14 @@ static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
"max-uses-for-sinking", cl::Hidden, cl::init(30),
cl::desc("Do not sink instructions that have too many uses."));
+static cl::opt<bool> EnableMSSAInLoopSink(
+ "enable-mssa-in-loop-sink", cl::Hidden, cl::init(true),
+ cl::desc("Enable MemorySSA for LoopSink in new pass manager"));
+
+static cl::opt<bool> EnableMSSAInLegacyLoopSink(
+ "enable-mssa-in-legacy-loop-sink", cl::Hidden, cl::init(false),
+ cl::desc("Enable MemorySSA for LoopSink in legacy pass manager"));
+
/// Return adjusted total frequency of \p BBs.
///
/// * If there is only one BB, sinking instruction will not introduce code
@@ -172,11 +182,10 @@ findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
// sinking is successful.
// \p LoopBlockNumber is used to sort the insertion blocks to ensure
// determinism.
-static bool sinkInstruction(Loop &L, Instruction &I,
- const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
- const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber,
- LoopInfo &LI, DominatorTree &DT,
- BlockFrequencyInfo &BFI) {
+static bool sinkInstruction(
+ Loop &L, Instruction &I, const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, LoopInfo &LI,
+ DominatorTree &DT, BlockFrequencyInfo &BFI, MemorySSAUpdater *MSSAU) {
// Compute the set of blocks in loop L which contain a use of I.
SmallPtrSet<BasicBlock *, 2> BBs;
for (auto &U : I.uses()) {
@@ -213,8 +222,7 @@ static bool sinkInstruction(Loop &L, Instruction &I,
// of the loop block numbers as iterating the set doesn't give a useful
// order. No need to stable sort as the block numbers are a total ordering.
SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
- SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
- BBsToSinkInto.end());
+ llvm::append_range(SortedBBsToSinkInto, BBsToSinkInto);
llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) {
return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
});
@@ -230,6 +238,21 @@ static bool sinkInstruction(Loop &L, Instruction &I,
Instruction *IC = I.clone();
IC->setName(I.getName());
IC->insertBefore(&*N->getFirstInsertionPt());
+
+ if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
+ // Create a new MemoryAccess and let MemorySSA set its defining access.
+ MemoryAccess *NewMemAcc =
+ MSSAU->createMemoryAccessInBB(IC, nullptr, N, MemorySSA::Beginning);
+ if (NewMemAcc) {
+ if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
+ MSSAU->insertDef(MemDef, /*RenameUses=*/true);
+ else {
+ auto *MemUse = cast<MemoryUse>(NewMemAcc);
+ MSSAU->insertUse(MemUse, /*RenameUses=*/true);
+ }
+ }
+ }
+
// Replaces uses of I with IC in N
I.replaceUsesWithIf(IC, [N](Use &U) {
return cast<Instruction>(U.getUser())->getParent() == N;
@@ -244,6 +267,11 @@ static bool sinkInstruction(Loop &L, Instruction &I,
NumLoopSunk++;
I.moveBefore(&*MoveBB->getFirstInsertionPt());
+ if (MSSAU)
+ if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
+ MSSAU->getMemorySSA()->getMemoryAccess(&I)))
+ MSSAU->moveToPlace(OldMemAcc, MoveBB, MemorySSA::Beginning);
+
return true;
}
@@ -252,15 +280,14 @@ static bool sinkInstruction(Loop &L, Instruction &I,
static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
DominatorTree &DT,
BlockFrequencyInfo &BFI,
- ScalarEvolution *SE) {
+ ScalarEvolution *SE,
+ AliasSetTracker *CurAST,
+ MemorySSA *MSSA) {
BasicBlock *Preheader = L.getLoopPreheader();
- if (!Preheader)
- return false;
+ assert(Preheader && "Expected loop to have preheader");
- // Enable LoopSink only when runtime profile is available.
- // With static profile, the sinking decision may be sub-optimal.
- if (!Preheader->getParent()->hasProfileData())
- return false;
+ assert(Preheader->getParent()->hasProfileData() &&
+ "Unexpected call when profile data unavailable.");
const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
// If there are no basic blocks with lower frequency than the preheader then
@@ -271,13 +298,15 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
}))
return false;
- bool Changed = false;
- AliasSetTracker CurAST(AA);
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ std::unique_ptr<SinkAndHoistLICMFlags> LICMFlags;
+ if (MSSA) {
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ LICMFlags =
+ std::make_unique<SinkAndHoistLICMFlags>(/*IsSink=*/true, &L, MSSA);
+ }
- // Compute alias set.
- for (BasicBlock *BB : L.blocks())
- CurAST.add(*BB);
- CurAST.add(*Preheader);
+ bool Changed = false;
// Sort loop's basic blocks by frequency
SmallVector<BasicBlock *, 10> ColdLoopBBs;
@@ -300,9 +329,11 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
// No need to check for instruction's operands are loop invariant.
assert(L.hasLoopInvariantOperands(I) &&
"Insts in a loop's preheader should have loop invariant operands!");
- if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr, false))
+ if (!canSinkOrHoistInst(*I, &AA, &DT, &L, CurAST, MSSAU.get(), false,
+ LICMFlags.get()))
continue;
- if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI))
+ if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI,
+ MSSAU.get()))
Changed = true;
}
@@ -311,6 +342,13 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
return Changed;
}
+static void computeAliasSet(Loop &L, BasicBlock &Preheader,
+ AliasSetTracker &CurAST) {
+ for (BasicBlock *BB : L.blocks())
+ CurAST.add(*BB);
+ CurAST.add(Preheader);
+}
+
PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
// Nothing to do if there are no loops.
@@ -321,6 +359,10 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ MemorySSA *MSSA = EnableMSSAInLoopSink
+ ? &FAM.getResult<MemorySSAAnalysis>(F).getMSSA()
+ : nullptr;
+
// We want to do a postorder walk over the loops. Since loops are a tree this
// is equivalent to a reversed preorder walk and preorder is easy to compute
// without recursion. Since we reverse the preorder, we will visit siblings
@@ -332,11 +374,27 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
do {
Loop &L = *PreorderLoops.pop_back_val();
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader)
+ continue;
+
+ // Enable LoopSink only when runtime profile is available.
+ // With static profile, the sinking decision may be sub-optimal.
+ if (!Preheader->getParent()->hasProfileData())
+ continue;
+
+ std::unique_ptr<AliasSetTracker> CurAST;
+ if (!EnableMSSAInLoopSink) {
+ CurAST = std::make_unique<AliasSetTracker>(AA);
+ computeAliasSet(L, *Preheader, *CurAST.get());
+ }
+
// Note that we don't pass SCEV here because it is only used to invalidate
// loops in SCEV and we don't preserve (or request) SCEV at all making that
// unnecessary.
Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
- /*ScalarEvolution*/ nullptr);
+ /*ScalarEvolution*/ nullptr,
+ CurAST.get(), MSSA);
} while (!PreorderLoops.empty());
if (!Changed)
@@ -344,6 +402,14 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
+
+ if (MSSA) {
+ PA.preserve<MemorySSAAnalysis>();
+
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+
return PA;
}
@@ -358,19 +424,46 @@ struct LegacyLoopSinkPass : public LoopPass {
if (skipLoop(L))
return false;
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Enable LoopSink only when runtime profile is available.
+ // With static profile, the sinking decision may be sub-optimal.
+ if (!Preheader->getParent()->hasProfileData())
+ return false;
+
+ AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- return sinkLoopInvariantInstructions(
- *L, getAnalysis<AAResultsWrapperPass>().getAAResults(),
- getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ std::unique_ptr<AliasSetTracker> CurAST;
+ MemorySSA *MSSA = nullptr;
+ if (EnableMSSAInLegacyLoopSink)
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ else {
+ CurAST = std::make_unique<AliasSetTracker>(AA);
+ computeAliasSet(*L, *Preheader, *CurAST.get());
+ }
+
+ bool Changed = sinkLoopInvariantInstructions(
+ *L, AA, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
- SE ? &SE->getSE() : nullptr);
+ SE ? &SE->getSE() : nullptr, CurAST.get(), MSSA);
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ return Changed;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<BlockFrequencyInfoWrapperPass>();
getLoopAnalysisUsage(AU);
+ if (EnableMSSAInLegacyLoopSink) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
}
};
}
@@ -380,6 +473,7 @@ INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
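// --- Illustrative sketch, not part of the patch ---
// When LoopSink clones an instruction into a colder block with MemorySSA
// enabled, the clone needs its own MemoryAccess, as the hunks above do. A
// condensed, self-contained version of that bookkeeping; cloneWithMemorySSA
// is an assumed helper name:
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/IR/Instructions.h"

static llvm::Instruction *cloneWithMemorySSA(llvm::Instruction &I,
                                             llvm::BasicBlock *Dest,
                                             llvm::MemorySSAUpdater *MSSAU) {
  llvm::Instruction *IC = I.clone();
  IC->setName(I.getName());
  IC->insertBefore(&*Dest->getFirstInsertionPt());
  if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
    // Create a new access and let MemorySSA compute its defining access.
    if (llvm::MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
            IC, nullptr, Dest, llvm::MemorySSA::Beginning)) {
      if (auto *MemDef = llvm::dyn_cast<llvm::MemoryDef>(NewMemAcc))
        MSSAU->insertDef(MemDef, /*RenameUses=*/true);
      else
        MSSAU->insertUse(llvm::cast<llvm::MemoryUse>(NewMemAcc),
                         /*RenameUses=*/true);
    }
  }
  return IC;
}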
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index cf02ef1e83f3..5dec9b542076 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -75,11 +75,13 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
@@ -422,7 +424,7 @@ static void DoInitialMatch(const SCEV *S, Loop *L,
// Handle a multiplication by -1 (negation) if it didn't fold.
if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
if (Mul->getOperand(0)->isAllOnesValue()) {
- SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
+ SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
const SCEV *NewMul = SE.getMulExpr(Ops);
SmallVector<const SCEV *, 4> MyGood;
@@ -483,11 +485,10 @@ bool Formula::isCanonical(const Loop &L) const {
// If ScaledReg is not a recurrent expr, or it is but its loop is not current
// loop, meanwhile BaseRegs contains a recurrent expr reg related with current
// loop, we want to swap the reg in BaseRegs with ScaledReg.
- auto I =
- find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
- return isa<const SCEVAddRecExpr>(S) &&
- (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
- });
+ auto I = find_if(BaseRegs, [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
return I == BaseRegs.end();
}
@@ -506,8 +507,7 @@ void Formula::canonicalize(const Loop &L) {
// Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
if (!ScaledReg) {
- ScaledReg = BaseRegs.back();
- BaseRegs.pop_back();
+ ScaledReg = BaseRegs.pop_back_val();
Scale = 1;
}
@@ -516,11 +516,10 @@ void Formula::canonicalize(const Loop &L) {
// reg with ScaledReg.
const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
if (!SAR || SAR->getLoop() != &L) {
- auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
- [&](const SCEV *S) {
- return isa<const SCEVAddRecExpr>(S) &&
- (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
- });
+ auto I = find_if(BaseRegs, [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
if (I != BaseRegs.end())
std::swap(ScaledReg, *I);
}
@@ -753,13 +752,13 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
return C->getValue()->getSExtValue();
}
} else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+ SmallVector<const SCEV *, 8> NewOps(Add->operands());
int64_t Result = ExtractImmediate(NewOps.front(), SE);
if (Result != 0)
S = SE.getAddExpr(NewOps);
return Result;
} else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+ SmallVector<const SCEV *, 8> NewOps(AR->operands());
int64_t Result = ExtractImmediate(NewOps.front(), SE);
if (Result != 0)
S = SE.getAddRecExpr(NewOps, AR->getLoop(),
@@ -779,13 +778,13 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
return GV;
}
} else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+ SmallVector<const SCEV *, 8> NewOps(Add->operands());
GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
if (Result)
S = SE.getAddExpr(NewOps);
return Result;
} else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+ SmallVector<const SCEV *, 8> NewOps(AR->operands());
GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
if (Result)
S = SE.getAddRecExpr(NewOps, AR->getLoop(),
@@ -935,6 +934,8 @@ static bool isHighCostExpansion(const SCEV *S,
case scSignExtend:
return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
Processed, SE);
+ default:
+ break;
}
if (!Processed.insert(S).second)
@@ -1210,7 +1211,7 @@ static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
return 0;
if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
return getSetupCost(S->getStart(), Depth - 1);
- if (auto S = dyn_cast<SCEVCastExpr>(Reg))
+ if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
return getSetupCost(S->getOperand(), Depth - 1);
if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
return std::accumulate(S->op_begin(), S->op_end(), 0,
@@ -2786,6 +2787,7 @@ static const SCEV *getExprBase(const SCEV *S) {
case scAddRecExpr:
return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
}
+ llvm_unreachable("Unknown SCEV kind!");
}
/// Return true if the chain increment is profitable to expand into a loop
@@ -2861,7 +2863,6 @@ static bool isProfitableChain(IVChain &Chain,
for (const IVInc &Inc : Chain) {
if (TTI.isProfitableLSRChainElement(Inc.UserInst))
return true;
-
if (Inc.IncExpr->isZero())
continue;
@@ -3401,7 +3402,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
Worklist.append(N->op_begin(), N->op_end());
- else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
+ else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
Worklist.push_back(C->getOperand());
else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
Worklist.push_back(D->getLHS());
@@ -3834,10 +3835,14 @@ void LSRInstance::GenerateConstantOffsetsImpl(
F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
return;
- if (IsScaledReg)
+ if (IsScaledReg) {
F.ScaledReg = G;
- else
+ } else {
F.BaseRegs[Idx] = G;
+      // We may generate a non-canonical Formula if G is a recurrent expr reg
+      // related to the current loop while F.ScaledReg is not.
+ F.canonicalize(*L);
+ }
(void)InsertFormula(LU, LUIdx, F);
}
@@ -5378,10 +5383,11 @@ void LSRInstance::RewriteForPHI(
// Split the critical edge.
BasicBlock *NewBB = nullptr;
if (!Parent->isLandingPad()) {
- NewBB = SplitCriticalEdge(BB, Parent,
- CriticalEdgeSplittingOptions(&DT, &LI)
- .setMergeIdenticalEdges()
- .setKeepOneInputPHIs());
+ NewBB =
+ SplitCriticalEdge(BB, Parent,
+ CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
+ .setMergeIdenticalEdges()
+ .setKeepOneInputPHIs());
} else {
SmallVector<BasicBlock*, 2> NewBBs;
SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
@@ -5514,8 +5520,8 @@ void LSRInstance::ImplementSolution(
// we can remove them after we are done working.
SmallVector<WeakTrackingVH, 16> DeadInsts;
- SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
- "lsr");
+ SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr",
+ false);
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
@@ -5614,13 +5620,19 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
if (IU.empty()) return;
// Skip nested loops until we can model them better with formulae.
- if (!L->empty()) {
+ if (!L->isInnermost()) {
LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
return;
}
// Start collecting data and preparing for the solver.
- CollectChains();
+  // If the number of registers is not the major cost, we cannot benefit from
+  // the current profitable chain optimization, which is based on the number
+  // of registers.
+  // FIXME: add profitable chain optimization for other kinds of major cost,
+  // for example the number of instructions.
+ if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
+ CollectChains();
CollectInterestingTypesAndFactors();
CollectFixupsAndInitialFormulae();
CollectLoopInvariantFixupsAndFormulae();
@@ -5760,6 +5772,63 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MemorySSAWrapperPass>();
}
+using EqualValues = SmallVector<std::tuple<WeakVH, int64_t, DIExpression *>, 4>;
+using EqualValuesMap = DenseMap<DbgValueInst *, EqualValues>;
+
+static void DbgGatherEqualValues(Loop *L, ScalarEvolution &SE,
+ EqualValuesMap &DbgValueToEqualSet) {
+ for (auto &B : L->getBlocks()) {
+ for (auto &I : *B) {
+ auto DVI = dyn_cast<DbgValueInst>(&I);
+ if (!DVI)
+ continue;
+ auto V = DVI->getVariableLocation();
+ if (!V || !SE.isSCEVable(V->getType()))
+ continue;
+ auto DbgValueSCEV = SE.getSCEV(V);
+ EqualValues EqSet;
+ for (PHINode &Phi : L->getHeader()->phis()) {
+ if (V->getType() != Phi.getType())
+ continue;
+ if (!SE.isSCEVable(Phi.getType()))
+ continue;
+ auto PhiSCEV = SE.getSCEV(&Phi);
+ Optional<APInt> Offset =
+ SE.computeConstantDifference(DbgValueSCEV, PhiSCEV);
+ if (Offset && Offset->getMinSignedBits() <= 64)
+ EqSet.emplace_back(std::make_tuple(
+ &Phi, Offset.getValue().getSExtValue(), DVI->getExpression()));
+ }
+ DbgValueToEqualSet[DVI] = std::move(EqSet);
+ }
+ }
+}
+
+static void DbgApplyEqualValues(EqualValuesMap &DbgValueToEqualSet) {
+ for (auto A : DbgValueToEqualSet) {
+ auto DVI = A.first;
+ // Only update those that are now undef.
+ if (!isa_and_nonnull<UndefValue>(DVI->getVariableLocation()))
+ continue;
+ for (auto EV : A.second) {
+ auto V = std::get<WeakVH>(EV);
+ if (!V)
+ continue;
+ auto DbgDIExpr = std::get<DIExpression *>(EV);
+ auto Offset = std::get<int64_t>(EV);
+ auto &Ctx = DVI->getContext();
+ DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V)));
+ if (Offset) {
+ SmallVector<uint64_t, 8> Ops;
+ DIExpression::appendOffset(Ops, Offset);
+ DbgDIExpr = DIExpression::prependOpcodes(DbgDIExpr, Ops, true);
+ }
+ DVI->setOperand(2, MetadataAsValue::get(Ctx, DbgDIExpr));
+ break;
+ }
+ }
+}
+
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
const TargetTransformInfo &TTI,
@@ -5775,12 +5844,17 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
Changed |=
LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
+ // Debug preservation - before we start removing anything create equivalence
+ // sets for the llvm.dbg.value intrinsics.
+ EqualValuesMap DbgValueToEqualSet;
+ DbgGatherEqualValues(L, SE, DbgValueToEqualSet);
+
// Remove any extra phis created by processing inner loops.
Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
if (EnablePhiElim && L->isLoopSimplifyForm()) {
SmallVector<WeakTrackingVH, 16> DeadInsts;
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- SCEVExpander Rewriter(SE, DL, "lsr");
+ SCEVExpander Rewriter(SE, DL, "lsr", false);
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
@@ -5792,6 +5866,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
}
}
+
+ DbgApplyEqualValues(DbgValueToEqualSet);
+
return Changed;
}
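// --- Illustrative sketch, not part of the patch ---
// Core of the debug-value preservation added above: if a dbg.value operand V
// and a loop-header PHI differ by a small compile-time constant, the
// dbg.value can later be rewritten as "PHI + Offset" once V is deleted. A
// condensed sketch; describeAsPhiPlusOffset is an assumed helper name.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/IntrinsicInst.h"

// Returns the rewritten expression, or nullptr if V and Phi do not differ by
// a constant that fits in 64 bits.
static llvm::DIExpression *describeAsPhiPlusOffset(llvm::ScalarEvolution &SE,
                                                   llvm::Value *V,
                                                   llvm::PHINode &Phi,
                                                   llvm::DbgValueInst *DVI) {
  if (V->getType() != Phi.getType() || !SE.isSCEVable(V->getType()))
    return nullptr;
  llvm::Optional<llvm::APInt> Diff =
      SE.computeConstantDifference(SE.getSCEV(V), SE.getSCEV(&Phi));
  if (!Diff || Diff->getMinSignedBits() > 64)
    return nullptr;
  // Prepend "plus Offset" to the existing expression; the caller would then
  // point the dbg.value at Phi instead of the deleted value.
  llvm::SmallVector<uint64_t, 8> Ops;
  llvm::DIExpression::appendOffset(Ops, Diff->getSExtValue());
  return llvm::DIExpression::prependOpcodes(DVI->getExpression(), Ops,
                                            /*StackValue=*/true);
}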
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 285cba6ee205..495906e1a763 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -41,6 +41,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -287,6 +288,13 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
None, None, None, None, None);
TargetTransformInfo::PeelingPreferences PP =
gatherPeelingPreferences(L, SE, TTI, None, None);
+
+ TransformationMode EnableMode = hasUnrollAndJamTransformation(L);
+ if (EnableMode & TM_Disable)
+ return LoopUnrollResult::Unmodified;
+ if (EnableMode & TM_ForcedByUser)
+ UP.UnrollAndJam = true;
+
if (AllowUnrollAndJam.getNumOccurrences() > 0)
UP.UnrollAndJam = AllowUnrollAndJam;
if (UnrollAndJamThreshold.getNumOccurrences() > 0)
@@ -299,10 +307,6 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
<< L->getHeader()->getParent()->getName() << "] Loop %"
<< L->getHeader()->getName() << "\n");
- TransformationMode EnableMode = hasUnrollAndJamTransformation(L);
- if (EnableMode & TM_Disable)
- return LoopUnrollResult::Unmodified;
-
// A loop with any unroll pragma (enabling/disabling/count/etc) is left for
// the unroller, so long as it does not explicitly have unroll_and_jam
// metadata. This means #pragma nounroll will disable unroll and jam as well
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 87f40bb7ba85..1b974576a3cc 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -56,6 +56,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
@@ -75,13 +76,19 @@ using namespace llvm;
cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
"forget-scev-loop-unroll", cl::init(false), cl::Hidden,
cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
- " the current top-most loop. This is somtimes preferred to reduce"
+ " the current top-most loop. This is sometimes preferred to reduce"
" compile time."));
static cl::opt<unsigned>
UnrollThreshold("unroll-threshold", cl::Hidden,
cl::desc("The cost threshold for loop unrolling"));
+static cl::opt<unsigned>
+ UnrollOptSizeThreshold(
+ "unroll-optsize-threshold", cl::init(0), cl::Hidden,
+ cl::desc("The cost threshold for loop unrolling when optimizing for "
+ "size"));
+
static cl::opt<unsigned> UnrollPartialThreshold(
"unroll-partial-threshold", cl::Hidden,
cl::desc("The cost threshold for partial loop unrolling"));
@@ -115,10 +122,6 @@ static cl::opt<unsigned> UnrollFullMaxCount(
cl::desc(
"Set the max unroll count for full unrolling, for testing purposes"));
-static cl::opt<unsigned> UnrollPeelCount(
- "unroll-peel-count", cl::Hidden,
- cl::desc("Set the unroll peeling count, for testing purposes"));
-
static cl::opt<bool>
UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
cl::desc("Allows loops to be partially unrolled until "
@@ -149,15 +152,6 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
"threshold, the loop is considered as flat and will be less "
"aggressively unrolled."));
-static cl::opt<bool>
- UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
- cl::desc("Allows loops to be peeled when the dynamic "
- "trip count is known to be low."));
-
-static cl::opt<bool> UnrollAllowLoopNestsPeeling(
- "unroll-allow-loop-nests-peeling", cl::init(false), cl::Hidden,
- cl::desc("Allows loop nests to be peeled."));
-
static cl::opt<bool> UnrollUnrollRemainder(
"unroll-remainder", cl::Hidden,
cl::desc("Allow the loop remainder to be unrolled."));
@@ -200,9 +194,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.Threshold =
OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
UP.MaxPercentThresholdBoost = 400;
- UP.OptSizeThreshold = 0;
+ UP.OptSizeThreshold = UnrollOptSizeThreshold;
UP.PartialThreshold = 150;
- UP.PartialOptSizeThreshold = 0;
+ UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
UP.Count = 0;
UP.DefaultUnrollRuntimeCount = 8;
UP.MaxCount = std::numeric_limits<unsigned>::max();
@@ -224,8 +218,10 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
// Apply size attributes
bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
- PGSOQueryType::IRPass);
+ // Let unroll hints / pragmas take precedence over PGSO.
+ (hasUnrollTransformation(L) != TM_ForcedByUser &&
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass));
if (OptForSize) {
UP.Threshold = UP.OptSizeThreshold;
UP.PartialThreshold = UP.PartialOptSizeThreshold;
@@ -275,39 +271,6 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
return UP;
}
-TargetTransformInfo::PeelingPreferences
-llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
- const TargetTransformInfo &TTI,
- Optional<bool> UserAllowPeeling,
- Optional<bool> UserAllowProfileBasedPeeling) {
- TargetTransformInfo::PeelingPreferences PP;
-
- // Default values
- PP.PeelCount = 0;
- PP.AllowPeeling = true;
- PP.AllowLoopNestsPeeling = false;
- PP.PeelProfiledIterations = true;
-
- // Get Target Specifc Values
- TTI.getPeelingPreferences(L, SE, PP);
-
- // User Specified Values using cl::opt
- if (UnrollPeelCount.getNumOccurrences() > 0)
- PP.PeelCount = UnrollPeelCount;
- if (UnrollAllowPeeling.getNumOccurrences() > 0)
- PP.AllowPeeling = UnrollAllowPeeling;
- if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
- PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
-
- // User Specifed values provided by argument
- if (UserAllowPeeling.hasValue())
- PP.AllowPeeling = *UserAllowPeeling;
- if (UserAllowProfileBasedPeeling.hasValue())
- PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
-
- return PP;
-}
-
namespace {
/// A struct to densely store the state of an instruction after unrolling at
@@ -384,7 +347,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
// Only analyze inner loops. We can't properly estimate cost of nested loops
// and we won't visit inner loops again anyway.
- if (!L->empty())
+ if (!L->isInnermost())
return None;
// Don't simulate loops with a big or unknown tripcount
@@ -426,6 +389,10 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
assert(CostWorklist.empty() && "Must start with an empty cost list");
assert(PHIUsedList.empty() && "Must start with an empty phi used list");
CostWorklist.push_back(&RootI);
+ TargetTransformInfo::TargetCostKind CostKind =
+ RootI.getFunction()->hasMinSize() ?
+ TargetTransformInfo::TCK_CodeSize :
+ TargetTransformInfo::TCK_SizeAndLatency;
for (;; --Iteration) {
do {
Instruction *I = CostWorklist.pop_back_val();
@@ -466,7 +433,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
// First accumulate the cost of this instruction.
if (!Cost.IsFree) {
- UnrolledCost += TTI.getUserCost(I, TargetTransformInfo::TCK_CodeSize);
+ UnrolledCost += TTI.getUserCost(I, CostKind);
LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
<< Iteration << "): ");
LLVM_DEBUG(I->dump());
@@ -506,6 +473,9 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+ TargetTransformInfo::TargetCostKind CostKind =
+ L->getHeader()->getParent()->hasMinSize() ?
+ TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
// Simulate execution of each iteration of the loop counting instructions,
// which would be simplified.
// Since the same load will take different values on different iterations,
@@ -559,7 +529,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
// Track this instruction's expected baseline cost when executing the
// rolled loop form.
- RolledDynamicCost += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
+ RolledDynamicCost += TTI.getUserCost(&I, CostKind);
// Visit the instruction to analyze its loop cost after unrolling,
// and if the visitor returns true, mark the instruction as free after
@@ -881,7 +851,7 @@ bool llvm::computeUnrollCount(
}
// 4th priority is loop peeling.
- computePeelCount(L, LoopSize, UP, PP, TripCount, SE);
+ computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold);
if (PP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
@@ -1073,7 +1043,7 @@ static LoopUnrollResult tryToUnrollLoop(
return LoopUnrollResult::Unmodified;
}
- // When automtatic unrolling is disabled, do not unroll unless overridden for
+ // When automatic unrolling is disabled, do not unroll unless overridden for
// this loop.
if (OnlyWhenForced && !(TM & TM_Enable))
return LoopUnrollResult::Unmodified;
@@ -1087,7 +1057,7 @@ static LoopUnrollResult tryToUnrollLoop(
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
ProvidedFullUnrollMaxCount);
TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
- L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling);
+ L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
// as threshold later on.
@@ -1135,7 +1105,7 @@ static LoopUnrollResult tryToUnrollLoop(
// If the loop contains a convergent operation, the prelude we'd add
// to do the first few instructions before we hit the unrolled loop
// is unsafe -- it adds a control-flow dependency to the convergent
- // operation. Therefore restrict remainder loop (try unrollig without).
+ // operation. Therefore restrict remainder loop (try unrolling without).
//
// TODO: This is quite conservative. In practice, convergent_op()
// is likely to be called unconditionally in the loop. In this
@@ -1331,7 +1301,7 @@ Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
bool ForgetAllSCEV) {
return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
- 0, 0, 0, 0);
+ 0, 0, 0, 1);
}
PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
@@ -1359,7 +1329,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
OnlyWhenForced, ForgetSCEV, /*Count*/ None,
/*Threshold*/ None, /*AllowPartial*/ false,
/*Runtime*/ false, /*UpperBound*/ false,
- /*AllowPeeling*/ false,
+ /*AllowPeeling*/ true,
/*AllowProfileBasedPeeling*/ false,
/*FullUnrollMaxCount*/ None) !=
LoopUnrollResult::Unmodified;
@@ -1401,7 +1371,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
}
// Otherwise erase the loop from the list if it was in the old loops.
- return OldLoops.count(SibLoop) != 0;
+ return OldLoops.contains(SibLoop);
});
Updater.addSiblingLoops(SibLoops);
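// --- Illustrative sketch, not part of the patch ---
// The unroll cost analysis above now derives its TTI cost kind from the
// enclosing function's minsize attribute: pure code size under minsize,
// combined size-and-latency otherwise. The selection in isolation;
// pickUnrollCostKind is an assumed name:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"

static llvm::TargetTransformInfo::TargetCostKind
pickUnrollCostKind(const llvm::Function &F) {
  return F.hasMinSize() ? llvm::TargetTransformInfo::TCK_CodeSize
                        : llvm::TargetTransformInfo::TCK_SizeAndLatency;
}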
diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 645a89bbd0ff..18717394d384 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
@@ -98,6 +99,12 @@ static cl::opt<unsigned>
Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
cl::init(100), cl::Hidden);
+static cl::opt<unsigned>
+ MSSAThreshold("loop-unswitch-memoryssa-threshold",
+ cl::desc("Max number of memory uses to explore during "
+ "partial unswitching analysis"),
+ cl::init(100), cl::Hidden);
+
namespace {
class LUAnalysisCache {
@@ -184,6 +191,7 @@ namespace {
Loop *CurrentLoop = nullptr;
DominatorTree *DT = nullptr;
MemorySSA *MSSA = nullptr;
+ AAResults *AA = nullptr;
std::unique_ptr<MemorySSAUpdater> MSSAU;
BasicBlock *LoopHeader = nullptr;
BasicBlock *LoopPreheader = nullptr;
@@ -217,6 +225,10 @@ namespace {
/// loop preheaders be inserted into the CFG.
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // Lazy BFI and BPI are marked as preserved here so Loop Unswitching
+    // can remain part of the same loop pass manager as LICM.
+ AU.addPreserved<LazyBlockFrequencyInfoPass>();
+ AU.addPreserved<LazyBranchProbabilityInfoPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
if (EnableMSSALoopDependency) {
@@ -244,19 +256,22 @@ namespace {
bool tryTrivialLoopUnswitch(bool &Changed);
bool unswitchIfProfitable(Value *LoopCond, Constant *Val,
- Instruction *TI = nullptr);
+ Instruction *TI = nullptr,
+ ArrayRef<Instruction *> ToDuplicate = {});
void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
BasicBlock *ExitBlock, Instruction *TI);
void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
- Instruction *TI);
+ Instruction *TI,
+ ArrayRef<Instruction *> ToDuplicate = {});
void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
Constant *Val, bool IsEqual);
- void emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
- BasicBlock *TrueDest,
- BasicBlock *FalseDest,
- BranchInst *OldBranch, Instruction *TI);
+ void
+ emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest, BasicBlock *FalseDest,
+ BranchInst *OldBranch, Instruction *TI,
+ ArrayRef<Instruction *> ToDuplicate = {});
void simplifyCode(std::vector<Instruction *> &Worklist, Loop *L);
@@ -523,6 +538,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
LPM = &LPMRef;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
if (EnableMSSALoopDependency) {
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
@@ -624,6 +640,145 @@ static bool equalityPropUnSafe(Value &LoopCond) {
return false;
}
+/// Check if the loop header has a conditional branch that is not
+/// loop-invariant, because it involves load instructions. If all paths from
+/// either the true or false successor to the header or loop exits do not
+/// modify the memory feeding the condition, perform 'partial unswitching'. That
+/// is, duplicate the instructions feeding the condition in the pre-header. Then
+/// unswitch on the duplicated condition. The condition is now known in the
+/// unswitched version for the 'invariant' path through the original loop.
+///
+/// If the branch condition of the header is partially invariant, return a pair
+/// containing the instructions to duplicate and a boolean Constant to update
+/// the condition in the loops created for the true or false successors.
+static std::pair<SmallVector<Instruction *, 4>, Constant *>
+hasPartialIVCondition(Loop *L, MemorySSA &MSSA, AAResults *AA) {
+ SmallVector<Instruction *, 4> ToDuplicate;
+
+ auto *TI = dyn_cast<BranchInst>(L->getHeader()->getTerminator());
+ if (!TI || !TI->isConditional())
+ return {};
+
+ auto *CondI = dyn_cast<CmpInst>(TI->getCondition());
+ // The case with the condition outside the loop should already be handled
+ // earlier.
+ if (!CondI || !L->contains(CondI))
+ return {};
+
+ ToDuplicate.push_back(CondI);
+
+ SmallVector<Value *, 4> WorkList;
+ WorkList.append(CondI->op_begin(), CondI->op_end());
+
+ SmallVector<MemoryAccess *, 4> AccessesToCheck;
+ SmallVector<MemoryLocation, 4> AccessedLocs;
+ while (!WorkList.empty()) {
+ Instruction *I = dyn_cast<Instruction>(WorkList.pop_back_val());
+ if (!I || !L->contains(I))
+ continue;
+
+ // TODO: support additional instructions.
+ if (!isa<LoadInst>(I) && !isa<GetElementPtrInst>(I))
+ return {};
+
+ // Do not duplicate volatile and atomic loads.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->isVolatile() || LI->isAtomic())
+ return {};
+
+ ToDuplicate.push_back(I);
+ if (MemoryAccess *MA = MSSA.getMemoryAccess(I)) {
+ if (auto *MemUse = dyn_cast_or_null<MemoryUse>(MA)) {
+        // Queue the defining access for the aliasing checks below.
+ AccessesToCheck.push_back(MemUse->getDefiningAccess());
+ AccessedLocs.push_back(MemoryLocation::get(I));
+ } else {
+ // MemoryDefs may clobber the location or may be atomic memory
+ // operations. Bail out.
+ return {};
+ }
+ }
+ WorkList.append(I->op_begin(), I->op_end());
+ }
+
+ if (ToDuplicate.size() <= 1)
+ return {};
+
+ auto HasNoClobbersOnPath =
+ [L, AA, &AccessedLocs](BasicBlock *Succ, BasicBlock *Header,
+ SmallVector<MemoryAccess *, 4> AccessesToCheck) {
+        // First, collect all blocks in the loop that are on a path from Succ
+ // to the header.
+ SmallVector<BasicBlock *, 4> WorkList;
+ WorkList.push_back(Succ);
+ WorkList.push_back(Header);
+ SmallPtrSet<BasicBlock *, 4> Seen;
+ Seen.insert(Header);
+ while (!WorkList.empty()) {
+ BasicBlock *Current = WorkList.pop_back_val();
+ if (!L->contains(Current))
+ continue;
+ const auto &SeenIns = Seen.insert(Current);
+ if (!SeenIns.second)
+ continue;
+
+ WorkList.append(succ_begin(Current), succ_end(Current));
+ }
+
+ // Require at least 2 blocks on a path through the loop. This skips
+ // paths that directly exit the loop.
+ if (Seen.size() < 2)
+ return false;
+
+        // Next, check if any MemoryDefs on the path through the loop (in the
+        // Seen set) may alias any of the locations in
+ // AccessedLocs. If that is the case, they may modify the condition and
+ // partial unswitching is not possible.
+ SmallPtrSet<MemoryAccess *, 4> SeenAccesses;
+ while (!AccessesToCheck.empty()) {
+ MemoryAccess *Current = AccessesToCheck.pop_back_val();
+ auto SeenI = SeenAccesses.insert(Current);
+ if (!SeenI.second || !Seen.contains(Current->getBlock()))
+ continue;
+
+ // Bail out if exceeded the threshold.
+ if (SeenAccesses.size() >= MSSAThreshold)
+ return false;
+
+ // MemoryUse are read-only accesses.
+ if (isa<MemoryUse>(Current))
+ continue;
+
+          // For a MemoryDef, check if it aliases any of the locations feeding
+ // the original condition.
+ if (auto *CurrentDef = dyn_cast<MemoryDef>(Current)) {
+ if (any_of(AccessedLocs, [AA, CurrentDef](MemoryLocation &Loc) {
+ return isModSet(
+ AA->getModRefInfo(CurrentDef->getMemoryInst(), Loc));
+ }))
+ return false;
+ }
+
+ for (Use &U : Current->uses())
+ AccessesToCheck.push_back(cast<MemoryAccess>(U.getUser()));
+ }
+
+ return true;
+ };
+
+ // If we branch to the same successor, partial unswitching will not be
+ // beneficial.
+ if (TI->getSuccessor(0) == TI->getSuccessor(1))
+ return {};
+
+ if (HasNoClobbersOnPath(TI->getSuccessor(0), L->getHeader(), AccessesToCheck))
+ return {ToDuplicate, ConstantInt::getTrue(TI->getContext())};
+ if (HasNoClobbersOnPath(TI->getSuccessor(1), L->getHeader(), AccessesToCheck))
+ return {ToDuplicate, ConstantInt::getFalse(TI->getContext())};
+
+ return {};
+}
+
/// Do actual work and unswitch loop if possible and profitable.
bool LoopUnswitch::processCurrentLoop() {
bool Changed = false;
@@ -661,7 +816,7 @@ bool LoopUnswitch::processCurrentLoop() {
// FIXME: Use Function::hasOptSize().
if (OptimizeForSize ||
LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
- return false;
+ return Changed;
// Run through the instructions in the loop, keeping track of three things:
//
@@ -685,10 +840,10 @@ bool LoopUnswitch::processCurrentLoop() {
if (!CB)
continue;
if (CB->isConvergent())
- return false;
+ return Changed;
if (auto *II = dyn_cast<InvokeInst>(&I))
if (!II->getUnwindDest()->canSplitPredecessors())
- return false;
+ return Changed;
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if (II->getIntrinsicID() == Intrinsic::experimental_guard)
Guards.push_back(II);
@@ -823,6 +978,28 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
}
+
+  // Check if there is a header condition that is invariant along the path from
+ // either the true or false successors to the header. This allows unswitching
+ // conditions depending on memory accesses, if there's a path not clobbering
+ // the memory locations. Check if this transform has been disabled using
+ // metadata, to avoid unswitching the same loop multiple times.
+ if (MSSA &&
+ !findOptionMDForLoop(CurrentLoop, "llvm.loop.unswitch.partial.disable")) {
+ auto ToDuplicate = hasPartialIVCondition(CurrentLoop, *MSSA, AA);
+ if (!ToDuplicate.first.empty()) {
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Found partially invariant condition "
+ << *ToDuplicate.first[0] << "\n");
+ ++NumBranches;
+ unswitchIfProfitable(ToDuplicate.first[0], ToDuplicate.second,
+ CurrentLoop->getHeader()->getTerminator(),
+ ToDuplicate.first);
+
+ RedoLoop = false;
+ return true;
+ }
+ }
+
return Changed;
}
@@ -880,7 +1057,8 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
/// simplify the loop. If we decide that this is profitable,
/// unswitch the loop, reprocess the pieces, then return true.
bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
- Instruction *TI) {
+ Instruction *TI,
+ ArrayRef<Instruction *> ToDuplicate) {
// Check to see if it would be profitable to unswitch current loop.
if (!BranchesInfo.costAllowsUnswitching()) {
LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
@@ -900,31 +1078,65 @@ bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
return false;
}
- unswitchNontrivialCondition(LoopCond, Val, CurrentLoop, TI);
+ unswitchNontrivialCondition(LoopCond, Val, CurrentLoop, TI, ToDuplicate);
return true;
}
/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
/// and remove (but not erase!) it from the function.
-void LoopUnswitch::emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
- BasicBlock *TrueDest,
- BasicBlock *FalseDest,
- BranchInst *OldBranch,
- Instruction *TI) {
+void LoopUnswitch::emitPreheaderBranchOnCondition(
+ Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest,
+ BranchInst *OldBranch, Instruction *TI,
+ ArrayRef<Instruction *> ToDuplicate) {
assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
assert(TrueDest != FalseDest && "Branch targets should be different");
+
// Insert a conditional branch on LIC to the two preheaders. The original
// code is the true version and the new code is the false version.
Value *BranchVal = LIC;
bool Swapped = false;
- if (!isa<ConstantInt>(Val) ||
- Val->getType() != Type::getInt1Ty(LIC->getContext()))
- BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val);
- else if (Val != ConstantInt::getTrue(Val->getContext())) {
- // We want to enter the new loop when the condition is true.
- std::swap(TrueDest, FalseDest);
- Swapped = true;
+
+ if (!ToDuplicate.empty()) {
+ ValueToValueMapTy Old2New;
+ for (Instruction *I : reverse(ToDuplicate)) {
+ auto *New = I->clone();
+ New->insertBefore(OldBranch);
+ RemapInstruction(New, Old2New,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ Old2New[I] = New;
+
+ if (MSSAU) {
+ MemorySSA *MSSA = MSSAU->getMemorySSA();
+ auto *MemA = dyn_cast_or_null<MemoryUse>(MSSA->getMemoryAccess(I));
+ if (!MemA)
+ continue;
+
+ Loop *L = LI->getLoopFor(I->getParent());
+ auto *DefiningAccess = MemA->getDefiningAccess();
+ // If the defining access is a MemoryPhi in the header, get the incoming
+ // value for the pre-header as defining access.
+ if (DefiningAccess->getBlock() == I->getParent()) {
+ if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) {
+ DefiningAccess =
+ MemPhi->getIncomingValueForBlock(L->getLoopPreheader());
+ }
+ }
+ MSSAU->createMemoryAccessInBB(New, DefiningAccess, New->getParent(),
+ MemorySSA::BeforeTerminator);
+ }
+ }
+ BranchVal = Old2New[ToDuplicate[0]];
+ } else {
+
+ if (!isa<ConstantInt>(Val) ||
+ Val->getType() != Type::getInt1Ty(LIC->getContext()))
+ BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val);
+ else if (Val != ConstantInt::getTrue(Val->getContext())) {
+ // We want to enter the new loop when the condition is true.
+ std::swap(TrueDest, FalseDest);
+ Swapped = true;
+ }
}
// Old branch will be removed, so save its parent and successor to update the
@@ -955,10 +1167,11 @@ void LoopUnswitch::emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
}
- DT->applyUpdates(Updates);
if (MSSAU)
- MSSAU->applyUpdates(Updates, *DT);
+ MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
+ else
+ DT->applyUpdates(Updates);
}
// If either edge is critical, split it. This helps preserve LoopSimplify
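
A minimal sketch (not part of the patch) of the duplication step in the hunk above, with the MemorySSA bookkeeping stripped out; it assumes, as the code above does, that ToDuplicate lists the condition first followed by its feeding instructions, so cloning walks it in reverse.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"

    using namespace llvm;

    // Clone the chain feeding the partially invariant condition in front of
    // InsertPt, remapping each clone's operands to the clones made so far.
    // Returns the duplicated condition value (the clone of ToDuplicate[0]).
    static Value *cloneConditionChain(ArrayRef<Instruction *> ToDuplicate,
                                      Instruction *InsertPt) {
      ValueToValueMapTy Old2New;
      for (Instruction *I : reverse(ToDuplicate)) {
        Instruction *New = I->clone();
        New->insertBefore(InsertPt);
        RemapInstruction(New, Old2New,
                         RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
        Old2New[I] = New;
      }
      return Old2New[ToDuplicate[0]];
    }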
@@ -1207,8 +1420,9 @@ void LoopUnswitch::splitExitEdges(
/// We determined that the loop is profitable to unswitch when LIC equal Val.
/// Split it into loop versions and test the condition outside of either loop.
/// Return the loops created as Out1/Out2.
-void LoopUnswitch::unswitchNontrivialCondition(Value *LIC, Constant *Val,
- Loop *L, Instruction *TI) {
+void LoopUnswitch::unswitchNontrivialCondition(
+ Value *LIC, Constant *Val, Loop *L, Instruction *TI,
+ ArrayRef<Instruction *> ToDuplicate) {
Function *F = LoopHeader->getParent();
LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
<< LoopHeader->getName() << " [" << L->getBlocks().size()
@@ -1233,7 +1447,7 @@ void LoopUnswitch::unswitchNontrivialCondition(Value *LIC, Constant *Val,
LoopBlocks.push_back(NewPreheader);
// We want the loop to come after the preheader, but before the exit blocks.
- LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+ llvm::append_range(LoopBlocks, L->blocks());
SmallVector<BasicBlock*, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
@@ -1247,7 +1461,7 @@ void LoopUnswitch::unswitchNontrivialCondition(Value *LIC, Constant *Val,
L->getUniqueExitBlocks(ExitBlocks);
// Add exit blocks to the loop blocks.
- LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+ llvm::append_range(LoopBlocks, ExitBlocks);
// Next step, clone all of the basic blocks that make up the loop (including
// the loop preheader and exit blocks), keeping track of the mapping between
@@ -1340,7 +1554,7 @@ void LoopUnswitch::unswitchNontrivialCondition(Value *LIC, Constant *Val,
// Emit the new branch that selects between the two versions of this loop.
emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
- TI);
+ TI, ToDuplicate);
if (MSSAU) {
// Update MemoryPhis in Exit blocks.
MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT);
@@ -1362,17 +1576,38 @@ void LoopUnswitch::unswitchNontrivialCondition(Value *LIC, Constant *Val,
// iteration.
WeakTrackingVH LICHandle(LIC);
- // Now we rewrite the original code to know that the condition is true and the
- // new code to know that the condition is false.
- rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/false);
-
- // It's possible that simplifying one loop could cause the other to be
- // changed to another value or a constant. If its a constant, don't simplify
- // it.
- if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
- LICHandle && !isa<Constant>(LICHandle))
- rewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val,
- /*IsEqual=*/true);
+ if (ToDuplicate.empty()) {
+ // Now we rewrite the original code to know that the condition is true and
+ // the new code to know that the condition is false.
+ rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/false);
+
+ // It's possible that simplifying one loop could cause the other to be
+    // changed to another value or a constant. If it's a constant, don't
+ // simplify it.
+ if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
+ LICHandle && !isa<Constant>(LICHandle))
+ rewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val,
+ /*IsEqual=*/true);
+ } else {
+ // Partial unswitching. Update the condition in the right loop with the
+ // constant.
+ auto *CC = cast<ConstantInt>(Val);
+ if (CC->isOneValue()) {
+ rewriteLoopBodyWithConditionConstant(NewLoop, VMap[LIC], Val,
+ /*IsEqual=*/true);
+ } else
+ rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/true);
+
+ // Mark the new loop as partially unswitched, to avoid unswitching on the
+ // same condition again.
+ auto &Context = NewLoop->getHeader()->getContext();
+ MDNode *DisableUnswitchMD = MDNode::get(
+ Context, MDString::get(Context, "llvm.loop.unswitch.partial.disable"));
+ MDNode *NewLoopID = makePostTransformationMetadata(
+ Context, L->getLoopID(), {"llvm.loop.unswitch.partial"},
+ {DisableUnswitchMD});
+ NewLoop->setLoopID(NewLoopID);
+ }
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
@@ -1381,9 +1616,7 @@ void LoopUnswitch::unswitchNontrivialCondition(Value *LIC, Constant *Val,
/// Remove all instances of I from the worklist vector specified.
static void removeFromWorklist(Instruction *I,
std::vector<Instruction *> &Worklist) {
-
- Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
- Worklist.end());
+ llvm::erase_value(Worklist, I);
}
/// When we find that I really equals V, remove I from the
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 06b684ef1e70..2ff1e8480749 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -59,6 +59,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopVersioningLICM.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -114,17 +115,18 @@ static cl::opt<unsigned> LVLoopDepthThreshold(
namespace {
-struct LoopVersioningLICM : public LoopPass {
+struct LoopVersioningLICMLegacyPass : public LoopPass {
static char ID;
- LoopVersioningLICM()
- : LoopPass(ID), LoopDepthThreshold(LVLoopDepthThreshold),
- InvariantThreshold(LVInvarThreshold) {
- initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry());
+ LoopVersioningLICMLegacyPass() : LoopPass(ID) {
+ initializeLoopVersioningLICMLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ StringRef getPassName() const override { return "Loop Versioning for LICM"; }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
@@ -138,13 +140,25 @@ struct LoopVersioningLICM : public LoopPass {
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
+};
- StringRef getPassName() const override { return "Loop Versioning for LICM"; }
+struct LoopVersioningLICM {
+ // We don't explicitly pass in LoopAccessInfo to the constructor since the
+ // loop versioning might return early due to instructions that are not safe
+  // for versioning. By passing the proxy instead, the construction of
+  // LoopAccessInfo takes place only when it is necessary.
+ LoopVersioningLICM(AliasAnalysis *AA, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE,
+ function_ref<const LoopAccessInfo &(Loop *)> GetLAI)
+ : AA(AA), SE(SE), GetLAI(GetLAI),
+ LoopDepthThreshold(LVLoopDepthThreshold),
+ InvariantThreshold(LVInvarThreshold), ORE(ORE) {}
+
+ bool runOnLoop(Loop *L, LoopInfo *LI, DominatorTree *DT);
void reset() {
AA = nullptr;
SE = nullptr;
- LAA = nullptr;
CurLoop = nullptr;
LoadAndStoreCounter = 0;
InvariantCounter = 0;
@@ -169,12 +183,12 @@ private:
// Current ScalarEvolution
ScalarEvolution *SE = nullptr;
- // Current LoopAccessAnalysis
- LoopAccessLegacyAnalysis *LAA = nullptr;
-
// Current Loop's LoopAccessInfo
const LoopAccessInfo *LAI = nullptr;
+ // Proxy for retrieving LoopAccessInfo.
+ function_ref<const LoopAccessInfo &(Loop *)> GetLAI;
+
// The current loop we are working on.
Loop *CurLoop = nullptr;
@@ -253,7 +267,7 @@ bool LoopVersioningLICM::legalLoopStructure() {
// We need to be able to compute the loop trip count in order
// to generate the bound checks.
const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
- if (ExitCount == SE->getCouldNotCompute()) {
+ if (isa<SCEVCouldNotCompute>(ExitCount)) {
    LLVM_DEBUG(dbgs() << "    loop does not have a computable trip count\n");
return false;
}
@@ -400,8 +414,8 @@ bool LoopVersioningLICM::legalLoopInstructions() {
return false;
}
}
- // Get LoopAccessInfo from current loop.
- LAI = &LAA->getInfo(CurLoop);
+ // Get LoopAccessInfo from current loop via the proxy.
+ LAI = &GetLAI(CurLoop);
// Check LoopAccessInfo for need of runtime check.
if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
@@ -539,8 +553,8 @@ void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
MDBuilder MDB(I->getContext());
MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
StringRef Name = "LVAliasScope";
- SmallVector<Metadata *, 4> Scopes, NoAliases;
MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ SmallVector<Metadata *, 4> Scopes{NewScope}, NoAliases{NewScope};
// Iterate over each instruction of loop.
// set no-alias for all load & store instructions.
for (auto *Block : CurLoop->getBlocks()) {
@@ -548,8 +562,6 @@ void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
// Only interested in instruction that may modify or read memory.
if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
continue;
- Scopes.push_back(NewScope);
- NoAliases.push_back(NewScope);
// Set no-alias for current instruction.
Inst.setMetadata(
LLVMContext::MD_noalias,
@@ -564,30 +576,38 @@ void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
}
}
-bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+bool LoopVersioningLICMLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ auto GetLAI = [&](Loop *L) -> const LoopAccessInfo & {
+ return getAnalysis<LoopAccessLegacyAnalysis>().getInfo(L);
+ };
+
+ return LoopVersioningLICM(AA, SE, ORE, GetLAI).runOnLoop(L, LI, DT);
+}
+
+bool LoopVersioningLICM::runOnLoop(Loop *L, LoopInfo *LI, DominatorTree *DT) {
  // This will automatically release all resources held by the current
// LoopVersioningLICM object.
AutoResetter Resetter(*this);
- if (skipLoop(L))
- return false;
-
// Do not do the transformation if disabled by metadata.
if (hasLICMVersioningTransformation(L) & TM_Disable)
return false;
- // Get Analysis information.
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- LAI = nullptr;
// Set Current Loop
CurLoop = L;
CurAST.reset(new AliasSetTracker(*AA));
// Loop over the body of this loop, construct AST.
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
for (auto *Block : L->getBlocks()) {
if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
CurAST->add(*Block); // Incorporate the specified basic block
@@ -602,8 +622,8 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// Do loop versioning.
// Create memcheck for memory accessed inside loop.
// Clone original loop, and set blocks properly.
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopVersioning LVer(*LAI, CurLoop, LI, DT, SE, true);
+ LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
+ CurLoop, LI, DT, SE);
LVer.versionLoop();
// Set Loop Versioning metaData for original loop.
addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
@@ -621,9 +641,9 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
return Changed;
}
-char LoopVersioningLICM::ID = 0;
+char LoopVersioningLICMLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm",
+INITIALIZE_PASS_BEGIN(LoopVersioningLICMLegacyPass, "loop-versioning-licm",
"Loop Versioning For LICM", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
@@ -634,7 +654,31 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
+INITIALIZE_PASS_END(LoopVersioningLICMLegacyPass, "loop-versioning-licm",
"Loop Versioning For LICM", false, false)
-Pass *llvm::createLoopVersioningLICMPass() { return new LoopVersioningLICM(); }
+Pass *llvm::createLoopVersioningLICMPass() {
+ return new LoopVersioningLICMLegacyPass();
+}
+
+namespace llvm {
+
+PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &LAR,
+ LPMUpdater &U) {
+ AliasAnalysis *AA = &LAR.AA;
+ ScalarEvolution *SE = &LAR.SE;
+ DominatorTree *DT = &LAR.DT;
+ LoopInfo *LI = &LAR.LI;
+ const Function *F = L.getHeader()->getParent();
+ OptimizationRemarkEmitter ORE(F);
+
+ auto GetLAI = [&](Loop *L) -> const LoopAccessInfo & {
+ return AM.getResult<LoopAccessAnalysis>(*L, LAR);
+ };
+
+ if (!LoopVersioningLICM(AA, SE, &ORE, GetLAI).runOnLoop(&L, LI, DT))
+ return PreservedAnalyses::all();
+ return getLoopPassPreservedAnalyses();
+}
+} // namespace llvm
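
The constructor comment earlier in this file describes a lazy-analysis pattern that is worth spelling out. The sketch below (hypothetical names, unrelated to the pass itself) shows the same function_ref idiom in miniature: the expensive result is only computed once the cheap legality checks have passed.

    #include "llvm/ADT/STLExtras.h"

    namespace {
    struct ExpensiveInfo { bool NeedsRuntimeChecks = false; };

    class Driver {
      llvm::function_ref<const ExpensiveInfo &()> GetInfo; // deferred analysis

    public:
      explicit Driver(llvm::function_ref<const ExpensiveInfo &()> GetInfo)
          : GetInfo(GetInfo) {}

      bool run(bool CheapLegalityChecksPassed) {
        if (!CheapLegalityChecksPassed)
          return false;             // Bail out before paying for the analysis.
        return GetInfo().NeedsRuntimeChecks; // Computed only on this path.
      }
    };
    } // namespace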
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index fddf28c281fc..bfe8db83b027 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -78,7 +78,7 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II,
Other->removePredecessor(Source);
BI->eraseFromParent();
BranchInst::Create(Target, Source);
- if (pred_begin(Other) == pred_end(Other))
+ if (pred_empty(Other))
HasDeadBlocks = true;
}
}
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 0fe7dd9cfb39..da13075dfee2 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -24,10 +24,8 @@
#include "llvm/IR/Metadata.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/MisExpect.h"
using namespace llvm;
@@ -48,10 +46,10 @@ STATISTIC(ExpectIntrinsicsHandled,
// 'select' instructions. It may be worthwhile to hoist these values to some
// shared space, so they can be used directly by other passes.
-static cl::opt<uint32_t> LikelyBranchWeight(
+cl::opt<uint32_t> llvm::LikelyBranchWeight(
"likely-branch-weight", cl::Hidden, cl::init(2000),
cl::desc("Weight of the branch likely to be taken (default = 2000)"));
-static cl::opt<uint32_t> UnlikelyBranchWeight(
+cl::opt<uint32_t> llvm::UnlikelyBranchWeight(
"unlikely-branch-weight", cl::Hidden, cl::init(1),
cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
@@ -102,13 +100,7 @@ static bool handleSwitchExpect(SwitchInst &SI) {
uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
Weights[Index] = LikelyBranchWeightVal;
- SI.setMetadata(LLVMContext::MD_misexpect,
- MDBuilder(CI->getContext())
- .createMisExpect(Index, LikelyBranchWeightVal,
- UnlikelyBranchWeightVal));
-
SI.setCondition(ArgValue);
- misexpect::checkFrontendInstrumentation(SI);
SI.setMetadata(LLVMContext::MD_prof,
MDBuilder(CI->getContext()).createBranchWeights(Weights));
@@ -317,7 +309,6 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
MDBuilder MDB(CI->getContext());
MDNode *Node;
- MDNode *ExpNode;
uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
@@ -327,24 +318,16 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
(Predicate == CmpInst::ICMP_EQ)) {
Node =
MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
- ExpNode =
- MDB.createMisExpect(0, LikelyBranchWeightVal, UnlikelyBranchWeightVal);
} else {
Node =
MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
- ExpNode =
- MDB.createMisExpect(1, LikelyBranchWeightVal, UnlikelyBranchWeightVal);
}
- BSI.setMetadata(LLVMContext::MD_misexpect, ExpNode);
-
if (CmpI)
CmpI->setOperand(0, ArgValue);
else
BSI.setCondition(ArgValue);
- misexpect::checkFrontendInstrumentation(BSI);
-
BSI.setMetadata(LLVMContext::MD_prof, Node);
return true;
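
With the misexpect plumbing removed above, the remaining effect of lowering llvm.expect is the MD_prof annotation. A small stand-alone sketch of that step (not part of the patch; the default weights are illustrative):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"

    using namespace llvm;

    // Mark BI as likely taking its true edge by attaching branch weights,
    // which later consumers (e.g. BranchProbabilityInfo) read from MD_prof.
    static void annotateLikelyTrue(BranchInst *BI, uint32_t Likely = 2000,
                                   uint32_t Unlikely = 1) {
      MDBuilder MDB(BI->getContext());
      BI->setMetadata(LLVMContext::MD_prof,
                      MDB.createBranchWeights(Likely, Unlikely));
    }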
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 90314b17b5e2..8e251ca940a3 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -42,6 +42,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/MatrixUtils.h"
using namespace llvm;
using namespace PatternMatch;
@@ -61,6 +63,9 @@ static cl::opt<unsigned> TileSize(
"fuse-matrix-tile-size", cl::init(4), cl::Hidden,
cl::desc(
"Tile size for matrix instruction fusion using square-shaped tiles."));
+static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
+ cl::Hidden,
+ cl::desc("Generate loop nest for tiling."));
static cl::opt<bool> ForceFusion(
"force-fuse-matrix", cl::init(false), cl::Hidden,
cl::desc("Force matrix instruction fusion even if not profitable."));
@@ -182,10 +187,10 @@ class LowerMatrixIntrinsics {
Function &Func;
const DataLayout &DL;
const TargetTransformInfo &TTI;
- AliasAnalysis &AA;
- DominatorTree &DT;
- LoopInfo &LI;
- OptimizationRemarkEmitter &ORE;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ OptimizationRemarkEmitter *ORE;
/// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
struct OpInfoTy {
@@ -241,7 +246,7 @@ class LowerMatrixIntrinsics {
void setVector(unsigned i, Value *V) { Vectors[i] = V; }
- Type *getElementType() { return getVectorTy()->getElementType(); }
+ Type *getElementType() const { return getVectorTy()->getElementType(); }
unsigned getNumVectors() const {
if (isColumnMajor())
@@ -271,7 +276,7 @@ class LowerMatrixIntrinsics {
return getVectorTy();
}
- VectorType *getVectorTy() {
+ VectorType *getVectorTy() const {
return cast<VectorType>(Vectors[0]->getType());
}
@@ -329,9 +334,8 @@ class LowerMatrixIntrinsics {
Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
IRBuilder<> &Builder) const {
Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
- Value *Undef = UndefValue::get(Vec->getType());
return Builder.CreateShuffleVector(
- Vec, Undef, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
+ Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
"block");
}
};
@@ -393,8 +397,8 @@ class LowerMatrixIntrinsics {
public:
LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
- AliasAnalysis &AA, DominatorTree &DT, LoopInfo &LI,
- OptimizationRemarkEmitter &ORE)
+ AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE)
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
LI(LI), ORE(ORE) {}
@@ -442,12 +446,11 @@ public:
// Otherwise split MatrixVal.
SmallVector<Value *, 16> SplitVecs;
- Value *Undef = UndefValue::get(VType);
for (unsigned MaskStart = 0;
MaskStart < cast<FixedVectorType>(VType)->getNumElements();
MaskStart += SI.getStride()) {
Value *V = Builder.CreateShuffleVector(
- MatrixVal, Undef, createSequentialMask(MaskStart, SI.getStride(), 0),
+ MatrixVal, createSequentialMask(MaskStart, SI.getStride(), 0),
"split");
SplitVecs.push_back(V);
}
@@ -485,6 +488,7 @@ public:
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FMul: // Scalar multiply.
+ case Instruction::FNeg:
case Instruction::Add:
case Instruction::Mul:
case Instruction::Sub:
@@ -527,8 +531,7 @@ public:
// list.
LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
while (!WorkList.empty()) {
- Instruction *Inst = WorkList.back();
- WorkList.pop_back();
+ Instruction *Inst = WorkList.pop_back_val();
// New entry, set the value and insert operands
bool Propagate = false;
@@ -598,8 +601,7 @@ public:
// worklist.
LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
while (!WorkList.empty()) {
- Value *V = WorkList.back();
- WorkList.pop_back();
+ Value *V = WorkList.pop_back_val();
size_t BeforeProcessingV = WorkList.size();
if (!isa<Instruction>(V))
@@ -721,14 +723,18 @@ public:
Value *Op2;
if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
Changed |= VisitBinaryOperator(BinOp);
+ if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
+ Changed |= VisitUnaryOperator(UnOp);
if (match(Inst, m_Load(m_Value(Op1))))
Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
}
- RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
- RemarkGen.emitRemarks();
+ if (ORE) {
+ RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
+ RemarkGen.emitRemarks();
+ }
for (Instruction *Inst : reverse(ToRemove))
Inst->eraseFromParent();
@@ -934,10 +940,8 @@ public:
unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
assert(NumElts >= BlockNumElts && "Too few elements for current block");
- Value *Undef = UndefValue::get(Block->getType());
Block = Builder.CreateShuffleVector(
- Block, Undef,
- createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts));
+ Block, createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts));
// If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
// 8, 4, 5, 6
@@ -1085,7 +1089,7 @@ public:
MemoryLocation StoreLoc = MemoryLocation::get(Store);
MemoryLocation LoadLoc = MemoryLocation::get(Load);
- AliasResult LdAliased = AA.alias(LoadLoc, StoreLoc);
+ AliasResult LdAliased = AA->alias(LoadLoc, StoreLoc);
// If we can statically determine noalias we're good.
if (!LdAliased)
@@ -1101,14 +1105,17 @@ public:
// as we adjust Check0 and Check1's branches.
SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
for (BasicBlock *Succ : successors(Check0))
- DTUpdates.push_back({DT.Delete, Check0, Succ});
+ DTUpdates.push_back({DT->Delete, Check0, Succ});
- BasicBlock *Check1 = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
- nullptr, "alias_cont");
+ BasicBlock *Check1 =
+ SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
+ nullptr, "alias_cont");
BasicBlock *Copy =
- SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI, nullptr, "copy");
- BasicBlock *Fusion = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
- nullptr, "no_alias");
+ SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
+ nullptr, "copy");
+ BasicBlock *Fusion =
+ SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
+ nullptr, "no_alias");
// Check if the loaded memory location begins before the end of the store
// location. If the condition holds, they might overlap, otherwise they are
@@ -1152,11 +1159,11 @@ public:
PHI->addIncoming(NewLd, Copy);
// Adjust DT.
- DTUpdates.push_back({DT.Insert, Check0, Check1});
- DTUpdates.push_back({DT.Insert, Check0, Fusion});
- DTUpdates.push_back({DT.Insert, Check1, Copy});
- DTUpdates.push_back({DT.Insert, Check1, Fusion});
- DT.applyUpdates(DTUpdates);
+ DTUpdates.push_back({DT->Insert, Check0, Check1});
+ DTUpdates.push_back({DT->Insert, Check0, Fusion});
+ DTUpdates.push_back({DT->Insert, Check1, Copy});
+ DTUpdates.push_back({DT->Insert, Check1, Fusion});
+ DT->applyUpdates(DTUpdates);
return PHI;
}
@@ -1202,6 +1209,63 @@ public:
return Res;
}
+ void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
+ Value *RPtr, ShapeInfo RShape, StoreInst *Store,
+ bool AllowContract) {
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+ // Create the main tiling loop nest.
+ TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns, TileSize);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ Instruction *InsertI = cast<Instruction>(MatMul);
+ BasicBlock *Start = InsertI->getParent();
+ BasicBlock *End =
+ SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue");
+ IRBuilder<> Builder(MatMul);
+ BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, *LI);
+
+ Type *TileVecTy =
+ FixedVectorType::get(MatMul->getType()->getScalarType(), TileSize);
+ MatrixTy TileResult;
+ // Insert in the inner loop header.
+ Builder.SetInsertPoint(TI.InnerLoopHeader->getTerminator());
+ // Create PHI nodes for the result columns to accumulate across iterations.
+ SmallVector<PHINode *, 4> ColumnPhis;
+ for (unsigned I = 0; I < TileSize; I++) {
+ auto *Phi = Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(I));
+ Phi->addIncoming(ConstantAggregateZero::get(TileVecTy),
+ TI.RowLoopHeader->getSingleSuccessor());
+ TileResult.addVector(Phi);
+ ColumnPhis.push_back(Phi);
+ }
+
+ // Insert in the inner loop body, which computes
+ // Res += Load(CurrentRow, K) * Load(K, CurrentColumn)
+ Builder.SetInsertPoint(InnerBody->getTerminator());
+ // Load tiles of the operands.
+ MatrixTy A = loadMatrix(LPtr, {}, false, LShape, TI.CurrentRow, TI.CurrentK,
+ {TileSize, TileSize}, EltType, Builder);
+ MatrixTy B = loadMatrix(RPtr, {}, false, RShape, TI.CurrentK, TI.CurrentCol,
+ {TileSize, TileSize}, EltType, Builder);
+ emitMatrixMultiply(TileResult, A, B, AllowContract, Builder, true);
+ // Store result after the inner loop is done.
+ Builder.SetInsertPoint(TI.RowLoopLatch->getTerminator());
+ storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(),
+ Store->isVolatile(), {LShape.NumRows, RShape.NumColumns},
+ TI.CurrentRow, TI.CurrentCol, EltType, Builder);
+
+ for (unsigned I = 0; I < TileResult.getNumVectors(); I++)
+ ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.InnerLoopLatch);
+
+ // Force unrolling of a few iterations of the inner loop, to make sure there
+ // is enough work per iteration.
+ // FIXME: The unroller should make this decision directly instead, but
+ // currently the cost-model is not up to the task.
+ unsigned InnerLoopUnrollCount = std::min(10u, LShape.NumColumns / TileSize);
+ addStringMetadataToLoop(LI->getLoopFor(TI.InnerLoopHeader),
+ "llvm.loop.unroll.count", InnerLoopUnrollCount);
+ }
+
void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
StoreInst *Store,
SmallPtrSetImpl<Instruction *> &FusedInsts) {
@@ -1224,28 +1288,34 @@ public:
bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
MatMul->hasAllowContract());
- IRBuilder<> Builder(Store);
- for (unsigned J = 0; J < C; J += TileSize)
- for (unsigned I = 0; I < R; I += TileSize) {
- const unsigned TileR = std::min(R - I, unsigned(TileSize));
- const unsigned TileC = std::min(C - J, unsigned(TileSize));
- MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
-
- for (unsigned K = 0; K < M; K += TileSize) {
- const unsigned TileM = std::min(M - K, unsigned(TileSize));
- MatrixTy A =
- loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
- LShape, Builder.getInt64(I), Builder.getInt64(K),
- {TileR, TileM}, EltType, Builder);
- MatrixTy B =
- loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
- RShape, Builder.getInt64(K), Builder.getInt64(J),
- {TileM, TileC}, EltType, Builder);
- emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+ if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
+ createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store,
+ AllowContract);
+ else {
+ IRBuilder<> Builder(Store);
+ for (unsigned J = 0; J < C; J += TileSize)
+ for (unsigned I = 0; I < R; I += TileSize) {
+ const unsigned TileR = std::min(R - I, unsigned(TileSize));
+ const unsigned TileC = std::min(C - J, unsigned(TileSize));
+ MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
+
+ for (unsigned K = 0; K < M; K += TileSize) {
+ const unsigned TileM = std::min(M - K, unsigned(TileSize));
+ MatrixTy A =
+ loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
+ LShape, Builder.getInt64(I), Builder.getInt64(K),
+ {TileR, TileM}, EltType, Builder);
+ MatrixTy B =
+ loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
+ RShape, Builder.getInt64(K), Builder.getInt64(J),
+ {TileM, TileC}, EltType, Builder);
+ emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+ }
+ storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
+ Builder.getInt64(I), Builder.getInt64(J), EltType,
+ Builder);
}
- storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
- Builder.getInt64(I), Builder.getInt64(J), EltType, Builder);
- }
+ }
// Mark eliminated instructions as fused and remove them.
FusedInsts.insert(Store);
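
As a reference for the blocking scheme emitted above (whether through the loop nest or the fully unrolled form), here is the same tiling written out as plain scalar code. It is a sketch only: it uses row-major indexing for brevity, whereas the pass works on column-major matrices, and it assumes C has been zero-initialized (the pass starts from getZeroMatrix).

    #include <algorithm>

    // C (R x Cols) += A (R x M) * B (M x Cols), computed tile by tile.
    static void tiledMatMul(const float *A, const float *B, float *C,
                            unsigned R, unsigned M, unsigned Cols,
                            unsigned TileSize) {
      for (unsigned J = 0; J < Cols; J += TileSize)
        for (unsigned I = 0; I < R; I += TileSize) {
          unsigned TileR = std::min(R - I, TileSize);
          unsigned TileC = std::min(Cols - J, TileSize);
          for (unsigned K = 0; K < M; K += TileSize) {
            unsigned TileM = std::min(M - K, TileSize);
            // Accumulate the (I, J) tile of C from the (I, K) tile of A and
            // the (K, J) tile of B.
            for (unsigned i = 0; i < TileR; ++i)
              for (unsigned j = 0; j < TileC; ++j)
                for (unsigned k = 0; k < TileM; ++k)
                  C[(I + i) * Cols + (J + j)] +=
                      A[(I + i) * M + (K + k)] * B[(K + k) * Cols + (J + j)];
          }
        }
    }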
@@ -1272,9 +1342,11 @@ public:
void LowerMatrixMultiplyFused(CallInst *MatMul,
SmallPtrSetImpl<Instruction *> &FusedInsts) {
if (!FuseMatrix || !MatMul->hasOneUse() ||
- MatrixLayout != MatrixLayoutTy::ColumnMajor)
+ MatrixLayout != MatrixLayoutTy::ColumnMajor || !DT)
return;
+ assert(AA && LI && "Analyses should be available");
+
auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
@@ -1283,7 +1355,7 @@ public:
// we create invalid IR.
// FIXME: See if we can hoist the store address computation.
auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
- if (AddrI && (!DT.dominates(AddrI, MatMul)))
+ if (AddrI && (!DT->dominates(AddrI, MatMul)))
return;
emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
@@ -1300,6 +1372,8 @@ public:
const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
+ assert(Lhs.getElementType() == Rhs.getElementType() &&
+ "Matrix multiply argument element types do not match.");
const unsigned R = LShape.NumRows;
const unsigned C = RShape.NumColumns;
@@ -1307,6 +1381,8 @@ public:
// Initialize the output
MatrixTy Result(R, C, EltType);
+ assert(Lhs.getElementType() == Result.getElementType() &&
+ "Matrix multiply result element type does not match arguments.");
bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
MatMul->hasAllowContract());
@@ -1424,6 +1500,40 @@ public:
return true;
}
+ /// Lower unary operators, if shape information is available.
+ bool VisitUnaryOperator(UnaryOperator *Inst) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ Value *Op = Inst->getOperand(0);
+
+ IRBuilder<> Builder(Inst);
+ ShapeInfo &Shape = I->second;
+
+ MatrixTy Result;
+ MatrixTy M = getMatrix(Op, Shape, Builder);
+
+ // Helper to perform unary op on vectors.
+ auto BuildVectorOp = [&Builder, Inst](Value *Op) {
+ switch (Inst->getOpcode()) {
+ case Instruction::FNeg:
+ return Builder.CreateFNeg(Op);
+ default:
+ llvm_unreachable("Unsupported unary operator for matrix");
+ }
+ };
+
+ for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
+ Result.addVector(BuildVectorOp(M.getVector(I)));
+
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors()),
+ Builder);
+ return true;
+ }
+
/// Helper to linearize a matrix expression tree into a string. Currently
  /// matrix expressions are linearized by starting at an expression leaf and
/// linearizing bottom up.
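
The new VisitUnaryOperator above applies the operation one column vector at a time; the essence of that, stripped of the MatrixTy and op-count bookkeeping (the container name here is illustrative), is:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Negate a matrix stored as a list of column vectors by negating each
    // column in place; the generated op count grows with the column count.
    static void negateMatrixColumns(SmallVectorImpl<Value *> &Columns,
                                    IRBuilder<> &Builder) {
      for (Value *&Col : Columns)
        Col = Builder.CreateFNeg(Col);
    }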
@@ -1488,7 +1598,7 @@ public:
if (Value *Ptr = getPointerOperand(V))
return getUnderlyingObjectThroughLoads(Ptr);
else if (V->getType()->isPointerTy())
- return GetUnderlyingObject(V, DL);
+ return getUnderlyingObject(V);
return V;
}
@@ -1524,7 +1634,7 @@ public:
write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
.drop_front(StringRef("llvm.matrix.").size()));
write(".");
- std::string Tmp = "";
+ std::string Tmp;
raw_string_ostream SS(Tmp);
switch (II->getIntrinsicID()) {
@@ -1737,7 +1847,6 @@ public:
for (Value *Op : cast<Instruction>(V)->operand_values())
collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
- return;
}
/// Calculate the number of exclusive and shared op counts for expression
@@ -1863,15 +1972,25 @@ public:
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
+ OptimizationRemarkEmitter *ORE = nullptr;
+ AAResults *AA = nullptr;
+ DominatorTree *DT = nullptr;
+ LoopInfo *LI = nullptr;
+
+ if (!Minimal) {
+ ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ AA = &AM.getResult<AAManager>(F);
+ DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ LI = &AM.getResult<LoopAnalysis>(F);
+ }
LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
if (LMT.Visit()) {
PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
+ if (!Minimal) {
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ }
return PA;
}
return PreservedAnalyses::all();
@@ -1894,7 +2013,7 @@ public:
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
+ LowerMatrixIntrinsics LMT(F, TTI, &AA, &DT, &LI, &ORE);
bool C = LMT.Visit();
return C;
}
@@ -1925,3 +2044,45 @@ INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
Pass *llvm::createLowerMatrixIntrinsicsPass() {
return new LowerMatrixIntrinsicsLegacyPass();
}
+
+namespace {
+
+/// A lightweight version of the matrix lowering pass that only requires TTI.
+/// Advanced features that require DT, AA or ORE like tiling are disabled. This
+/// is used to lower matrix intrinsics if the main lowering pass is not run, for
+/// example with -O0.
+class LowerMatrixIntrinsicsMinimalLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerMatrixIntrinsicsMinimalLegacyPass() : FunctionPass(ID) {
+ initializeLowerMatrixIntrinsicsMinimalLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ LowerMatrixIntrinsics LMT(F, TTI, nullptr, nullptr, nullptr, nullptr);
+ bool C = LMT.Visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+} // namespace
+
+static const char pass_name_minimal[] = "Lower the matrix intrinsics (minimal)";
+char LowerMatrixIntrinsicsMinimalLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsMinimalLegacyPass,
+ "lower-matrix-intrinsics-minimal", pass_name_minimal,
+ false, false)
+INITIALIZE_PASS_END(LowerMatrixIntrinsicsMinimalLegacyPass,
+ "lower-matrix-intrinsics-minimal", pass_name_minimal, false,
+ false)
+
+Pass *llvm::createLowerMatrixIntrinsicsMinimalPass() {
+ return new LowerMatrixIntrinsicsMinimalLegacyPass();
+}
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 4b4196edc12b..a4e695497f30 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -21,8 +21,11 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
@@ -64,10 +67,15 @@ using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
+static cl::opt<bool>
+ EnableMemorySSA("enable-memcpyopt-memoryssa", cl::init(false), cl::Hidden,
+ cl::desc("Use MemorySSA-backed MemCpyOpt."));
+
STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
+STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
namespace {
@@ -271,11 +279,17 @@ private:
AU.setPreservesCFG();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (!EnableMemorySSA)
+ AU.addRequired<MemoryDependenceWrapperPass>();
AU.addPreserved<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ if (EnableMemorySSA)
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
};
@@ -297,6 +311,56 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
false, false)
+// Check that V is either not accessible by the caller, or unwinding cannot
+// occur between Start and End.
+static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
+ Instruction *End) {
+ assert(Start->getParent() == End->getParent() && "Must be in same block");
+ if (!Start->getFunction()->doesNotThrow() &&
+ !isa<AllocaInst>(getUnderlyingObject(V))) {
+ for (const Instruction &I :
+ make_range(Start->getIterator(), End->getIterator())) {
+ if (I.mayThrow())
+ return true;
+ }
+ }
+ return false;
+}
+
+void MemCpyOptPass::eraseInstruction(Instruction *I) {
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+ if (MD)
+ MD->removeInstruction(I);
+ I->eraseFromParent();
+}
+
+// Check for mod or ref of Loc between Start and End, excluding both boundaries.
+// Start and End must be in the same block
+static bool accessedBetween(AliasAnalysis &AA, MemoryLocation Loc,
+ const MemoryUseOrDef *Start,
+ const MemoryUseOrDef *End) {
+ assert(Start->getBlock() == End->getBlock() && "Only local supported");
+ for (const MemoryAccess &MA :
+ make_range(++Start->getIterator(), End->getIterator())) {
+ if (isModOrRefSet(AA.getModRefInfo(cast<MemoryUseOrDef>(MA).getMemoryInst(),
+ Loc)))
+ return true;
+ }
+ return false;
+}
+
+// Check for mod of Loc between Start and End, excluding both boundaries.
+// Start and End can be in different blocks.
+static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc,
+ const MemoryUseOrDef *Start,
+ const MemoryUseOrDef *End) {
+ // TODO: Only walk until we hit Start.
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ End->getDefiningAccess(), Loc);
+ return !MSSA->dominates(Clobber, Start);
+}
+
/// When scanning forward over instructions, we look for some other patterns to
/// fold away. In particular, this looks for stores to neighboring locations of
/// memory. If it sees enough consecutive ones, it attempts to merge them
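
A usage sketch for the new MemorySSA helpers above (hypothetical caller; it assumes both instructions already have MemorySSA accesses): deciding whether the memory a load reads may be written between some earlier point and the load itself.

    #include "llvm/Analysis/MemoryLocation.h"
    #include "llvm/Analysis/MemorySSA.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Ask the MemorySSA walker for the access clobbering the load's location;
    // if that clobber does not dominate Start, something in between may write
    // the location (the same test writtenBetween() performs).
    static bool loadMayBeOverwrittenAfter(MemorySSA &MSSA, Instruction *Start,
                                          LoadInst *Load) {
      MemoryUseOrDef *StartAcc = MSSA.getMemoryAccess(Start);
      MemoryUseOrDef *LoadAcc = MSSA.getMemoryAccess(Load);
      MemoryLocation Loc = MemoryLocation::get(Load);
      MemoryAccess *Clobber = MSSA.getWalker()->getClobberingMemoryAccess(
          LoadAcc->getDefiningAccess(), Loc);
      return !MSSA.dominates(Clobber, StartAcc);
    }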
@@ -313,7 +377,27 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
MemsetRanges Ranges(DL);
BasicBlock::iterator BI(StartInst);
+
+ // Keeps track of the last memory use or def before the insertion point for
+ // the new memset. The new MemoryDef for the inserted memsets will be inserted
+ // after MemInsertPoint. It points to either LastMemDef or to the last user
+ // before the insertion point of the memset, if there are any such users.
+ MemoryUseOrDef *MemInsertPoint = nullptr;
+ // Keeps track of the last MemoryDef between StartInst and the insertion point
+ // for the new memset. This will become the defining access of the inserted
+ // memsets.
+ MemoryDef *LastMemDef = nullptr;
for (++BI; !BI->isTerminator(); ++BI) {
+ if (MSSAU) {
+ auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
+ MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
+ if (CurrentAcc) {
+ MemInsertPoint = CurrentAcc;
+ if (auto *CurrentDef = dyn_cast<MemoryDef>(CurrentAcc))
+ LastMemDef = CurrentDef;
+ }
+ }
+
if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
// If the instruction is readnone, ignore it, otherwise bail out. We
// don't even allow readonly here because we don't want something like:
@@ -327,8 +411,15 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// If this is a store, see if we can merge it in.
if (!NextStore->isSimple()) break;
+ Value *StoredVal = NextStore->getValueOperand();
+
+ // Don't convert stores of non-integral pointer types to memsets (which
+ // stores integers).
+ if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
+ break;
+
// Check to see if this stored value is of the same byte-splattable value.
- Value *StoredByte = isBytewiseValue(NextStore->getOperand(0), DL);
+ Value *StoredByte = isBytewiseValue(StoredVal, DL);
if (isa<UndefValue>(ByteVal) && StoredByte)
ByteVal = StoredByte;
if (ByteVal != StoredByte)
@@ -392,15 +483,27 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
: Range.TheStores) dbgs()
<< *SI << '\n';
dbgs() << "With: " << *AMemSet << '\n');
-
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
- // Zap all the stores.
- for (Instruction *SI : Range.TheStores) {
- MD->removeInstruction(SI);
- SI->eraseFromParent();
+ if (MSSAU) {
+ assert(LastMemDef && MemInsertPoint &&
+ "Both LastMemDef and MemInsertPoint need to be set");
+ auto *NewDef =
+ cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
+ ? MSSAU->createMemoryAccessBefore(
+ AMemSet, LastMemDef, MemInsertPoint)
+ : MSSAU->createMemoryAccessAfter(
+ AMemSet, LastMemDef, MemInsertPoint));
+ MSSAU->insertDef(NewDef, /*RenameUses=*/true);
+ LastMemDef = NewDef;
+ MemInsertPoint = NewDef;
}
+
+ // Zap all the stores.
+ for (Instruction *SI : Range.TheStores)
+ eraseInstruction(SI);
+
++NumMemSetInfer;
}
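
The MemorySSA maintenance above follows a pattern used several times in this file; as a compact sketch (the helper name is ours), registering a freshly created memory-writing instruction looks like:

    #include "llvm/Analysis/MemorySSA.h"
    #include "llvm/Analysis/MemorySSAUpdater.h"

    using namespace llvm;

    // Create a MemoryDef for NewI right after InsertPt, defined by
    // DefiningAccess, and let the updater rename downstream uses.
    static MemoryDef *registerNewDef(MemorySSAUpdater &MSSAU, Instruction *NewI,
                                     MemoryAccess *DefiningAccess,
                                     MemoryUseOrDef *InsertPt) {
      auto *NewDef = cast<MemoryDef>(
          MSSAU.createMemoryAccessAfter(NewI, DefiningAccess, InsertPt));
      MSSAU.insertDef(NewDef, /*RenameUses=*/true);
      return NewDef;
    }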
@@ -411,11 +514,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// It will lift the store and its argument + that anything that
// may alias with these.
// The method returns true if it was successful.
-static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
- const LoadInst *LI) {
+bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
// If the store alias this position, early bail out.
MemoryLocation StoreLoc = MemoryLocation::get(SI);
- if (isModOrRefSet(AA.getModRefInfo(P, StoreLoc)))
+ if (isModOrRefSet(AA->getModRefInfo(P, StoreLoc)))
return false;
// Keep track of the arguments of all instruction we plan to lift
@@ -426,7 +528,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
Args.insert(Ptr);
// Instruction to lift before P.
- SmallVector<Instruction*, 8> ToLift;
+ SmallVector<Instruction *, 8> ToLift{SI};
// Memory locations of lifted instructions.
SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
@@ -439,19 +541,24 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
auto *C = &*I;
- bool MayAlias = isModOrRefSet(AA.getModRefInfo(C, None));
+ // Make sure hoisting does not perform a store that was not guaranteed to
+ // happen.
+ if (!isGuaranteedToTransferExecutionToSuccessor(C))
+ return false;
+
+ bool MayAlias = isModOrRefSet(AA->getModRefInfo(C, None));
bool NeedLift = false;
if (Args.erase(C))
NeedLift = true;
else if (MayAlias) {
- NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
- return isModOrRefSet(AA.getModRefInfo(C, ML));
+ NeedLift = llvm::any_of(MemLocs, [C, this](const MemoryLocation &ML) {
+ return isModOrRefSet(AA->getModRefInfo(C, ML));
});
if (!NeedLift)
- NeedLift = llvm::any_of(Calls, [C, &AA](const CallBase *Call) {
- return isModOrRefSet(AA.getModRefInfo(C, Call));
+ NeedLift = llvm::any_of(Calls, [C, this](const CallBase *Call) {
+ return isModOrRefSet(AA->getModRefInfo(C, Call));
});
}
@@ -461,18 +568,18 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
if (MayAlias) {
// Since LI is implicitly moved downwards past the lifted instructions,
// none of them may modify its source.
- if (isModSet(AA.getModRefInfo(C, LoadLoc)))
+ if (isModSet(AA->getModRefInfo(C, LoadLoc)))
return false;
else if (const auto *Call = dyn_cast<CallBase>(C)) {
// If we can't lift this before P, it's game over.
- if (isModOrRefSet(AA.getModRefInfo(P, Call)))
+ if (isModOrRefSet(AA->getModRefInfo(P, Call)))
return false;
Calls.push_back(Call);
} else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
// If we can't lift this before P, it's game over.
auto ML = MemoryLocation::get(C);
- if (isModOrRefSet(AA.getModRefInfo(P, ML)))
+ if (isModOrRefSet(AA->getModRefInfo(P, ML)))
return false;
MemLocs.push_back(ML);
@@ -492,10 +599,40 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
}
}
- // We made it, we need to lift
+ // Find MSSA insertion point. Normally P will always have a corresponding
+ // memory access before which we can insert. However, with non-standard AA
+ // pipelines, there may be a mismatch between AA and MSSA, in which case we
+ // will scan for a memory access before P. In either case, we know for sure
+ // that at least the load will have a memory access.
+  // TODO: Simplify this once P is determined by MSSA, in which case the
+ // discrepancy can no longer occur.
+ MemoryUseOrDef *MemInsertPoint = nullptr;
+ if (MSSAU) {
+ if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(P)) {
+ MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator());
+ } else {
+ const Instruction *ConstP = P;
+ for (const Instruction &I : make_range(++ConstP->getReverseIterator(),
+ ++LI->getReverseIterator())) {
+ if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
+ MemInsertPoint = MA;
+ break;
+ }
+ }
+ }
+ }
+
+ // We made it, we need to lift.
for (auto *I : llvm::reverse(ToLift)) {
LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
+ if (MSSAU) {
+ assert(MemInsertPoint && "Must have found insert point");
+ if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) {
+ MSSAU->moveAfter(MA, MemInsertPoint);
+ MemInsertPoint = MA;
+ }
+ }
}
return true;
@@ -515,23 +652,30 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
const DataLayout &DL = SI->getModule()->getDataLayout();
+ Value *StoredVal = SI->getValueOperand();
+
+ // Not all the transforms below are correct for non-integral pointers, bail
+ // until we've audited the individual pieces.
+ if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
+ return false;
+
// Load to store forwarding can be interpreted as memcpy.
- if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
if (LI->isSimple() && LI->hasOneUse() &&
LI->getParent() == SI->getParent()) {
auto *T = LI->getType();
if (T->isAggregateType()) {
- AliasAnalysis &AA = LookupAliasAnalysis();
MemoryLocation LoadLoc = MemoryLocation::get(LI);
// We use alias analysis to check if an instruction may store to
// the memory we load from in between the load and the store. If
// such an instruction is found, we try to promote there instead
// of at the store position.
+ // TODO: Can use MSSA for this.
Instruction *P = SI;
for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
- if (isModSet(AA.getModRefInfo(&I, LoadLoc))) {
+ if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
P = &I;
break;
}
@@ -542,7 +686,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// position if nothing alias the store memory after this and the store
// destination is not in the range.
if (P && P != SI) {
- if (!moveUp(AA, SI, P, LI))
+ if (!moveUp(SI, P, LI))
P = nullptr;
}
@@ -553,7 +697,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
        // memmove must be used to preserve semantics. If not, memcpy can
// be used.
bool UseMemMove = false;
- if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+ if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc))
UseMemMove = true;
uint64_t Size = DL.getTypeStoreSize(T);
@@ -572,10 +716,16 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
<< *M << "\n");
- MD->removeInstruction(SI);
- SI->eraseFromParent();
- MD->removeInstruction(LI);
- LI->eraseFromParent();
+ if (MSSAU) {
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
+ auto *NewAccess =
+ MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
+ }
+
+ eraseInstruction(SI);
+ eraseInstruction(LI);
++NumMemCpyInstr;
// Make sure we do not invalidate the iterator.
@@ -587,44 +737,50 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// Detect cases where we're performing call slot forwarding, but
// happen to be using a load-store pair to implement it, rather than
// a memcpy.
- MemDepResult ldep = MD->getDependency(LI);
CallInst *C = nullptr;
- if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
- C = dyn_cast<CallInst>(ldep.getInst());
+ if (EnableMemorySSA) {
+ if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
+ MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
+        // The load must post-dominate the call. Limit to the same block for now.
+ // TODO: Support non-local call-slot optimization?
+ if (LoadClobber->getBlock() == SI->getParent())
+ C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
+ }
+ } else {
+ MemDepResult ldep = MD->getDependency(LI);
+ if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+ C = dyn_cast<CallInst>(ldep.getInst());
+ }
if (C) {
// Check that nothing touches the dest of the "copy" between
// the call and the store.
- Value *CpyDest = SI->getPointerOperand()->stripPointerCasts();
- bool CpyDestIsLocal = isa<AllocaInst>(CpyDest);
- AliasAnalysis &AA = LookupAliasAnalysis();
MemoryLocation StoreLoc = MemoryLocation::get(SI);
- for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
- I != E; --I) {
- if (isModOrRefSet(AA.getModRefInfo(&*I, StoreLoc))) {
+ if (EnableMemorySSA) {
+ if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
+ MSSA->getMemoryAccess(SI)))
C = nullptr;
- break;
- }
- // The store to dest may never happen if an exception can be thrown
- // between the load and the store.
- if (I->mayThrow() && !CpyDestIsLocal) {
- C = nullptr;
- break;
+ } else {
+ for (BasicBlock::iterator I = --SI->getIterator(),
+ E = C->getIterator();
+ I != E; --I) {
+ if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) {
+ C = nullptr;
+ break;
+ }
}
}
}
if (C) {
bool changed = performCallSlotOptzn(
- LI, SI->getPointerOperand()->stripPointerCasts(),
+ LI, SI, SI->getPointerOperand()->stripPointerCasts(),
LI->getPointerOperand()->stripPointerCasts(),
DL.getTypeStoreSize(SI->getOperand(0)->getType()),
commonAlignment(SI->getAlign(), LI->getAlign()), C);
if (changed) {
- MD->removeInstruction(SI);
- SI->eraseFromParent();
- MD->removeInstruction(LI);
- LI->eraseFromParent();
+ eraseInstruction(SI);
+ eraseInstruction(LI);
++NumMemCpyInstr;
return true;
}
@@ -658,8 +814,15 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
- MD->removeInstruction(SI);
- SI->eraseFromParent();
+ if (MSSAU) {
+ assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)));
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
+ auto *NewAccess = MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
+ }
+
+ eraseInstruction(SI);
NumMemSetInfer++;
// Make sure we do not invalidate the iterator.
@@ -686,7 +849,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
-bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
+bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
+ Instruction *cpyStore, Value *cpyDest,
Value *cpySrc, uint64_t cpyLen,
Align cpyAlign, CallInst *C) {
// The general transformation to keep in mind is
@@ -717,7 +881,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
if (!srcArraySize)
return false;
- const DataLayout &DL = cpy->getModule()->getDataLayout();
+ const DataLayout &DL = cpyLoad->getModule()->getDataLayout();
uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
srcArraySize->getZExtValue();
@@ -727,43 +891,26 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
// Check that accessing the first srcSize bytes of dest will not cause a
// trap. Otherwise the transform is invalid since it might cause a trap
// to occur earlier than it otherwise would.
- if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
- // The destination is an alloca. Check it is larger than srcSize.
- ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
- if (!destArraySize)
- return false;
-
- uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
- destArraySize->getZExtValue();
-
- if (destSize < srcSize)
- return false;
- } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
- // The store to dest may never happen if the call can throw.
- if (C->mayThrow())
- return false;
-
- if (A->getDereferenceableBytes() < srcSize) {
- // If the destination is an sret parameter then only accesses that are
- // outside of the returned struct type can trap.
- if (!A->hasStructRetAttr())
- return false;
-
- Type *StructTy = cast<PointerType>(A->getType())->getElementType();
- if (!StructTy->isSized()) {
- // The call may never return and hence the copy-instruction may never
- // be executed, and therefore it's not safe to say "the destination
- // has at least <cpyLen> bytes, as implied by the copy-instruction",
- return false;
- }
+ if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpyLen),
+ DL, C, DT))
+ return false;
- uint64_t destSize = DL.getTypeAllocSize(StructTy);
- if (destSize < srcSize)
- return false;
- }
- } else {
+ // Make sure that nothing can observe cpyDest being written early. There are
+ // a number of cases to consider:
+ // 1. cpyDest cannot be accessed between C and cpyStore as a precondition of
+ // the transform.
+ // 2. C itself may not access cpyDest (prior to the transform). This is
+ // checked further below.
+ // 3. If cpyDest is accessible to the caller of this function (potentially
+ // captured and not based on an alloca), we need to ensure that we cannot
+ // unwind between C and cpyStore. This is checked here.
+ // 4. If cpyDest is potentially captured, there may be accesses to it from
+ // another thread. In this case, we need to check that cpyStore is
+ // guaranteed to be executed if C is. As it is a non-atomic access, it
+ // renders accesses from other threads undefined.
+ // TODO: This is currently not checked.
+ if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore))
return false;
- }
// Check that dest points to memory that is at least as aligned as src.
Align srcAlign = srcAlloca->getAlign();
@@ -777,29 +924,26 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
// guarantees that it holds only undefined values when passed in (so the final
// memcpy can be dropped), that it is not read or written between the call and
// the memcpy, and that writing beyond the end of it is undefined.
- SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
- srcAlloca->user_end());
+ SmallVector<User *, 8> srcUseList(srcAlloca->users());
while (!srcUseList.empty()) {
User *U = srcUseList.pop_back_val();
if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
- for (User *UU : U->users())
- srcUseList.push_back(UU);
+ append_range(srcUseList, U->users());
continue;
}
if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
if (!G->hasAllZeroIndices())
return false;
- for (User *UU : U->users())
- srcUseList.push_back(UU);
+ append_range(srcUseList, U->users());
continue;
}
if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
if (IT->isLifetimeStartOrEnd())
continue;
- if (U != C && U != cpy)
+ if (U != C && U != cpyLoad)
return false;
}
@@ -811,20 +955,24 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
// Since we're changing the parameter to the callsite, we need to make sure
// that what would be the new parameter dominates the callsite.
- DominatorTree &DT = LookupDomTree();
- if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
- if (!DT.dominates(cpyDestInst, C))
+ if (!DT->dominates(cpyDest, C)) {
+ // Support moving a constant index GEP before the call.
+ auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
+ if (GEP && GEP->hasAllConstantIndices() &&
+ DT->dominates(GEP->getPointerOperand(), C))
+ GEP->moveBefore(C);
+ else
return false;
+ }
// In addition to knowing that the call does not access src in some
// unexpected manner, for example via a global, which we deduce from
// the use analysis, we also need to know that it does not sneakily
// access dest. We rely on AA to figure this out for us.
- AliasAnalysis &AA = LookupAliasAnalysis();
- ModRefInfo MR = AA.getModRefInfo(C, cpyDest, LocationSize::precise(srcSize));
+ ModRefInfo MR = AA->getModRefInfo(C, cpyDest, LocationSize::precise(srcSize));
// If necessary, perform additional analysis.
if (isModOrRefSet(MR))
- MR = AA.callCapturesBefore(C, cpyDest, LocationSize::precise(srcSize), &DT);
+ MR = AA->callCapturesBefore(C, cpyDest, LocationSize::precise(srcSize), DT);
if (isModOrRefSet(MR))
return false;
@@ -866,7 +1014,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
// Drop any cached information about the call, because we may have changed
// its dependence information by changing its parameter.
- MD->removeInstruction(C);
+ if (MD)
+ MD->removeInstruction(C);
// Update AA metadata
// FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
@@ -875,12 +1024,9 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
LLVMContext::MD_noalias,
LLVMContext::MD_invariant_group,
LLVMContext::MD_access_group};
- combineMetadata(C, cpy, KnownIDs, true);
-
- // Remove the memcpy.
- MD->removeInstruction(cpy);
- ++NumMemCpyInstr;
+ combineMetadata(C, cpyLoad, KnownIDs, true);
+ ++NumCallSlot;
return true;
}
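For reference, a source-level sketch of the call slot optimization this routine performs; the `Big` type and `compute` callee are invented for illustration, not taken from the pass:

    #include <cstring>

    struct Big { char bytes[64]; };

    // Hypothetical callee that writes its whole result through 'out'.
    static void compute(Big *out) { std::memset(out->bytes, 1, sizeof(out->bytes)); }

    // Before: the call fills a temporary which is then copied into 'dest'.
    void before(Big *dest) {
      Big tmp;
      compute(&tmp);
      std::memcpy(dest, &tmp, sizeof(Big));
    }

    // After the call slot optimization: the call writes into 'dest' directly,
    // so the temporary and the memcpy become dead.
    void after(Big *dest) {
      compute(dest);
    }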
@@ -908,8 +1054,6 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
return false;
- AliasAnalysis &AA = LookupAliasAnalysis();
-
// Verify that the copied-from memory doesn't change in between the two
// transfers. For example, in:
// memcpy(a <- b)
@@ -919,21 +1063,28 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
//
// TODO: If the code between M and MDep is transparent to the destination "c",
// then we could still perform the xform by moving M up to the first memcpy.
- //
- // NOTE: This is conservative, it will stop on any read from the source loc,
- // not just the defining memcpy.
- MemDepResult SourceDep =
- MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
- M->getIterator(), M->getParent());
- if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
- return false;
+ if (EnableMemorySSA) {
+ // TODO: It would be sufficient to check the MDep source up to the memcpy
+ // size of M, rather than MDep.
+ if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
+ MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
+ return false;
+ } else {
+    // NOTE: This is conservative; it will stop on any read from the source loc,
+    // NOTE: This is conservative; it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep =
+ MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
+ M->getIterator(), M->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+ }
// If the dest of the second might alias the source of the first, then the
// source and dest might overlap. We still want to eliminate the intermediate
// value, but we have to generate a memmove instead of memcpy.
bool UseMemMove = false;
- if (!AA.isNoAlias(MemoryLocation::getForDest(M),
- MemoryLocation::getForSource(MDep)))
+ if (!AA->isNoAlias(MemoryLocation::getForDest(M),
+ MemoryLocation::getForSource(MDep)))
UseMemMove = true;
// If all checks passed, then we can transform M.
@@ -943,18 +1094,25 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.
IRBuilder<> Builder(M);
+ Instruction *NewM;
if (UseMemMove)
- Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
- MDep->getRawSource(), MDep->getSourceAlign(),
- M->getLength(), M->isVolatile());
+ NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
+ MDep->getRawSource(), MDep->getSourceAlign(),
+ M->getLength(), M->isVolatile());
else
- Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
- MDep->getRawSource(), MDep->getSourceAlign(),
- M->getLength(), M->isVolatile());
+ NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
+ MDep->getRawSource(), MDep->getSourceAlign(),
+ M->getLength(), M->isVolatile());
+
+ if (MSSAU) {
+ assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
+ auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
+ auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
+ }
// Remove the instruction we're replacing.
- MD->removeInstruction(M);
- M->eraseFromParent();
+ eraseInstruction(M);
++NumMemCpyInstr;
return true;
}
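As a rough illustration of the memcpy-memcpy forwarding above (buffer names are made up), the second copy can be redirected to the original source as long as that source is not written in between:

    #include <cstddef>
    #include <cstring>

    void before(char *c, char *a, const char *b, std::size_t n) {
      std::memcpy(a, b, n); // MDep: a <- b
      std::memcpy(c, a, n); // M:    c <- a
    }

    // After the transform: M reads from MDep's source, which may leave the
    // intermediate copy dead (to be cleaned up by DSE). If 'c' could alias
    // 'b', a memmove is emitted instead.
    void after(char *c, char *a, const char *b, std::size_t n) {
      std::memcpy(a, b, n);
      std::memcpy(c, b, n); // c <- b
    }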
@@ -979,18 +1137,41 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
if (MemSet->getDest() != MemCpy->getDest())
return false;
- // Check that there are no other dependencies on the memset destination.
- MemDepResult DstDepInfo =
- MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
- MemCpy->getIterator(), MemCpy->getParent());
- if (DstDepInfo.getInst() != MemSet)
+ // Check that src and dst of the memcpy aren't the same. While memcpy
+ // operands cannot partially overlap, exact equality is allowed.
+ if (!AA->isNoAlias(MemoryLocation(MemCpy->getSource(),
+ LocationSize::precise(1)),
+ MemoryLocation(MemCpy->getDest(),
+ LocationSize::precise(1))))
return false;
+ if (EnableMemorySSA) {
+ // We know that dst up to src_size is not written. We now need to make sure
+ // that dst up to dst_size is not accessed. (If we did not move the memset,
+ // checking for reads would be sufficient.)
+ if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet),
+ MSSA->getMemoryAccess(MemSet),
+ MSSA->getMemoryAccess(MemCpy))) {
+ return false;
+ }
+ } else {
+ // We have already checked that dst up to src_size is not accessed. We
+ // need to make sure that there are no accesses up to dst_size either.
+ MemDepResult DstDepInfo = MD->getPointerDependencyFrom(
+ MemoryLocation::getForDest(MemSet), false, MemCpy->getIterator(),
+ MemCpy->getParent());
+ if (DstDepInfo.getInst() != MemSet)
+ return false;
+ }
+
// Use the same i8* dest as the memcpy, killing the memset dest if different.
Value *Dest = MemCpy->getRawDest();
Value *DestSize = MemSet->getLength();
Value *SrcSize = MemCpy->getLength();
+ if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy))
+ return false;
+
// By default, create an unaligned memset.
unsigned Align = 1;
// If Dest is aligned, and SrcSize is constant, use the minimum alignment
@@ -1016,13 +1197,25 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
Value *MemsetLen = Builder.CreateSelect(
Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
- Builder.CreateMemSet(
+ Instruction *NewMemSet = Builder.CreateMemSet(
Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
SrcSize),
MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));
- MD->removeInstruction(MemSet);
- MemSet->eraseFromParent();
+ if (MSSAU) {
+ assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
+ "MemCpy must be a MemoryDef");
+ // The new memset is inserted after the memcpy, but it is known that its
+    // defining access is the memset about to be removed, which immediately
+ // precedes the memcpy.
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
+ auto *NewAccess = MSSAU->createMemoryAccessBefore(
+ NewMemSet, LastDef->getDefiningAccess(), LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
+ }
+
+ eraseInstruction(MemSet);
return true;
}
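A sketch of the memset/memcpy split performed above, assuming the memset covers at least as many bytes as the copy (the select emitted above handles the opposite case with a zero-length memset):

    #include <cstddef>
    #include <cstring>

    void before(char *dst, const char *src, std::size_t dst_size,
                std::size_t src_size) {
      std::memset(dst, 0, dst_size);
      std::memcpy(dst, src, src_size); // overwrites the first src_size bytes
    }

    // After: only the tail that the memcpy does not cover is still memset.
    void after(char *dst, const char *src, std::size_t dst_size,
               std::size_t src_size) {
      std::memcpy(dst, src, src_size);
      std::memset(dst + src_size, 0, dst_size - src_size);
    }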
@@ -1041,6 +1234,24 @@ static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
return false;
}
+static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
+ MemoryDef *Def, ConstantInt *Size) {
+ if (MSSA->isLiveOnEntryDef(Def))
+ return isa<AllocaInst>(getUnderlyingObject(V));
+
+ if (IntrinsicInst *II =
+ dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+ ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0));
+ if (AA->isMustAlias(V, II->getArgOperand(1)) &&
+ LTSize->getZExtValue() >= Size->getZExtValue())
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// Transform memcpy to memset when its source was just memset.
/// In other words, turn:
/// \code
@@ -1057,11 +1268,9 @@ static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
/// The \p MemCpy must have a Constant length.
bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
MemSetInst *MemSet) {
- AliasAnalysis &AA = LookupAliasAnalysis();
-
// Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
// memcpying from the same address. Otherwise it is hard to reason about.
- if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
+ if (!AA->isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
return false;
// A known memset size is required.
@@ -1078,17 +1287,37 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
// interested in the bytes from MemSetSize..CopySize here, but as we can't
// easily represent this location, we use the full 0..CopySize range.
MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
- MemDepResult DepInfo = MD->getPointerDependencyFrom(
- MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
- if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
- CopySize = MemSetSize;
- else
+ bool CanReduceSize = false;
+ if (EnableMemorySSA) {
+ MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ MemSetAccess->getDefiningAccess(), MemCpyLoc);
+ if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+ if (hasUndefContentsMSSA(MSSA, AA, MemCpy->getSource(), MD, CopySize))
+ CanReduceSize = true;
+ } else {
+ MemDepResult DepInfo = MD->getPointerDependencyFrom(
+ MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
+ if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
+ CanReduceSize = true;
+ }
+
+ if (!CanReduceSize)
return false;
+ CopySize = MemSetSize;
}
IRBuilder<> Builder(MemCpy);
- Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), CopySize,
- MaybeAlign(MemCpy->getDestAlignment()));
+ Instruction *NewM =
+ Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
+ CopySize, MaybeAlign(MemCpy->getDestAlignment()));
+ if (MSSAU) {
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
+ auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
+ }
+
return true;
}
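A sketch of the memcpy-to-memset rewrite, assuming the copy size does not exceed the memset size (when it does, the code above additionally requires the extra source bytes to be undef and shrinks the size):

    #include <cstddef>
    #include <cstring>

    void before(char *dst, char *src, std::size_t n) {
      std::memset(src, 0x2a, n);
      std::memcpy(dst, src, n); // every copied byte is known to be 0x2a
    }

    // After: the destination is memset directly; the source need not be read.
    void after(char *dst, char *src, std::size_t n) {
      std::memset(src, 0x2a, n);
      std::memset(dst, 0x2a, n);
    }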
@@ -1104,8 +1333,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
// If the source and destination of the memcpy are the same, then zap it.
if (M->getSource() == M->getDest()) {
++BBI;
- MD->removeInstruction(M);
- M->eraseFromParent();
+ eraseInstruction(M);
return true;
}
@@ -1115,73 +1343,156 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
M->getModule()->getDataLayout())) {
IRBuilder<> Builder(M);
- Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
- MaybeAlign(M->getDestAlignment()), false);
- MD->removeInstruction(M);
- M->eraseFromParent();
+ Instruction *NewM =
+ Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
+ MaybeAlign(M->getDestAlignment()), false);
+ if (MSSAU) {
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
+ auto *NewAccess =
+ MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
+ }
+
+ eraseInstruction(M);
++NumCpyToSet;
return true;
}
- MemDepResult DepInfo = MD->getDependency(M);
-
- // Try to turn a partially redundant memset + memcpy into
- // memcpy + smaller memset. We don't need the memcpy size for this.
- if (DepInfo.isClobber())
- if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
- if (processMemSetMemCpyDependence(M, MDep))
- return true;
+ if (EnableMemorySSA) {
+ MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
+ MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
+ MemoryLocation DestLoc = MemoryLocation::getForDest(M);
+ const MemoryAccess *DestClobber =
+ MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
+
+ // Try to turn a partially redundant memset + memcpy into
+ // memcpy + smaller memset. We don't need the memcpy size for this.
+    // The memcpy must post-dominate the memset, so limit this to the same basic
+ // block. A non-local generalization is likely not worthwhile.
+ if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
+ if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
+ if (DestClobber->getBlock() == M->getParent())
+ if (processMemSetMemCpyDependence(M, MDep))
+ return true;
+
+ // The optimizations after this point require the memcpy size.
+ ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+ if (!CopySize) return false;
+
+ MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ AnyClobber, MemoryLocation::getForSource(M));
+
+ // There are four possible optimizations we can do for memcpy:
+    // a) memcpy-memcpy xform which exposes redundancy for DSE.
+ // b) call-memcpy xform for return slot optimization.
+ // c) memcpy from freshly alloca'd space or space that has just started
+ // its lifetime copies undefined data, and we can therefore eliminate
+ // the memcpy in favor of the data that was already at the destination.
+ // d) memcpy from a just-memset'd source can be turned into memset.
+ if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
+ if (Instruction *MI = MD->getMemoryInst()) {
+ if (auto *C = dyn_cast<CallInst>(MI)) {
+ // The memcpy must post-dom the call. Limit to the same block for now.
+ // Additionally, we need to ensure that there are no accesses to dest
+ // between the call and the memcpy. Accesses to src will be checked
+ // by performCallSlotOptzn().
+ // TODO: Support non-local call-slot optimization?
+ if (C->getParent() == M->getParent() &&
+ !accessedBetween(*AA, DestLoc, MD, MA)) {
+ // FIXME: Can we pass in either of dest/src alignment here instead
+ // of conservatively taking the minimum?
+ Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+ M->getSourceAlign().valueOrOne());
+ if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
+ CopySize->getZExtValue(), Alignment, C)) {
+ LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
+ << " call: " << *C << "\n"
+ << " memcpy: " << *M << "\n");
+ eraseInstruction(M);
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+ }
+ if (auto *MDep = dyn_cast<MemCpyInst>(MI))
+ return processMemCpyMemCpyDependence(M, MDep);
+ if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
+ if (performMemCpyToMemSetOptzn(M, MDep)) {
+ LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
+ eraseInstruction(M);
+ ++NumCpyToSet;
+ return true;
+ }
+ }
+ }
- // The optimizations after this point require the memcpy size.
- ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
- if (!CopySize) return false;
-
- // There are four possible optimizations we can do for memcpy:
- // a) memcpy-memcpy xform which exposes redundance for DSE.
- // b) call-memcpy xform for return slot optimization.
- // c) memcpy from freshly alloca'd space or space that has just started its
- // lifetime copies undefined data, and we can therefore eliminate the
- // memcpy in favor of the data that was already at the destination.
- // d) memcpy from a just-memset'd source can be turned into memset.
- if (DepInfo.isClobber()) {
- if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
- // FIXME: Can we pass in either of dest/src alignment here instead
- // of conservatively taking the minimum?
- Align Alignment = std::min(M->getDestAlign().valueOrOne(),
- M->getSourceAlign().valueOrOne());
- if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), Alignment, C)) {
- MD->removeInstruction(M);
- M->eraseFromParent();
+ if (hasUndefContentsMSSA(MSSA, AA, M->getSource(), MD, CopySize)) {
+ LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
+ eraseInstruction(M);
+ ++NumMemCpyInstr;
return true;
}
}
- }
+ } else {
+ MemDepResult DepInfo = MD->getDependency(M);
- MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
- MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
- SrcLoc, true, M->getIterator(), M->getParent());
-
- if (SrcDepInfo.isClobber()) {
- if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
- return processMemCpyMemCpyDependence(M, MDep);
- } else if (SrcDepInfo.isDef()) {
- if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
- MD->removeInstruction(M);
- M->eraseFromParent();
- ++NumMemCpyInstr;
- return true;
+ // Try to turn a partially redundant memset + memcpy into
+ // memcpy + smaller memset. We don't need the memcpy size for this.
+ if (DepInfo.isClobber())
+ if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
+ if (processMemSetMemCpyDependence(M, MDep))
+ return true;
+
+ // The optimizations after this point require the memcpy size.
+ ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+ if (!CopySize) return false;
+
+ // There are four possible optimizations we can do for memcpy:
+    // a) memcpy-memcpy xform which exposes redundancy for DSE.
+ // b) call-memcpy xform for return slot optimization.
+ // c) memcpy from freshly alloca'd space or space that has just started
+ // its lifetime copies undefined data, and we can therefore eliminate
+ // the memcpy in favor of the data that was already at the destination.
+ // d) memcpy from a just-memset'd source can be turned into memset.
+ if (DepInfo.isClobber()) {
+ if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+ // FIXME: Can we pass in either of dest/src alignment here instead
+ // of conservatively taking the minimum?
+ Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+ M->getSourceAlign().valueOrOne());
+ if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
+ CopySize->getZExtValue(), Alignment, C)) {
+ eraseInstruction(M);
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
}
- }
- if (SrcDepInfo.isClobber())
- if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
- if (performMemCpyToMemSetOptzn(M, MDep)) {
- MD->removeInstruction(M);
- M->eraseFromParent();
- ++NumCpyToSet;
+ MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+ MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
+ SrcLoc, true, M->getIterator(), M->getParent());
+
+ if (SrcDepInfo.isClobber()) {
+ if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+ return processMemCpyMemCpyDependence(M, MDep);
+ } else if (SrcDepInfo.isDef()) {
+ if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
+ eraseInstruction(M);
+ ++NumMemCpyInstr;
return true;
}
+ }
+
+ if (SrcDepInfo.isClobber())
+ if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+ if (performMemCpyToMemSetOptzn(M, MDep)) {
+ eraseInstruction(M);
+ ++NumCpyToSet;
+ return true;
+ }
+ }
return false;
}
@@ -1189,14 +1500,12 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
/// not to alias.
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
- AliasAnalysis &AA = LookupAliasAnalysis();
-
if (!TLI->has(LibFunc_memmove))
return false;
// See if the pointers alias.
- if (!AA.isNoAlias(MemoryLocation::getForDest(M),
- MemoryLocation::getForSource(M)))
+ if (!AA->isNoAlias(MemoryLocation::getForDest(M),
+ MemoryLocation::getForSource(M)))
return false;
LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
@@ -1209,9 +1518,13 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
Intrinsic::memcpy, ArgTys));
+ // For MemorySSA nothing really changes (except that memcpy may imply stricter
+ // aliasing guarantees).
+
// MemDep may have over conservative information about this instruction, just
// conservatively flush it from the cache.
- MD->removeInstruction(M);
+ if (MD)
+ MD->removeInstruction(M);
++NumMoveToCpy;
return true;
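A minimal sketch of the memmove-to-memcpy strengthening, assuming alias analysis can prove the two ranges never overlap (here they are distinct objects):

    #include <cstring>

    // Before: memmove is used defensively even though dst and src are
    // distinct arrays and can never overlap.
    void before(char (&dst)[32], const char (&src)[32]) {
      std::memmove(dst, src, sizeof(dst));
    }

    // After: with no-alias established, the call is rewritten to memcpy.
    void after(char (&dst)[32], const char (&src)[32]) {
      std::memcpy(dst, src, sizeof(dst));
    }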
@@ -1224,16 +1537,25 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
Value *ByValArg = CB.getArgOperand(ArgNo);
Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
- MemDepResult DepInfo = MD->getPointerDependencyFrom(
- MemoryLocation(ByValArg, LocationSize::precise(ByValSize)), true,
- CB.getIterator(), CB.getParent());
- if (!DepInfo.isClobber())
- return false;
+ MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
+ MemCpyInst *MDep = nullptr;
+ if (EnableMemorySSA) {
+ MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ CallAccess->getDefiningAccess(), Loc);
+ if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+ MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
+ } else {
+ MemDepResult DepInfo = MD->getPointerDependencyFrom(
+ Loc, true, CB.getIterator(), CB.getParent());
+ if (!DepInfo.isClobber())
+ return false;
+ MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+ }
// If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
// a memcpy, see if we can byval from the source of the memcpy instead of the
// result.
- MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
if (!MDep || MDep->isVolatile() ||
ByValArg->stripPointerCasts() != MDep->getDest())
return false;
@@ -1250,12 +1572,10 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// If it is greater than the memcpy, then we check to see if we can force the
// source of the memcpy to the alignment we need. If we fail, we bail out.
- AssumptionCache &AC = LookupAssumptionCache();
- DominatorTree &DT = LookupDomTree();
MaybeAlign MemDepAlign = MDep->getSourceAlign();
if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
- getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, &AC,
- &DT) < *ByValAlign)
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, AC,
+ DT) < *ByValAlign)
return false;
// The address space of the memcpy source must match the byval argument
@@ -1269,14 +1589,19 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// *b = 42;
// foo(*a)
// It would be invalid to transform the second memcpy into foo(*b).
- //
- // NOTE: This is conservative, it will stop on any read from the source loc,
- // not just the defining memcpy.
- MemDepResult SourceDep = MD->getPointerDependencyFrom(
- MemoryLocation::getForSource(MDep), false,
- CB.getIterator(), MDep->getParent());
- if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
- return false;
+ if (EnableMemorySSA) {
+ if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
+ MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
+ return false;
+ } else {
+    // NOTE: This is conservative; it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep = MD->getPointerDependencyFrom(
+ MemoryLocation::getForSource(MDep), false,
+ CB.getIterator(), MDep->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+ }
Value *TmpCast = MDep->getSource();
if (MDep->getSource()->getType() != ByValArg->getType()) {
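A source-level sketch of the byval forwarding handled here (names are invented): the call's by-value argument is fed from the memcpy's source instead of the copied temporary, provided the source is not written between the copy and the call.

    #include <cstring>

    struct S { int v[16]; };

    static void callee(S by_val) { (void)by_val; } // takes its argument by value

    // Before: a temporary is built with memcpy and passed to the call.
    void before(const S &orig) {
      S tmp;
      std::memcpy(&tmp, &orig, sizeof(S));
      callee(tmp);
    }

    // After: the by-value copy is taken directly from the memcpy's source.
    void after(const S &orig) {
      callee(orig);
    }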
@@ -1301,15 +1626,13 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
bool MemCpyOptPass::iterateOnFunction(Function &F) {
bool MadeChange = false;
- DominatorTree &DT = LookupDomTree();
-
// Walk all instruction in the function.
for (BasicBlock &BB : F) {
// Skip unreachable blocks. For example processStore assumes that an
// instruction in a BB can't be dominated by a later instruction in the
// same BB (which is a scenario that can happen for an unreachable BB that
// has itself as a predecessor).
- if (!DT.isReachableFromEntry(&BB))
+ if (!DT->isReachableFromEntry(&BB))
continue;
for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
@@ -1345,43 +1668,43 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
}
PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ auto *MD = !EnableMemorySSA ? &AM.getResult<MemoryDependenceAnalysis>(F)
+ : AM.getCachedResult<MemoryDependenceAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
-
- auto LookupAliasAnalysis = [&]() -> AliasAnalysis & {
- return AM.getResult<AAManager>(F);
- };
- auto LookupAssumptionCache = [&]() -> AssumptionCache & {
- return AM.getResult<AssumptionAnalysis>(F);
- };
- auto LookupDomTree = [&]() -> DominatorTree & {
- return AM.getResult<DominatorTreeAnalysis>(F);
- };
-
- bool MadeChange = runImpl(F, &MD, &TLI, LookupAliasAnalysis,
- LookupAssumptionCache, LookupDomTree);
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F)
+ : AM.getCachedResult<MemorySSAAnalysis>(F);
+
+ bool MadeChange =
+ runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
if (!MadeChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
- PA.preserve<MemoryDependenceAnalysis>();
+ if (MD)
+ PA.preserve<MemoryDependenceAnalysis>();
+ if (MSSA)
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
-bool MemCpyOptPass::runImpl(
- Function &F, MemoryDependenceResults *MD_, TargetLibraryInfo *TLI_,
- std::function<AliasAnalysis &()> LookupAliasAnalysis_,
- std::function<AssumptionCache &()> LookupAssumptionCache_,
- std::function<DominatorTree &()> LookupDomTree_) {
+bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
+ TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+ AssumptionCache *AC_, DominatorTree *DT_,
+ MemorySSA *MSSA_) {
bool MadeChange = false;
MD = MD_;
TLI = TLI_;
- LookupAliasAnalysis = std::move(LookupAliasAnalysis_);
- LookupAssumptionCache = std::move(LookupAssumptionCache_);
- LookupDomTree = std::move(LookupDomTree_);
-
+ AA = AA_;
+ AC = AC_;
+ DT = DT_;
+ MSSA = MSSA_;
+ MemorySSAUpdater MSSAU_(MSSA_);
+ MSSAU = MSSA_ ? &MSSAU_ : nullptr;
// If we don't have at least memset and memcpy, there is little point of doing
// anything here. These are required by a freestanding implementation, so if
// even they are disabled, there is no point in trying hard.
@@ -1394,6 +1717,9 @@ bool MemCpyOptPass::runImpl(
MadeChange = true;
}
+ if (MSSA_ && VerifyMemorySSA)
+ MSSA_->verifyMemorySSA();
+
MD = nullptr;
return MadeChange;
}
@@ -1403,19 +1729,17 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto *MDWP = !EnableMemorySSA
+ ? &getAnalysis<MemoryDependenceWrapperPass>()
+ : getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
- return getAnalysis<AAResultsWrapperPass>().getAAResults();
- };
- auto LookupAssumptionCache = [this, &F]() -> AssumptionCache & {
- return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- };
- auto LookupDomTree = [this]() -> DominatorTree & {
- return getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- };
-
- return Impl.runImpl(F, MD, TLI, LookupAliasAnalysis, LookupAssumptionCache,
- LookupDomTree);
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *MSSAWP = EnableMemorySSA
+ ? &getAnalysis<MemorySSAWrapperPass>()
+ : getAnalysisIfAvailable<MemorySSAWrapperPass>();
+
+  return Impl.runImpl(F, MDWP ? &MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
+ MSSAWP ? &MSSAWP->getMSSA() : nullptr);
}
diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index ce1e142101b8..7f8b75ac8806 100644
--- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -372,7 +372,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
} else {
// In this case, we expect a constant incoming value (the comparison is
// chained).
- const auto *const Const = dyn_cast<ConstantInt>(Val);
+ const auto *const Const = cast<ConstantInt>(Val);
LLVM_DEBUG(dbgs() << "const\n");
if (!Const->isZero()) return {};
LLVM_DEBUG(dbgs() << "false\n");
@@ -624,6 +624,17 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
Value *IsEqual = nullptr;
LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> "
<< BB->getName() << "\n");
+
+ // If there is one block that requires splitting, we do it now, i.e.
+ // just before we know we will collapse the chain. The instructions
+ // can be executed before any of the instructions in the chain.
+ const auto ToSplit = llvm::find_if(
+ Comparisons, [](const BCECmpBlock &B) { return B.RequireSplit; });
+ if (ToSplit != Comparisons.end()) {
+ LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n");
+ ToSplit->split(BB, AA);
+ }
+
if (Comparisons.size() == 1) {
LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
Value *const LhsLoad =
@@ -633,17 +644,6 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
// There are no blocks to merge, just do the comparison.
IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
} else {
- // If there is one block that requires splitting, we do it now, i.e.
- // just before we know we will collapse the chain. The instructions
- // can be executed before any of the instructions in the chain.
- const auto ToSplit =
- std::find_if(Comparisons.begin(), Comparisons.end(),
- [](const BCECmpBlock &B) { return B.RequireSplit; });
- if (ToSplit != Comparisons.end()) {
- LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n");
- ToSplit->split(BB, AA);
- }
-
const unsigned TotalSizeBits = std::accumulate(
Comparisons.begin(), Comparisons.end(), 0u,
[](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); });
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 4e010f8704d0..32bb62129e8f 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -213,54 +213,33 @@ bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
return Changed;
}
-// Explicitly list the instruction types NaryReassociate handles for now.
-static bool isPotentiallyNaryReassociable(Instruction *I) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::GetElementPtr:
- case Instruction::Mul:
- return true;
- default:
- return false;
- }
-}
-
bool NaryReassociatePass::doOneIteration(Function &F) {
bool Changed = false;
SeenExprs.clear();
// Process the basic blocks in a depth first traversal of the dominator
// tree. This order ensures that all bases of a candidate are in Candidates
// when we process it.
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
for (const auto Node : depth_first(DT)) {
BasicBlock *BB = Node->getBlock();
for (auto I = BB->begin(); I != BB->end(); ++I) {
- if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) {
- const SCEV *OldSCEV = SE->getSCEV(&*I);
- if (Instruction *NewI = tryReassociate(&*I)) {
- Changed = true;
- SE->forgetValue(&*I);
- I->replaceAllUsesWith(NewI);
- WeakVH NewIExist = NewI;
- // If SeenExprs/NewIExist contains I's WeakTrackingVH/WeakVH, that
- // entry will be replaced with nullptr if deleted.
- RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
- if (!NewIExist) {
- // Rare occation where the new instruction (NewI) have been removed,
- // probably due to parts of the input code was dead from the
- // beginning, reset the iterator and start over from the beginning
- I = BB->begin();
- continue;
- }
- I = NewI->getIterator();
- }
- // Add the rewritten instruction to SeenExprs; the original instruction
- // is deleted.
- const SCEV *NewSCEV = SE->getSCEV(&*I);
- SeenExprs[NewSCEV].push_back(WeakTrackingVH(&*I));
+ Instruction *OrigI = &*I;
+ const SCEV *OrigSCEV = nullptr;
+ if (Instruction *NewI = tryReassociate(OrigI, OrigSCEV)) {
+ Changed = true;
+ OrigI->replaceAllUsesWith(NewI);
+
+ // Add 'OrigI' to the list of dead instructions.
+ DeadInsts.push_back(WeakTrackingVH(OrigI));
+ // Add the rewritten instruction to SeenExprs; the original
+ // instruction is deleted.
+ const SCEV *NewSCEV = SE->getSCEV(NewI);
+ SeenExprs[NewSCEV].push_back(WeakTrackingVH(NewI));
+
// Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
// is equivalent to I. However, ScalarEvolution::getSCEV may
- // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
- // we reassociate
+ // weaken nsw causing NewSCEV not to equal OldSCEV. For example,
+ // suppose we reassociate
// I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
// to
// NewI = &a[sext(i)] + sext(j).
@@ -274,32 +253,47 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
// equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
// map both SCEV before and after tryReassociate(I) to I.
//
- // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
- if (NewSCEV != OldSCEV)
- SeenExprs[OldSCEV].push_back(WeakTrackingVH(&*I));
- }
+ // This improvement is exercised in @reassociate_gep_nsw in
+ // nary-gep.ll.
+ if (NewSCEV != OrigSCEV)
+ SeenExprs[OrigSCEV].push_back(WeakTrackingVH(NewI));
+ } else if (OrigSCEV)
+ SeenExprs[OrigSCEV].push_back(WeakTrackingVH(OrigI));
}
}
+ // Delete all dead instructions from 'DeadInsts'.
+ // Please note ScalarEvolution is updated along the way.
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(
+ DeadInsts, TLI, nullptr, [this](Value *V) { SE->forgetValue(V); });
+
return Changed;
}
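A minimal sketch (plain C++, not the LLVM API) of the deferred-deletion pattern adopted above: dead entries are only recorded during the walk and swept in one batch afterwards, with a hook that lets a cached analysis forget each value first, as ScalarEvolution does via the callback passed to RecursivelyDeleteTriviallyDeadInstructionsPermissive.

    #include <algorithm>
    #include <functional>
    #include <string>
    #include <vector>

    struct Expr {
      std::string name;
      bool dead = false;
    };

    // Sweep all entries marked dead, notifying 'forget' before removal.
    void sweepDead(std::vector<Expr> &exprs,
                   const std::function<void(const Expr &)> &forget) {
      for (const Expr &e : exprs)
        if (e.dead)
          forget(e); // analogous to SE->forgetValue(V) in the pass
      exprs.erase(std::remove_if(exprs.begin(), exprs.end(),
                                 [](const Expr &e) { return e.dead; }),
                  exprs.end());
    }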
-Instruction *NaryReassociatePass::tryReassociate(Instruction *I) {
+Instruction *NaryReassociatePass::tryReassociate(Instruction *I,
+ const SCEV *&OrigSCEV) {
+
+ if (!SE->isSCEVable(I->getType()))
+ return nullptr;
+
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Mul:
+ OrigSCEV = SE->getSCEV(I);
return tryReassociateBinaryOp(cast<BinaryOperator>(I));
case Instruction::GetElementPtr:
+ OrigSCEV = SE->getSCEV(I);
return tryReassociateGEP(cast<GetElementPtrInst>(I));
default:
- llvm_unreachable("should be filtered out by isPotentiallyNaryReassociable");
+ return nullptr;
}
+
+ llvm_unreachable("should not be reached");
+ return nullptr;
}
static bool isGEPFoldable(GetElementPtrInst *GEP,
const TargetTransformInfo *TTI) {
- SmallVector<const Value*, 4> Indices;
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
- Indices.push_back(*I);
+ SmallVector<const Value *, 4> Indices(GEP->indices());
return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
Indices) == TargetTransformInfo::TCC_Free;
}
@@ -375,8 +369,8 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
// Replace the I-th index with LHS.
IndexExprs[I] = SE->getSCEV(LHS);
if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
- DL->getTypeSizeInBits(LHS->getType()) <
- DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) {
+ DL->getTypeSizeInBits(LHS->getType()).getFixedSize() <
+ DL->getTypeSizeInBits(GEP->getOperand(I)->getType()).getFixedSize()) {
// Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
// zext if the source operand is proved non-negative. We should do that
// consistently so that CandidateExpr more likely appears before. See
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 0ed1773373a7..281d47c8625f 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -662,7 +662,8 @@ public:
const DataLayout &DL)
: F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), AC(AC), DL(DL),
PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)),
- SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false) {}
+ SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false,
+ /*CanUseUndef=*/false) {}
bool runGVN();
@@ -800,12 +801,7 @@ private:
Value *findPHIOfOpsLeader(const Expression *, const Instruction *,
const BasicBlock *) const;
- // New instruction creation.
- void handleNewInstruction(Instruction *) {}
-
// Various instruction touch utilities
- template <typename Map, typename KeyType, typename Func>
- void for_each_found(Map &, const KeyType &, Func);
template <typename Map, typename KeyType>
void touchAndErase(Map &, const KeyType &);
void markUsersTouched(Value *);
@@ -834,7 +830,6 @@ private:
BasicBlock *getBlockForValue(Value *V) const;
void deleteExpression(const Expression *E) const;
MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
- MemoryAccess *getDefiningAccess(const MemoryAccess *) const;
MemoryPhi *getMemoryAccess(const BasicBlock *) const;
template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
@@ -1253,6 +1248,7 @@ const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
const CallExpression *
NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
// FIXME: Add operand bundles for calls.
+ // FIXME: Allow commutative matching for intrinsics.
auto *E =
new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
setBasicExpressionInfo(CI, E);
@@ -1539,90 +1535,39 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
- auto *PWC = dyn_cast<PredicateWithCondition>(PI);
- if (!PWC)
+ const Optional<PredicateConstraint> &Constraint = PI->getConstraint();
+ if (!Constraint)
return nullptr;
- auto *CopyOf = I->getOperand(0);
- auto *Cond = PWC->Condition;
-
- // If this a copy of the condition, it must be either true or false depending
- // on the predicate info type and edge.
- if (CopyOf == Cond) {
- // We should not need to add predicate users because the predicate info is
- // already a use of this operand.
- if (isa<PredicateAssume>(PI))
- return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
- if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
- if (PBranch->TrueEdge)
- return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
- return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
- }
- if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
- return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
- }
+ CmpInst::Predicate Predicate = Constraint->Predicate;
+ Value *CmpOp0 = I->getOperand(0);
+ Value *CmpOp1 = Constraint->OtherOp;
- // Not a copy of the condition, so see what the predicates tell us about this
- // value. First, though, we check to make sure the value is actually a copy
- // of one of the condition operands. It's possible, in certain cases, for it
- // to be a copy of a predicateinfo copy. In particular, if two branch
- // operations use the same condition, and one branch dominates the other, we
- // will end up with a copy of a copy. This is currently a small deficiency in
- // predicateinfo. What will end up happening here is that we will value
- // number both copies the same anyway.
-
- // Everything below relies on the condition being a comparison.
- auto *Cmp = dyn_cast<CmpInst>(Cond);
- if (!Cmp)
- return nullptr;
+ Value *FirstOp = lookupOperandLeader(CmpOp0);
+ Value *SecondOp = lookupOperandLeader(CmpOp1);
+ Value *AdditionallyUsedValue = CmpOp0;
- if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- LLVM_DEBUG(dbgs() << "Copy is not of any condition operands!\n");
- return nullptr;
- }
- Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
- Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
- bool SwappedOps = false;
// Sort the ops.
if (shouldSwapOperands(FirstOp, SecondOp)) {
std::swap(FirstOp, SecondOp);
- SwappedOps = true;
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ AdditionallyUsedValue = CmpOp1;
}
- CmpInst::Predicate Predicate =
- SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
-
- if (isa<PredicateAssume>(PI)) {
- // If we assume the operands are equal, then they are equal.
- if (Predicate == CmpInst::ICMP_EQ) {
- addPredicateUsers(PI, I);
- addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
- I);
- return createVariableOrConstant(FirstOp);
- }
+
+ if (Predicate == CmpInst::ICMP_EQ) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(AdditionallyUsedValue, I);
+ return createVariableOrConstant(FirstOp);
}
- if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
- // If we are *not* a copy of the comparison, we may equal to the other
- // operand when the predicate implies something about equality of
- // operations. In particular, if the comparison is true/false when the
- // operands are equal, and we are on the right edge, we know this operation
- // is equal to something.
- if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
- (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
- addPredicateUsers(PI, I);
- addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
- I);
- return createVariableOrConstant(FirstOp);
- }
- // Handle the special case of floating point.
- if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
- (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
- isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
- addPredicateUsers(PI, I);
- addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
- I);
- return createConstantExpression(cast<Constant>(FirstOp));
- }
+
+ // Handle the special case of floating point.
+ if (Predicate == CmpInst::FCMP_OEQ && isa<ConstantFP>(FirstOp) &&
+ !cast<ConstantFP>(FirstOp)->isZero()) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(AdditionallyUsedValue, I);
+ return createConstantExpression(cast<Constant>(FirstOp));
}
+
return nullptr;
}
@@ -2044,16 +1989,6 @@ NewGVN::performSymbolicEvaluation(Value *V,
return E;
}
-// Look up a container in a map, and then call a function for each thing in the
-// found container.
-template <typename Map, typename KeyType, typename Func>
-void NewGVN::for_each_found(Map &M, const KeyType &Key, Func F) {
- const auto Result = M.find_as(Key);
- if (Result != M.end())
- for (typename Map::mapped_type::value_type Mapped : Result->second)
- F(Mapped);
-}
-
// Look up a container of values/instructions in a map, and touch all the
// instructions in the container. Then erase value from the map.
template <typename Map, typename KeyType>
@@ -2941,8 +2876,7 @@ void NewGVN::cleanupTables() {
}
while (!TempInst.empty()) {
- auto *I = TempInst.back();
- TempInst.pop_back();
+ auto *I = TempInst.pop_back_val();
I->deleteValue();
}
@@ -3437,10 +3371,9 @@ bool NewGVN::runGVN() {
for (auto &B : RPOT) {
auto *Node = DT->getNode(B);
if (Node->getNumChildren() > 1)
- llvm::sort(Node->begin(), Node->end(),
- [&](const DomTreeNode *A, const DomTreeNode *B) {
- return RPOOrdering[A] < RPOOrdering[B];
- });
+ llvm::sort(*Node, [&](const DomTreeNode *A, const DomTreeNode *B) {
+ return RPOOrdering[A] < RPOOrdering[B];
+ });
}
// Now a standard depth first ordering of the domtree is equivalent to RPO.
diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 4553b23532f2..a110f7d5c241 100644
--- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -243,7 +243,7 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
BasicBlock *Pred) {
// A conservative bound on the loop as a whole.
const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L);
- if (MaxTrips != SE->getCouldNotCompute() &&
+ if (!isa<SCEVCouldNotCompute>(MaxTrips) &&
SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
CountedLoopTripWidth))
return true;
@@ -255,7 +255,7 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
// This returns an exact expression only. TODO: We really only need an
// upper bound here, but SE doesn't expose that.
const SCEV *MaxExec = SE->getExitCount(L, Pred);
- if (MaxExec != SE->getCouldNotCompute() &&
+ if (!isa<SCEVCouldNotCompute>(MaxExec) &&
SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
CountedLoopTripWidth))
return true;
@@ -435,7 +435,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F,
return Cursor;
}
-static const char *const GCSafepointPollName = "gc.safepoint_poll";
+const char GCSafepointPollName[] = "gc.safepoint_poll";
static bool isGCSafepointPoll(Function &F) {
return F.getName().equals(GCSafepointPollName);
@@ -589,8 +589,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
for (Instruction *PollLocation : PollsNeeded) {
std::vector<CallBase *> RuntimeCalls;
InsertSafepointPoll(PollLocation, RuntimeCalls, TLI);
- ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
- RuntimeCalls.end());
+ llvm::append_range(ParsePointNeeded, RuntimeCalls);
}
return Modified;
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index ba7f367267fe..dffeb7cc227b 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -920,6 +920,100 @@ static Value *NegateValue(Value *V, Instruction *BI,
return NewNeg;
}
+// See if this `or` looks like a load widening reduction, i.e. one that
+// consists only of `or`/`shl`/`zext`/`load` nodes. Note that we don't
+// verify that the pattern really is a load widening reduction, or that it
+// can really be replaced with a widened load; only that it mostly looks
+// like one.
+static bool isLoadCombineCandidate(Instruction *Or) {
+ SmallVector<Instruction *, 8> Worklist;
+ SmallSet<Instruction *, 8> Visited;
+
+ auto Enqueue = [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ // Each node of an `or` reduction must be an instruction,
+ if (!I)
+ return false; // Node is certainly not part of an `or` load reduction.
+ // Only process instructions we have never processed before.
+ if (Visited.insert(I).second)
+ Worklist.emplace_back(I);
+ return true; // Will need to look at parent nodes.
+ };
+
+ if (!Enqueue(Or))
+ return false; // Not an `or` reduction pattern.
+
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+
+ // Okay, which instruction is this node?
+ switch (I->getOpcode()) {
+ case Instruction::Or:
+      // Got an `or` node. That's fine, just recurse into its operands.
+ for (Value *Op : I->operands())
+ if (!Enqueue(Op))
+ return false; // Not an `or` reduction pattern.
+ continue;
+
+ case Instruction::Shl:
+ case Instruction::ZExt:
+ // `shl`/`zext` nodes are fine, just recurse into their base operand.
+ if (!Enqueue(I->getOperand(0)))
+ return false; // Not an `or` reduction pattern.
+ continue;
+
+ case Instruction::Load:
+ // Perfect, `load` node means we've reached an edge of the graph.
+ continue;
+
+ default: // Unknown node.
+ return false; // Not an `or` reduction pattern.
+ }
+ }
+
+ return true;
+}
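For illustration, the kind of chain this helper is meant to recognize is a manual load widening such as the little-endian 32-bit load below; reassociating its `or`s into adds would defeat the backend's load combining, so such chains are skipped.

    #include <cstdint>

    uint32_t load_le32(const uint8_t *p) {
      // or / shl / zext / load only: exactly the shape isLoadCombineCandidate
      // looks for, and a pattern the backend can fold into a single wide load.
      return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
             ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }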
+
+/// Return true if it may be profitable to convert this (X|Y) into (X+Y).
+static bool ShouldConvertOrWithNoCommonBitsToAdd(Instruction *Or) {
+  // Don't bother converting this unless one of the operands is a reassociable
+  // add, sub or mul, or this value's only use is one of those.
+  // This is only a compile-time improvement; it is not needed for correctness!
+ auto isInteresting = [](Value *V) {
+ for (auto Op : {Instruction::Add, Instruction::Sub, Instruction::Mul})
+ if (isReassociableOp(V, Op))
+ return true;
+ return false;
+ };
+
+ if (any_of(Or->operands(), isInteresting))
+ return true;
+
+ Value *VB = Or->user_back();
+ if (Or->hasOneUse() && isInteresting(VB))
+ return true;
+
+ return false;
+}
+
+/// If we have (X|Y), and X and Y have no common bits set,
+/// transform this into (X+Y) to allow further arithmetic reassociation.
+static BinaryOperator *ConvertOrWithNoCommonBitsToAdd(Instruction *Or) {
+ // Convert an or into an add.
+ BinaryOperator *New =
+ CreateAdd(Or->getOperand(0), Or->getOperand(1), "", Or, Or);
+ New->setHasNoSignedWrap();
+ New->setHasNoUnsignedWrap();
+ New->takeName(Or);
+
+ // Everyone now refers to the add instruction.
+ Or->replaceAllUsesWith(New);
+ New->setDebugLoc(Or->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "Converted or into an add: " << *New << '\n');
+ return New;
+}
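A small check of the identity this rewrite relies on: when the operands share no set bits there are no carries, so the `or` and the `add` produce the same value (which is why the add can also carry nuw/nsw).

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xF0F0u;     // bits only in two nibbles
      uint32_t y = 0x0A0Bu;     // bits only in the other two nibbles
      assert((x & y) == 0);     // precondition: no common bits set
      assert((x | y) == x + y); // the rewrite is exact
      return 0;
    }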
+
/// Return true if we should break up this subtract of X-Y into (X + -Y).
static bool ShouldBreakUpSubtract(Instruction *Sub) {
// If this is a negation, we can't split it up!
@@ -1034,8 +1128,7 @@ static Value *EmitAddTreeOfValues(Instruction *I,
SmallVectorImpl<WeakTrackingVH> &Ops) {
if (Ops.size() == 1) return Ops.back();
- Value *V1 = Ops.back();
- Ops.pop_back();
+ Value *V1 = Ops.pop_back_val();
Value *V2 = EmitAddTreeOfValues(I, Ops);
return CreateAdd(V2, V1, "reass.add", I, I);
}
@@ -1899,7 +1992,7 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
OrderedSet &Insts) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
- SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
+ SmallVector<Value *, 4> Ops(I->operands());
ValueRankMap.erase(I);
Insts.remove(I);
RedoInsts.remove(I);
@@ -1916,7 +2009,7 @@ void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
- SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+ SmallVector<Value *, 8> Ops(I->operands());
// Erase the dead instruction.
ValueRankMap.erase(I);
RedoInsts.remove(I);
@@ -2116,6 +2209,19 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
if (I->getType()->isIntegerTy(1))
return;
+ // If this is a bitwise or instruction of operands
+ // with no common bits set, convert it to X+Y.
+ if (I->getOpcode() == Instruction::Or &&
+ ShouldConvertOrWithNoCommonBitsToAdd(I) && !isLoadCombineCandidate(I) &&
+ haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1),
+ I->getModule()->getDataLayout(), /*AC=*/nullptr, I,
+ /*DT=*/nullptr)) {
+ Instruction *NI = ConvertOrWithNoCommonBitsToAdd(I);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+
// If this is a subtract instruction which is not already in negate form,
// see if we can convert it to X+-Y.
if (I->getOpcode() == Instruction::Sub) {
diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 0716c1320982..a49b9ad3f62b 100644
--- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -15,17 +15,23 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/Reg2Mem.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;
@@ -35,43 +41,17 @@ using namespace llvm;
STATISTIC(NumRegsDemoted, "Number of registers demoted");
STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
-namespace {
- struct RegToMem : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- RegToMem() : FunctionPass(ID) {
- initializeRegToMemPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(BreakCriticalEdgesID);
- AU.addPreservedID(BreakCriticalEdgesID);
- }
-
- bool valueEscapes(const Instruction *Inst) const {
- const BasicBlock *BB = Inst->getParent();
- for (const User *U : Inst->users()) {
- const Instruction *UI = cast<Instruction>(U);
- if (UI->getParent() != BB || isa<PHINode>(UI))
- return true;
- }
- return false;
- }
-
- bool runOnFunction(Function &F) override;
- };
+static bool valueEscapes(const Instruction &Inst) {
+ const BasicBlock *BB = Inst.getParent();
+ for (const User *U : Inst.users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != BB || isa<PHINode>(UI))
+ return true;
+ }
+ return false;
}
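Roughly what "escapes" means here, in source terms (only an analogy, not the pass's own code): a value whose use ends up in a different basic block than its definition must survive demotion through a stack slot.

    // After reg2mem every such cross-block value is reloaded from an alloca
    // at its uses instead of being kept as an SSA register.
    int escapes_example(bool flag) {
      int v = 42;     // defined in the entry block
      if (flag)
        return v + 1; // used in a different block: 'v' escapes and is demoted
      return 0;
    }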
-char RegToMem::ID = 0;
-INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
-INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
- false, false)
-
-bool RegToMem::runOnFunction(Function &F) {
- if (F.isDeclaration() || skipFunction(F))
- return false;
-
+static bool runPass(Function &F) {
// Insert all new allocas into entry block.
BasicBlock *BBEntry = &F.getEntryBlock();
assert(pred_empty(BBEntry) &&
@@ -90,40 +70,72 @@ bool RegToMem::runOnFunction(Function &F) {
// Find the escaped instructions. But don't create stack slots for
// allocas in entry block.
std::list<Instruction*> WorkList;
- for (BasicBlock &ibb : F)
- for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
- ++iib) {
- if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
- valueEscapes(&*iib)) {
- WorkList.push_front(&*iib);
- }
- }
+ for (Instruction &I : instructions(F))
+ if (!(isa<AllocaInst>(I) && I.getParent() == BBEntry) && valueEscapes(I))
+ WorkList.push_front(&I);
// Demote escaped instructions
NumRegsDemoted += WorkList.size();
- for (Instruction *ilb : WorkList)
- DemoteRegToStack(*ilb, false, AllocaInsertionPoint);
+ for (Instruction *I : WorkList)
+ DemoteRegToStack(*I, false, AllocaInsertionPoint);
WorkList.clear();
// Find all phi's
- for (BasicBlock &ibb : F)
- for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
- ++iib)
- if (isa<PHINode>(iib))
- WorkList.push_front(&*iib);
+ for (BasicBlock &BB : F)
+ for (auto &Phi : BB.phis())
+ WorkList.push_front(&Phi);
// Demote phi nodes
NumPhisDemoted += WorkList.size();
- for (Instruction *ilb : WorkList)
- DemotePHIToStack(cast<PHINode>(ilb), AllocaInsertionPoint);
+ for (Instruction *I : WorkList)
+ DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint);
return true;
}
+PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
+ bool Changed = runPass(F);
+ if (N == 0 && !Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+namespace {
+struct RegToMemLegacy : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ RegToMemLegacy() : FunctionPass(ID) {
+ initializeRegToMemLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addPreservedID(BreakCriticalEdgesID);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (F.isDeclaration() || skipFunction(F))
+ return false;
+ return runPass(F);
+ }
+};
+} // namespace
+
+char RegToMemLegacy::ID = 0;
+INITIALIZE_PASS_BEGIN(RegToMemLegacy, "reg2mem",
+ "Demote all values to stack slots", false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_END(RegToMemLegacy, "reg2mem",
+ "Demote all values to stack slots", false, false)
// createDemoteRegisterToMemory - Provide an entry point to create this pass.
-char &llvm::DemoteRegisterToMemoryID = RegToMem::ID;
+char &llvm::DemoteRegisterToMemoryID = RegToMemLegacy::ID;
FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
- return new RegToMem();
+ return new RegToMemLegacy();
}
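
The hunk above ports reg2mem to the new pass manager as RegToMemPass, which now splits critical edges itself and reports DominatorTree/LoopInfo as preserved. Below is a minimal sketch of how the ported pass could be scheduled from C++, assuming the usual new-pass-manager boilerplate; the driver function name is invented and the snippet is not part of this commit.

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/Reg2Mem.h"
using namespace llvm;

// Runs reg2mem on a single function via the new pass manager.
static void demoteFunctionToStackSlots(Function &F) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);   // provides DominatorTreeAnalysis/LoopAnalysis
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(RegToMemPass());        // splits critical edges, then demotes regs/phis
  FPM.run(F, FAM);
}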
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index dc2ad14ae61e..b7830555bf73 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1487,7 +1487,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
uint32_t NumPatchBytes = 0;
uint32_t Flags = uint32_t(StatepointFlags::None);
- ArrayRef<Use> CallArgs(Call->arg_begin(), Call->arg_end());
+ SmallVector<Value *, 8> CallArgs(Call->args());
Optional<ArrayRef<Use>> DeoptArgs;
if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_deopt))
DeoptArgs = Bundle->Inputs;
@@ -1520,7 +1520,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
Value *CallTarget = Call->getCalledOperand();
if (Function *F = dyn_cast<Function>(CallTarget)) {
- if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) {
+ auto IID = F->getIntrinsicID();
+ if (IID == Intrinsic::experimental_deoptimize) {
// Calls to llvm.experimental.deoptimize are lowered to calls to the
// __llvm_deoptimize symbol. We want to resolve this now, since the
// verifier does not allow taking the address of an intrinsic function.
@@ -1540,6 +1541,101 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
.getCallee();
IsDeoptimize = true;
+ } else if (IID == Intrinsic::memcpy_element_unordered_atomic ||
+ IID == Intrinsic::memmove_element_unordered_atomic) {
+ // Unordered atomic memcpy and memmove intrinsics which are not explicitly
+ // marked as "gc-leaf-function" should be lowered in a GC parseable way.
+ // Specifically, these calls should be lowered to the
+ // __llvm_{memcpy|memmove}_element_unordered_atomic_safepoint symbols.
+ // Similarly to __llvm_deoptimize we want to resolve this now, since the
+ // verifier does not allow taking the address of an intrinsic function.
+ //
+ // Moreover we need to shuffle the arguments for the call in order to
+ // accommodate GC. The underlying source and destination objects might be
+ // relocated during copy operation should the GC occur. To relocate the
+ // derived source and destination pointers the implementation of the
+ // intrinsic should know the corresponding base pointers.
+ //
+ // To make the base pointers available pass them explicitly as arguments:
+ // memcpy(dest_derived, source_derived, ...) =>
+ // memcpy(dest_base, dest_offset, source_base, source_offset, ...)
+ auto &Context = Call->getContext();
+ auto &DL = Call->getModule()->getDataLayout();
+ auto GetBaseAndOffset = [&](Value *Derived) {
+ assert(Result.PointerToBase.count(Derived));
+ unsigned AddressSpace = Derived->getType()->getPointerAddressSpace();
+ unsigned IntPtrSize = DL.getPointerSizeInBits(AddressSpace);
+ Value *Base = Result.PointerToBase.find(Derived)->second;
+ Value *Base_int = Builder.CreatePtrToInt(
+ Base, Type::getIntNTy(Context, IntPtrSize));
+ Value *Derived_int = Builder.CreatePtrToInt(
+ Derived, Type::getIntNTy(Context, IntPtrSize));
+ return std::make_pair(Base, Builder.CreateSub(Derived_int, Base_int));
+ };
+
+ auto *Dest = CallArgs[0];
+ Value *DestBase, *DestOffset;
+ std::tie(DestBase, DestOffset) = GetBaseAndOffset(Dest);
+
+ auto *Source = CallArgs[1];
+ Value *SourceBase, *SourceOffset;
+ std::tie(SourceBase, SourceOffset) = GetBaseAndOffset(Source);
+
+ auto *LengthInBytes = CallArgs[2];
+ auto *ElementSizeCI = cast<ConstantInt>(CallArgs[3]);
+
+ CallArgs.clear();
+ CallArgs.push_back(DestBase);
+ CallArgs.push_back(DestOffset);
+ CallArgs.push_back(SourceBase);
+ CallArgs.push_back(SourceOffset);
+ CallArgs.push_back(LengthInBytes);
+
+ SmallVector<Type *, 8> DomainTy;
+ for (Value *Arg : CallArgs)
+ DomainTy.push_back(Arg->getType());
+ auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
+ /* isVarArg = */ false);
+
+ auto GetFunctionName = [](Intrinsic::ID IID, ConstantInt *ElementSizeCI) {
+ uint64_t ElementSize = ElementSizeCI->getZExtValue();
+ if (IID == Intrinsic::memcpy_element_unordered_atomic) {
+ switch (ElementSize) {
+ case 1:
+ return "__llvm_memcpy_element_unordered_atomic_safepoint_1";
+ case 2:
+ return "__llvm_memcpy_element_unordered_atomic_safepoint_2";
+ case 4:
+ return "__llvm_memcpy_element_unordered_atomic_safepoint_4";
+ case 8:
+ return "__llvm_memcpy_element_unordered_atomic_safepoint_8";
+ case 16:
+ return "__llvm_memcpy_element_unordered_atomic_safepoint_16";
+ default:
+ llvm_unreachable("unexpected element size!");
+ }
+ }
+ assert(IID == Intrinsic::memmove_element_unordered_atomic);
+ switch (ElementSize) {
+ case 1:
+ return "__llvm_memmove_element_unordered_atomic_safepoint_1";
+ case 2:
+ return "__llvm_memmove_element_unordered_atomic_safepoint_2";
+ case 4:
+ return "__llvm_memmove_element_unordered_atomic_safepoint_4";
+ case 8:
+ return "__llvm_memmove_element_unordered_atomic_safepoint_8";
+ case 16:
+ return "__llvm_memmove_element_unordered_atomic_safepoint_16";
+ default:
+ llvm_unreachable("unexpected element size!");
+ }
+ };
+
+ CallTarget =
+ F->getParent()
+ ->getOrInsertFunction(GetFunctionName(IID, ElementSizeCI), FTy)
+ .getCallee();
}
}
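
As a reader aid, here is a hedged, self-contained restatement of the GetBaseAndOffset lambda above: each derived GC pointer is rewritten as a (base, byte offset) pair so the collector can relocate the base while the offset stays valid. The helper name is invented; the IRBuilder calls mirror the hunk.

#include <utility>
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Split a derived GC pointer into its base pointer plus a byte offset, e.g.
//   memcpy(dest_derived, src_derived, len)
//     -> memcpy_safepoint(dest_base, dest_off, src_base, src_off, len)
static std::pair<Value *, Value *>
splitBaseAndOffset(IRBuilder<> &Builder, const DataLayout &DL, Value *Derived,
                   Value *Base) {
  LLVMContext &Ctx = Derived->getContext();
  unsigned AS = Derived->getType()->getPointerAddressSpace();
  Type *IntPtrTy = Type::getIntNTy(Ctx, DL.getPointerSizeInBits(AS));
  Value *BaseInt = Builder.CreatePtrToInt(Base, IntPtrTy);
  Value *DerivedInt = Builder.CreatePtrToInt(Derived, IntPtrTy);
  // The offset is re-applied to the relocated base after the safepoint.
  return {Base, Builder.CreateSub(DerivedInt, BaseInt)};
}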
@@ -1940,8 +2036,7 @@ static void relocationViaAlloca(
/// tests in ways which make them less useful in testing fused safepoints.
template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
SmallSet<T, 8> Seen;
- Vec.erase(remove_if(Vec, [&](const T &V) { return !Seen.insert(V).second; }),
- Vec.end());
+ erase_if(Vec, [&](const T &V) { return !Seen.insert(V).second; });
}
/// Insert holders so that each Value is obviously live through the entire
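
The erase_if rewrite above keeps the first occurrence of each element and drops later duplicates without disturbing the order of the survivors. A tiny standalone example of the same idiom, using int instead of the template parameter:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// {3, 1, 3, 2, 1} becomes {3, 1, 2}: later duplicates are erased in place.
static void uniqueUnsorted(SmallVectorImpl<int> &Vec) {
  SmallSet<int, 8> Seen;
  erase_if(Vec, [&](int V) { return !Seen.insert(V).second; });
}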
@@ -2013,10 +2108,10 @@ static Value* findRematerializableChainToBasePointer(
// Helper function for the "rematerializeLiveValues". Compute cost of the use
// chain we are going to rematerialize.
-static unsigned
-chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
+static InstructionCost
+chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain,
TargetTransformInfo &TTI) {
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
for (Instruction *Instr : Chain) {
if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
@@ -2025,8 +2120,8 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
Type *SrcTy = CI->getOperand(0)->getType();
Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy,
- TargetTransformInfo::TCK_SizeAndLatency,
- CI);
+ TTI::getCastContextHint(CI),
+ TargetTransformInfo::TCK_SizeAndLatency, CI);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
@@ -2123,7 +2218,7 @@ static void rematerializeLiveValues(CallBase *Call,
assert(Info.LiveSet.count(AlternateRootPhi));
}
// Compute cost of this chain
- unsigned Cost = chainToBasePointerCost(ChainToBase, TTI);
+ InstructionCost Cost = chainToBasePointerCost(ChainToBase, TTI);
// TODO: We can also account for cases when we will be able to remove some
// of the rematerialized values by later optimization passes. I.e if
// we rematerialized several intersecting chains. Or if original values
@@ -2404,8 +2499,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// That Value* no longer exists and we need to use the new gc_result.
// Thankfully, the live set is embedded in the statepoint (and updated), so
// we just grab that.
- Live.insert(Live.end(), Info.StatepointToken->gc_args_begin(),
- Info.StatepointToken->gc_args_end());
+ llvm::append_range(Live, Info.StatepointToken->gc_args());
#ifndef NDEBUG
// Do some basic sanity checks on our liveness results before performing
// relocation. Relocation can and will turn mistakes in liveness results
@@ -2581,8 +2675,27 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision");
auto NeedsRewrite = [&TLI](Instruction &I) {
- if (const auto *Call = dyn_cast<CallBase>(&I))
- return !callsGCLeafFunction(Call, TLI) && !isa<GCStatepointInst>(Call);
+ if (const auto *Call = dyn_cast<CallBase>(&I)) {
+ if (isa<GCStatepointInst>(Call))
+ return false;
+ if (callsGCLeafFunction(Call, TLI))
+ return false;
+
+ // Normally it's up to the frontend to make sure that non-leaf calls also
+ // have proper deopt state if it is required. We make an exception for
+ // element atomic memcpy/memmove intrinsics here. Unlike other intrinsics
+ // these are non-leaf by default. They might be generated by the optimizer
+ // which doesn't know how to produce a proper deopt state. So if we see a
+ // non-leaf memcpy/memmove without deopt state just treat it as a leaf
+ // copy and don't produce a statepoint.
+ if (!AllowStatepointWithNoDeoptInfo &&
+ !Call->getOperandBundle(LLVMContext::OB_deopt)) {
+ assert((isa<AtomicMemCpyInst>(Call) || isa<AtomicMemMoveInst>(Call)) &&
+ "Don't expect any other calls here!");
+ return false;
+ }
+ return true;
+ }
return false;
};
@@ -2620,10 +2733,8 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
// of liveness sets for no good reason. It may be harder to do this post
// insertion since relocations and base phis can confuse things.
for (BasicBlock &BB : F)
- if (BB.getUniquePredecessor()) {
- MadeChange = true;
- FoldSingleEntryPHINodes(&BB);
- }
+ if (BB.getUniquePredecessor())
+ MadeChange |= FoldSingleEntryPHINodes(&BB);
// Before we start introducing relocations, we want to tweak the IR a bit to
// avoid unfortunate code generation effects. The main example is that we
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index 5ebd3b71fe78..de6be52adf21 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -33,6 +34,7 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -103,8 +105,7 @@ bool isConstant(const ValueLatticeElement &LV) {
// ValueLatticeElement::isOverdefined() and is intended to be used in the
// transition to ValueLatticeElement.
bool isOverdefined(const ValueLatticeElement &LV) {
- return LV.isOverdefined() ||
- (LV.isConstantRange() && !LV.getConstantRange().isSingleElement());
+ return !LV.isUnknownOrUndef() && !isConstant(LV);
}
//===----------------------------------------------------------------------===//
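
The rewritten isOverdefined leans on the neighbouring file-local isConstant helper (a single constant or a single-element constant range): any lattice value that is neither unknown/undef nor constant is overdefined. A small sketch of the resulting three-way split; the enum is purely illustrative and reuses SCCP.cpp's isConstant.

enum class LatticeClass { UnknownOrUndef, Constant, Overdefined };

static LatticeClass classify(const ValueLatticeElement &LV) {
  if (LV.isUnknownOrUndef())
    return LatticeClass::UnknownOrUndef;
  if (isConstant(LV)) // single constant or single-element constant range
    return LatticeClass::Constant;
  return LatticeClass::Overdefined; // includes multi-element ranges
}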
@@ -233,7 +234,7 @@ public:
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
TrackedMultipleRetVals.insert(
std::make_pair(std::make_pair(F, i), ValueLatticeElement()));
- } else
+ } else if (!F->getReturnType()->isVoidTy())
TrackedRetVals.insert(std::make_pair(F, ValueLatticeElement()));
}
@@ -275,7 +276,7 @@ public:
// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
// block to the 'To' basic block is currently feasible.
- bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To) const;
std::vector<ValueLatticeElement> getStructLatticeValueFor(Value *V) const {
std::vector<ValueLatticeElement> StructValues;
@@ -648,17 +649,30 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
Succs[0] = true;
return;
}
- ValueLatticeElement SCValue = getValueState(SI->getCondition());
- ConstantInt *CI = getConstantInt(SCValue);
+ const ValueLatticeElement &SCValue = getValueState(SI->getCondition());
+ if (ConstantInt *CI = getConstantInt(SCValue)) {
+ Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
+ return;
+ }
- if (!CI) { // Overdefined or unknown condition?
- // All destinations are executable!
- if (!SCValue.isUnknownOrUndef())
- Succs.assign(TI.getNumSuccessors(), true);
+ // TODO: Switch on undef is UB. Stop passing false once the rest of LLVM
+ // is ready.
+ if (SCValue.isConstantRange(/*UndefAllowed=*/false)) {
+ const ConstantRange &Range = SCValue.getConstantRange();
+ for (const auto &Case : SI->cases()) {
+ const APInt &CaseValue = Case.getCaseValue()->getValue();
+ if (Range.contains(CaseValue))
+ Succs[Case.getSuccessorIndex()] = true;
+ }
+
+ // TODO: Determine whether default case is reachable.
+ Succs[SI->case_default()->getSuccessorIndex()] = true;
return;
}
- Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
+ // Overdefined or unknown condition? All destinations are executable!
+ if (!SCValue.isUnknownOrUndef())
+ Succs.assign(TI.getNumSuccessors(), true);
return;
}
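
To illustrate the new range-based pruning: when the switch condition is known to lie in a constant range, only case values inside that range (plus, conservatively, the default) are marked feasible. A toy example with a 32-bit condition known to be in [0, 10), using only ConstantRange::contains as in the hunk:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
using namespace llvm;

static bool caseIsFeasible(uint64_t CaseValue) {
  ConstantRange CondRange(APInt(32, 0), APInt(32, 10)); // half-open [0, 10)
  return CondRange.contains(APInt(32, CaseValue));      // 3 -> true, 42 -> false
}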
@@ -704,7 +718,7 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
// block to the 'To' basic block is currently feasible.
-bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
// Check if we've called markEdgeExecutable on the edge yet. (We could
// be more aggressive and try to consider edges which haven't been marked
// yet, but there isn't any need.)
@@ -829,6 +843,16 @@ void SCCPSolver::visitCastInst(CastInst &I) {
auto &LV = getValueState(&I);
ConstantRange OpRange = OpSt.getConstantRange();
Type *DestTy = I.getDestTy();
+ // Vectors where all elements have the same known constant range are treated
+ // as a single constant range in the lattice. When bitcasting such vectors,
+ // there is a mismatch between the width of the lattice value (single
+ // constant range) and the original operands (vector). Go to overdefined in
+ // that case.
+ if (I.getOpcode() == Instruction::BitCast &&
+ I.getOperand(0)->getType()->isVectorTy() &&
+ OpRange.getBitWidth() < DL.getTypeSizeInBits(DestTy))
+ return (void)markOverdefined(&I);
+
ConstantRange Res =
OpRange.castOp(I.getOpcode(), DL.getTypeSizeInBits(DestTy));
mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
@@ -1109,7 +1133,9 @@ static ValueLatticeElement getValueFromMetadata(const Instruction *I) {
if (I->getType()->isIntegerTy())
return ValueLatticeElement::getRange(
getConstantRangeFromMetadata(*Ranges));
- // TODO: Also handle MD_nonnull.
+ if (I->hasMetadata(LLVMContext::MD_nonnull))
+ return ValueLatticeElement::getNot(
+ ConstantPointerNull::get(cast<PointerType>(I->getType())));
return ValueLatticeElement::getOverdefined();
}
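
The new branch handles !nonnull on pointer-typed instructions by folding them to the lattice value "anything but null". A minimal restatement of just that case, assuming the metadata check has already been done and reusing SCCP.cpp's existing includes:

static ValueLatticeElement latticeFromNonNull(const Instruction *I) {
  assert(I->getType()->isPointerTy() &&
         I->hasMetadata(LLVMContext::MD_nonnull) && "precondition");
  return ValueLatticeElement::getNot(
      ConstantPointerNull::get(cast<PointerType>(I->getType())));
}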
@@ -1262,55 +1288,33 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
auto *PI = getPredicateInfoFor(&CB);
assert(PI && "Missing predicate info for ssa.copy");
- CmpInst *Cmp;
- bool TrueEdge;
- if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
- Cmp = dyn_cast<CmpInst>(PBranch->Condition);
- TrueEdge = PBranch->TrueEdge;
- } else if (auto *PAssume = dyn_cast<PredicateAssume>(PI)) {
- Cmp = dyn_cast<CmpInst>(PAssume->Condition);
- TrueEdge = true;
- } else {
- mergeInValue(ValueState[&CB], &CB, CopyOfVal);
- return;
- }
-
- // Everything below relies on the condition being a comparison.
- if (!Cmp) {
+ const Optional<PredicateConstraint> &Constraint = PI->getConstraint();
+ if (!Constraint) {
mergeInValue(ValueState[&CB], &CB, CopyOfVal);
return;
}
- Value *RenamedOp = PI->RenamedOp;
- Value *CmpOp0 = Cmp->getOperand(0);
- Value *CmpOp1 = Cmp->getOperand(1);
- // Bail out if neither of the operands matches RenamedOp.
- if (CmpOp0 != RenamedOp && CmpOp1 != RenamedOp) {
- mergeInValue(ValueState[&CB], &CB, getValueState(CopyOf));
- return;
- }
+ CmpInst::Predicate Pred = Constraint->Predicate;
+ Value *OtherOp = Constraint->OtherOp;
- auto Pred = Cmp->getPredicate();
- if (CmpOp1 == RenamedOp) {
- std::swap(CmpOp0, CmpOp1);
- Pred = Cmp->getSwappedPredicate();
- }
-
- // Wait until CmpOp1 is resolved.
- if (getValueState(CmpOp1).isUnknown()) {
- addAdditionalUser(CmpOp1, &CB);
+ // Wait until OtherOp is resolved.
+ if (getValueState(OtherOp).isUnknown()) {
+ addAdditionalUser(OtherOp, &CB);
return;
}
- // The code below relies on PredicateInfo only inserting copies for the
- // true branch when the branch condition is an AND and only inserting
- // copies for the false branch when the branch condition is an OR. This
- // ensures we can intersect the range from the condition with the range of
- // CopyOf.
- if (!TrueEdge)
- Pred = CmpInst::getInversePredicate(Pred);
-
- ValueLatticeElement CondVal = getValueState(CmpOp1);
+ // TODO: Actually flip MayIncludeUndef for the created range to false,
+ // once most places in the optimizer respect the rule that branches on
+ // undef/poison are UB. The reason why the new range cannot be undef
+ // is as follows:
+ // The new range is based on a branch condition. That guarantees that
+ // neither of the compare operands can be undef in the branch targets,
+ // unless we have conditions that are always true/false (e.g. icmp ule
+ // i32 %a, i32_max). For the latter, an overdefined/empty range will be
+ // inferred, but the branch will get folded accordingly anyway.
+ bool MayIncludeUndef = !isa<PredicateAssume>(PI);
+
+ ValueLatticeElement CondVal = getValueState(OtherOp);
ValueLatticeElement &IV = ValueState[&CB];
if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
auto ImposedCR =
@@ -1334,30 +1338,47 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
NewCR = CopyOfCR;
- addAdditionalUser(CmpOp1, &CB);
- // TODO: Actually filp MayIncludeUndef for the created range to false,
- // once most places in the optimizer respect the branches on
- // undef/poison are UB rule. The reason why the new range cannot be
- // undef is as follows below:
- // The new range is based on a branch condition. That guarantees that
- // neither of the compare operands can be undef in the branch targets,
- // unless we have conditions that are always true/false (e.g. icmp ule
- // i32, %a, i32_max). For the latter overdefined/empty range will be
- // inferred, but the branch will get folded accordingly anyways.
+ addAdditionalUser(OtherOp, &CB);
mergeInValue(
IV, &CB,
- ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef=*/true));
+ ValueLatticeElement::getRange(NewCR, MayIncludeUndef));
return;
} else if (Pred == CmpInst::ICMP_EQ && CondVal.isConstant()) {
// For non-integer values or integer constant expressions, only
// propagate equal constants.
- addAdditionalUser(CmpOp1, &CB);
+ addAdditionalUser(OtherOp, &CB);
mergeInValue(IV, &CB, CondVal);
return;
+ } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant() &&
+ !MayIncludeUndef) {
+ // Propagate inequalities.
+ addAdditionalUser(OtherOp, &CB);
+ mergeInValue(IV, &CB,
+ ValueLatticeElement::getNot(CondVal.getConstant()));
+ return;
}
return (void)mergeInValue(IV, &CB, CopyOfVal);
}
+
+ if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) {
+ // Compute result range for intrinsics supported by ConstantRange.
+ // Do this even if we don't know a range for all operands, as we may
+ // still know something about the result range, e.g. of abs(x).
+ SmallVector<ConstantRange, 2> OpRanges;
+ for (Value *Op : II->args()) {
+ const ValueLatticeElement &State = getValueState(Op);
+ if (State.isConstantRange())
+ OpRanges.push_back(State.getConstantRange());
+ else
+ OpRanges.push_back(
+ ConstantRange::getFull(Op->getType()->getScalarSizeInBits()));
+ }
+
+ ConstantRange Result =
+ ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges);
+ return (void)mergeInValue(II, ValueLatticeElement::getRange(Result));
+ }
}
// The common case is that we aren't tracking the callee, either because we
@@ -1427,8 +1448,7 @@ void SCCPSolver::Solve() {
// Process the basic block work list.
while (!BBWorkList.empty()) {
- BasicBlock *BB = BBWorkList.back();
- BBWorkList.pop_back();
+ BasicBlock *BB = BBWorkList.pop_back_val();
LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
@@ -1456,6 +1476,7 @@ void SCCPSolver::Solve() {
/// This scan also checks for values that use undefs. It conservatively marks
/// them as overdefined.
bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+ bool MadeChange = false;
for (BasicBlock &BB : F) {
if (!BBExecutable.count(&BB))
continue;
@@ -1481,8 +1502,10 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// more precise than this but it isn't worth bothering.
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
ValueLatticeElement &LV = getStructValueState(&I, i);
- if (LV.isUnknownOrUndef())
+ if (LV.isUnknownOrUndef()) {
markOverdefined(LV, &I);
+ MadeChange = true;
+ }
}
continue;
}
@@ -1509,7 +1532,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
markOverdefined(&I);
- return true;
+ MadeChange = true;
}
// Check to see if we have a branch or switch on an undefined value. If so
@@ -1526,7 +1549,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
if (isa<UndefValue>(BI->getCondition())) {
BI->setCondition(ConstantInt::getFalse(BI->getContext()));
markEdgeExecutable(&BB, TI->getSuccessor(1));
- return true;
+ MadeChange = true;
+ continue;
}
// Otherwise, it is a branch on a symbolic value which is currently
@@ -1535,7 +1559,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// FIXME: Distinguish between dead code and an LLVM "undef" value.
BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
if (markEdgeExecutable(&BB, DefaultSuccessor))
- return true;
+ MadeChange = true;
continue;
}
@@ -1554,7 +1578,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
if (isa<UndefValue>(IBR->getAddress())) {
IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
markEdgeExecutable(&BB, IBR->getSuccessor(0));
- return true;
+ MadeChange = true;
+ continue;
}
// Otherwise, it is a branch on a symbolic value which is currently
@@ -1564,7 +1589,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// we can assume the branch has undefined behavior instead.
BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
if (markEdgeExecutable(&BB, DefaultSuccessor))
- return true;
+ MadeChange = true;
continue;
}
@@ -1579,7 +1604,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
if (isa<UndefValue>(SI->getCondition())) {
SI->setCondition(SI->case_begin()->getCaseValue());
markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
- return true;
+ MadeChange = true;
+ continue;
}
// Otherwise, it is a branch on a symbolic value which is currently
@@ -1588,13 +1614,13 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// FIXME: Distinguish between dead code and an LLVM "undef" value.
BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
if (markEdgeExecutable(&BB, DefaultSuccessor))
- return true;
+ MadeChange = true;
continue;
}
}
- return false;
+ return MadeChange;
}
static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
@@ -1716,7 +1742,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
++NumDeadBlocks;
- NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);
+ NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB).first;
MadeChanges = true;
continue;
@@ -1839,39 +1865,68 @@ static void findReturnsToZap(Function &F,
}
}
-// Update the condition for terminators that are branching on indeterminate
-// values, forcing them to use a specific edge.
-static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) {
- BasicBlock *Dest = nullptr;
- Constant *C = nullptr;
- if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
- if (!isa<ConstantInt>(SI->getCondition())) {
- // Indeterminate switch; use first case value.
- Dest = SI->case_begin()->getCaseSuccessor();
- C = SI->case_begin()->getCaseValue();
- }
- } else if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (!isa<ConstantInt>(BI->getCondition())) {
- // Indeterminate branch; use false.
- Dest = BI->getSuccessor(1);
- C = ConstantInt::getFalse(BI->getContext());
+static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
+ DomTreeUpdater &DTU) {
+ SmallPtrSet<BasicBlock *, 8> FeasibleSuccessors;
+ bool HasNonFeasibleEdges = false;
+ for (BasicBlock *Succ : successors(BB)) {
+ if (Solver.isEdgeFeasible(BB, Succ))
+ FeasibleSuccessors.insert(Succ);
+ else
+ HasNonFeasibleEdges = true;
+ }
+
+ // All edges feasible, nothing to do.
+ if (!HasNonFeasibleEdges)
+ return false;
+
+ // SCCP can only determine non-feasible edges for br, switch and indirectbr.
+ Instruction *TI = BB->getTerminator();
+ assert((isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
+ isa<IndirectBrInst>(TI)) &&
+ "Terminator must be a br, switch or indirectbr");
+
+ if (FeasibleSuccessors.size() == 1) {
+ // Replace with an unconditional branch to the only feasible successor.
+ BasicBlock *OnlyFeasibleSuccessor = *FeasibleSuccessors.begin();
+ SmallVector<DominatorTree::UpdateType, 8> Updates;
+ bool HaveSeenOnlyFeasibleSuccessor = false;
+ for (BasicBlock *Succ : successors(BB)) {
+ if (Succ == OnlyFeasibleSuccessor && !HaveSeenOnlyFeasibleSuccessor) {
+ // Don't remove the edge to the only feasible successor the first time
+ // we see it. We still do need to remove any multi-edges to it though.
+ HaveSeenOnlyFeasibleSuccessor = true;
+ continue;
+ }
+
+ Succ->removePredecessor(BB);
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
}
- } else if (IndirectBrInst *IBR = dyn_cast<IndirectBrInst>(I)) {
- if (!isa<BlockAddress>(IBR->getAddress()->stripPointerCasts())) {
- // Indeterminate indirectbr; use successor 0.
- Dest = IBR->getSuccessor(0);
- C = BlockAddress::get(IBR->getSuccessor(0));
+
+ BranchInst::Create(OnlyFeasibleSuccessor, BB);
+ TI->eraseFromParent();
+ DTU.applyUpdatesPermissive(Updates);
+ } else if (FeasibleSuccessors.size() > 1) {
+ SwitchInstProfUpdateWrapper SI(*cast<SwitchInst>(TI));
+ SmallVector<DominatorTree::UpdateType, 8> Updates;
+ for (auto CI = SI->case_begin(); CI != SI->case_end();) {
+ if (FeasibleSuccessors.contains(CI->getCaseSuccessor())) {
+ ++CI;
+ continue;
+ }
+
+ BasicBlock *Succ = CI->getCaseSuccessor();
+ Succ->removePredecessor(BB);
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ SI.removeCase(CI);
+ // Don't increment CI, as we removed a case.
}
- } else {
- llvm_unreachable("Unexpected terminator instruction");
- }
- if (C) {
- assert(Solver.isEdgeFeasible(I->getParent(), Dest) &&
- "Didn't find feasible edge?");
- (void)Dest;
- I->setOperand(0, C);
+ DTU.applyUpdatesPermissive(Updates);
+ } else {
+ llvm_unreachable("Must have at least one feasible successor");
}
+ return true;
}
bool llvm::runIPSCCP(
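
Condensed sketch of the single-feasible-successor path in removeNonFeasibleEdges above: PHIs in the dropped successors are fixed up, the old terminator is replaced by an unconditional branch, and the edge deletions are handed to the DomTreeUpdater in one batch. The helper name is invented, DTU is assumed to be the lazy updater owned by runIPSCCP, and the headers already pulled in by SCCP.cpp are assumed.

static void foldToSingleSuccessor(BasicBlock *BB, BasicBlock *OnlyFeasible,
                                  DomTreeUpdater &DTU) {
  Instruction *OldTI = BB->getTerminator();
  SmallVector<DominatorTree::UpdateType, 8> Updates;
  bool KeptOneEdge = false;
  for (BasicBlock *Succ : successors(BB)) {
    if (Succ == OnlyFeasible && !KeptOneEdge) {
      KeptOneEdge = true;               // keep exactly one edge to the survivor
      continue;
    }
    Succ->removePredecessor(BB);        // drop the PHI entries for BB
    Updates.push_back({DominatorTree::Delete, BB, Succ});
  }
  BranchInst::Create(OnlyFeasible, BB); // new unconditional terminator
  OldTI->eraseFromParent();
  DTU.applyUpdatesPermissive(Updates);
}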
@@ -1923,13 +1978,12 @@ bool llvm::runIPSCCP(
while (ResolvedUndefs) {
LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
ResolvedUndefs = false;
- for (Function &F : M)
- if (Solver.ResolvedUndefsIn(F)) {
- // We run Solve() after we resolved an undef in a function, because
- // we might deduce a fact that eliminates an undef in another function.
- Solver.Solve();
+ for (Function &F : M) {
+ if (Solver.ResolvedUndefsIn(F))
ResolvedUndefs = true;
- }
+ }
+ if (ResolvedUndefs)
+ Solver.Solve();
}
bool MadeChanges = false;
@@ -1943,15 +1997,35 @@ bool llvm::runIPSCCP(
SmallVector<BasicBlock *, 512> BlocksToErase;
- if (Solver.isBlockExecutable(&F.front()))
- for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
- ++AI) {
- if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) {
+ if (Solver.isBlockExecutable(&F.front())) {
+ bool ReplacedPointerArg = false;
+ for (Argument &Arg : F.args()) {
+ if (!Arg.use_empty() && tryToReplaceWithConstant(Solver, &Arg)) {
+ ReplacedPointerArg |= Arg.getType()->isPointerTy();
++IPNumArgsElimed;
- continue;
}
}
+ // If we replaced an argument, the argmemonly and
+ // inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove
+ // them from both the function and callsites.
+ if (ReplacedPointerArg) {
+ AttrBuilder AttributesToRemove;
+ AttributesToRemove.addAttribute(Attribute::ArgMemOnly);
+ AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
+ F.removeAttributes(AttributeList::FunctionIndex, AttributesToRemove);
+
+ for (User *U : F.users()) {
+ auto *CB = dyn_cast<CallBase>(U);
+ if (!CB || CB->getCalledFunction() != &F)
+ continue;
+
+ CB->removeAttributes(AttributeList::FunctionIndex,
+ AttributesToRemove);
+ }
+ }
+ }
+
SmallPtrSet<Value *, 32> InsertedValues;
for (BasicBlock &BB : F) {
if (!Solver.isBlockExecutable(&BB)) {
@@ -1984,45 +2058,11 @@ bool llvm::runIPSCCP(
/*UseLLVMTrap=*/false,
/*PreserveLCSSA=*/false, &DTU);
- // Now that all instructions in the function are constant folded,
- // use ConstantFoldTerminator to get rid of in-edges, record DT updates and
- // delete dead BBs.
- for (BasicBlock *DeadBB : BlocksToErase) {
- // If there are any PHI nodes in this successor, drop entries for BB now.
- for (Value::user_iterator UI = DeadBB->user_begin(),
- UE = DeadBB->user_end();
- UI != UE;) {
- // Grab the user and then increment the iterator early, as the user
- // will be deleted. Step past all adjacent uses from the same user.
- auto *I = dyn_cast<Instruction>(*UI);
- do { ++UI; } while (UI != UE && *UI == I);
-
- // Ignore blockaddress users; BasicBlock's dtor will handle them.
- if (!I) continue;
-
- // If we have forced an edge for an indeterminate value, then force the
- // terminator to fold to that edge.
- forceIndeterminateEdge(I, Solver);
- BasicBlock *InstBB = I->getParent();
- bool Folded = ConstantFoldTerminator(InstBB,
- /*DeleteDeadConditions=*/false,
- /*TLI=*/nullptr, &DTU);
- assert(Folded &&
- "Expect TermInst on constantint or blockaddress to be folded");
- (void) Folded;
- // If we folded the terminator to an unconditional branch to another
- // dead block, replace it with Unreachable, to avoid trying to fold that
- // branch again.
- BranchInst *BI = cast<BranchInst>(InstBB->getTerminator());
- if (BI && BI->isUnconditional() &&
- !Solver.isBlockExecutable(BI->getSuccessor(0))) {
- InstBB->getTerminator()->eraseFromParent();
- new UnreachableInst(InstBB->getContext(), InstBB);
- }
- }
- // Mark dead BB for deletion.
+ for (BasicBlock &BB : F)
+ MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU);
+
+ for (BasicBlock *DeadBB : BlocksToErase)
DTU.deleteBB(DeadBB);
- }
for (BasicBlock &BB : F) {
for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
@@ -2054,9 +2094,47 @@ bool llvm::runIPSCCP(
for (const auto &I : Solver.getTrackedRetVals()) {
Function *F = I.first;
- if (isOverdefined(I.second) || F->getReturnType()->isVoidTy())
+ const ValueLatticeElement &ReturnValue = I.second;
+
+ // If there is a known constant range for the return value, add !range
+ // metadata to the function's call sites.
+ if (ReturnValue.isConstantRange() &&
+ !ReturnValue.getConstantRange().isSingleElement()) {
+ // Do not add range metadata if the return value may include undef.
+ if (ReturnValue.isConstantRangeIncludingUndef())
+ continue;
+
+ auto &CR = ReturnValue.getConstantRange();
+ for (User *User : F->users()) {
+ auto *CB = dyn_cast<CallBase>(User);
+ if (!CB || CB->getCalledFunction() != F)
+ continue;
+
+ // Limit to cases where the return value is guaranteed to be neither
+ // poison nor undef. Poison will be outside any range and currently
+ // values outside of the specified range cause immediate undefined
+ // behavior.
+ if (!isGuaranteedNotToBeUndefOrPoison(CB, nullptr, CB))
+ continue;
+
+ // Do not touch existing metadata for now.
+ // TODO: We should be able to take the intersection of the existing
+ // metadata and the inferred range.
+ if (CB->getMetadata(LLVMContext::MD_range))
+ continue;
+
+ LLVMContext &Context = CB->getParent()->getContext();
+ Metadata *RangeMD[] = {
+ ConstantAsMetadata::get(ConstantInt::get(Context, CR.getLower())),
+ ConstantAsMetadata::get(ConstantInt::get(Context, CR.getUpper()))};
+ CB->setMetadata(LLVMContext::MD_range, MDNode::get(Context, RangeMD));
+ }
continue;
- findReturnsToZap(*F, ReturnsToZap, Solver);
+ }
+ if (F->getReturnType()->isVoidTy())
+ continue;
+ if (isConstant(ReturnValue) || ReturnValue.isUnknownOrUndef())
+ findReturnsToZap(*F, ReturnsToZap, Solver);
}
for (auto F : Solver.getMRVFunctionsTracked()) {
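
For reference, the !range attachment above boils down to a two-operand MDNode holding the half-open bounds of the inferred ConstantRange. A hedged standalone helper showing just that step; the guards from the hunk (no undef, no existing !range, not a single-element range) are assumed to have passed already.

static void attachRangeMetadata(CallBase *CB, const ConstantRange &CR) {
  LLVMContext &Ctx = CB->getContext();
  Metadata *RangeMD[] = {
      ConstantAsMetadata::get(ConstantInt::get(Ctx, CR.getLower())),
      ConstantAsMetadata::get(ConstantInt::get(Ctx, CR.getUpper()))};
  // !range encodes [lower, upper) pairs; the upper bound is exclusive.
  CB->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, RangeMD));
}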
@@ -2068,9 +2146,27 @@ bool llvm::runIPSCCP(
}
// Zap all returns which we've identified as zap to change.
+ SmallSetVector<Function *, 8> FuncZappedReturn;
for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
Function *F = ReturnsToZap[i]->getParent()->getParent();
ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+ // Record all functions that are zapped.
+ FuncZappedReturn.insert(F);
+ }
+
+ // Remove the returned attribute for zapped functions and the
+ // corresponding call sites.
+ for (Function *F : FuncZappedReturn) {
+ for (Argument &A : F->args())
+ F->removeParamAttr(A.getArgNo(), Attribute::Returned);
+ for (Use &U : F->uses()) {
+ // Skip over blockaddr users.
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+ CallBase *CB = cast<CallBase>(U.getUser());
+ for (Use &Arg : CB->args())
+ CB->removeParamAttr(CB->getArgOperandNo(&Arg), Attribute::Returned);
+ }
}
// If we inferred constant or undef values for globals variables, we can
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 89f324deef9f..d111a6ba4241 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -268,6 +268,11 @@ public:
/// Access the dead users for this alloca.
ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+ /// Access Uses that should be dropped if the alloca is promotable.
+ ArrayRef<Use *> getDeadUsesIfPromotable() const {
+ return DeadUseIfPromotable;
+ }
+
/// Access the dead operands referring to this alloca.
///
 /// These are operands which cannot actually be used to refer to the
@@ -322,6 +327,9 @@ private:
/// they come from outside of the allocated space.
SmallVector<Instruction *, 8> DeadUsers;
+ /// Uses which will become dead if we can promote the alloca.
+ SmallVector<Use *, 8> DeadUseIfPromotable;
+
/// Operands which will become dead if we rewrite the alloca.
///
/// These are operands that in their particular use can be replaced with
@@ -459,12 +467,8 @@ class AllocaSlices::partition_iterator
// Remove the uses which have ended in the prior partition. This
// cannot change the max split slice end because we just checked that
// the prior partition ended prior to that max.
- P.SplitTails.erase(llvm::remove_if(P.SplitTails,
- [&](Slice *S) {
- return S->endOffset() <=
- P.EndOffset;
- }),
- P.SplitTails.end());
+ llvm::erase_if(P.SplitTails,
+ [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
assert(llvm::any_of(P.SplitTails,
[&](Slice *S) {
return S->endOffset() == MaxSplitSliceEndOffset;
@@ -780,6 +784,9 @@ private:
LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
return PI.setAborted(&LI);
+ if (isa<ScalableVectorType>(LI.getType()))
+ return PI.setAborted(&LI);
+
uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
}
@@ -795,6 +802,9 @@ private:
SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
return PI.setAborted(&SI);
+ if (isa<ScalableVectorType>(ValOp->getType()))
+ return PI.setAborted(&SI);
+
uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
// If this memory access can be shown to *statically* extend outside the
@@ -920,6 +930,11 @@ private:
// FIXME: What about debug intrinsics? This matches old behavior, but
// doesn't make sense.
void visitIntrinsicInst(IntrinsicInst &II) {
+ if (II.isDroppable()) {
+ AS.DeadUseIfPromotable.push_back(U);
+ return;
+ }
+
if (!IsOffsetKnown)
return PI.setAborted(&II);
@@ -1057,13 +1072,11 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
return;
}
- Slices.erase(
- llvm::remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
- Slices.end());
+ llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
// Sort the uses. This arranges for the offsets to be in ascending order,
// and the sizes to be in descending order.
- std::stable_sort(Slices.begin(), Slices.end());
+ llvm::stable_sort(Slices);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1109,9 +1122,9 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
-static Type *findCommonType(AllocaSlices::const_iterator B,
- AllocaSlices::const_iterator E,
- uint64_t EndOffset) {
+static std::pair<Type *, IntegerType *>
+findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
+ uint64_t EndOffset) {
Type *Ty = nullptr;
bool TyIsCommon = true;
IntegerType *ITy = nullptr;
@@ -1155,7 +1168,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
Ty = UserTy;
}
- return TyIsCommon ? Ty : ITy;
+ return {TyIsCommon ? Ty : nullptr, ITy};
}
/// PHI instructions that use an alloca and are subsequently loaded can be
@@ -1379,7 +1392,8 @@ static void speculateSelectInstLoads(SelectInst &SI) {
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
- SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &NamePrefix) {
if (Indices.empty())
return BasePtr;
@@ -1404,7 +1418,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
Value *BasePtr, Type *Ty, Type *TargetTy,
SmallVectorImpl<Value *> &Indices,
- Twine NamePrefix) {
+ const Twine &NamePrefix) {
if (Ty == TargetTy)
return buildGEP(IRB, BasePtr, Indices, NamePrefix);
@@ -1449,7 +1463,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
Value *Ptr, Type *Ty, APInt &Offset,
Type *TargetTy,
SmallVectorImpl<Value *> &Indices,
- Twine NamePrefix) {
+ const Twine &NamePrefix) {
if (Offset == 0)
return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
NamePrefix);
@@ -1524,7 +1538,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
Value *Ptr, APInt Offset, Type *TargetTy,
SmallVectorImpl<Value *> &Indices,
- Twine NamePrefix) {
+ const Twine &NamePrefix) {
PointerType *Ty = cast<PointerType>(Ptr->getType());
// Don't consider any GEPs through an i8* as natural unless the TargetTy is
@@ -1535,6 +1549,8 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
Type *ElementTy = Ty->getElementType();
if (!ElementTy->isSized())
return nullptr; // We can't GEP through an unsized element.
+ if (isa<ScalableVectorType>(ElementTy))
+ return nullptr;
APInt ElementSize(Offset.getBitWidth(),
DL.getTypeAllocSize(ElementTy).getFixedSize());
if (ElementSize == 0)
@@ -1563,7 +1579,8 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
/// a single GEP as possible, thus making each GEP more independent of the
/// surrounding code.
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
- APInt Offset, Type *PointerTy, Twine NamePrefix) {
+ APInt Offset, Type *PointerTy,
+ const Twine &NamePrefix) {
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
SmallPtrSet<Value *, 4> Visited;
@@ -1825,7 +1842,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (!S.isSplittable())
return false; // Skip any unsplittable intrinsics.
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
- if (!II->isLifetimeStartOrEnd())
+ if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
} else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
// Disable vector promotion when there are loads or stores of an FCA.
@@ -1909,12 +1926,9 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// do that until all the backends are known to produce good code for all
// integer vector types.
if (!HaveCommonEltTy) {
- CandidateTys.erase(
- llvm::remove_if(CandidateTys,
- [](VectorType *VTy) {
- return !VTy->getElementType()->isIntegerTy();
- }),
- CandidateTys.end());
+ llvm::erase_if(CandidateTys, [](VectorType *VTy) {
+ return !VTy->getElementType()->isIntegerTy();
+ });
// If there were no integer vector types, give up.
if (CandidateTys.empty())
@@ -2058,7 +2072,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
if (!S.isSplittable())
return false; // Skip any unsplittable intrinsics.
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
- if (!II->isLifetimeStartOrEnd())
+ if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
} else {
return false;
@@ -2099,8 +2113,7 @@ static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
// that we cover the alloca.
// FIXME: We shouldn't consider split slices that happen to start in the
// partition here...
- bool WholeAllocaOp =
- P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
+ bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
for (const Slice &S : P)
if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
@@ -2193,8 +2206,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
Mask.reserve(NumElements);
for (unsigned i = BeginIndex; i != EndIndex; ++i)
Mask.push_back(i);
- V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask,
- Name + ".extract");
+ V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
@@ -2227,22 +2239,22 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// use a shuffle vector to widen it with undef elements, and then
// a second shuffle vector to select between the loaded vector and the
// incoming vector.
- SmallVector<Constant *, 8> Mask;
+ SmallVector<int, 8> Mask;
Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
if (i >= BeginIndex && i < EndIndex)
- Mask.push_back(IRB.getInt32(i - BeginIndex));
+ Mask.push_back(i - BeginIndex);
else
- Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
- V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(Mask), Name + ".expand");
+ Mask.push_back(-1);
+ V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
- Mask.clear();
+ SmallVector<Constant *, 8> Mask2;
+ Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
- Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
+ Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
- V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
+ V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
return V;
@@ -2446,7 +2458,7 @@ private:
void deleteIfTriviallyDead(Value *V) {
Instruction *I = cast<Instruction>(V);
if (isInstructionTriviallyDead(I))
- Pass.DeadInsts.insert(I);
+ Pass.DeadInsts.push_back(I);
}
Value *rewriteVectorizedLoadInst() {
@@ -2586,7 +2598,7 @@ private:
LI.replaceAllUsesWith(V);
}
- Pass.DeadInsts.insert(&LI);
+ Pass.DeadInsts.push_back(&LI);
deleteIfTriviallyDead(OldOp);
LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
return !LI.isVolatile() && !IsPtrAdjusted;
@@ -2615,7 +2627,7 @@ private:
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
if (AATags)
Store->setAAMetadata(AATags);
- Pass.DeadInsts.insert(&SI);
+ Pass.DeadInsts.push_back(&SI);
LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
@@ -2639,7 +2651,7 @@ private:
LLVMContext::MD_access_group});
if (AATags)
Store->setAAMetadata(AATags);
- Pass.DeadInsts.insert(&SI);
+ Pass.DeadInsts.push_back(&SI);
LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
@@ -2713,7 +2725,7 @@ private:
NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
if (NewSI->isAtomic())
NewSI->setAlignment(SI.getAlign());
- Pass.DeadInsts.insert(&SI);
+ Pass.DeadInsts.push_back(&SI);
deleteIfTriviallyDead(OldOp);
LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
@@ -2774,11 +2786,11 @@ private:
}
// Record this instruction for deletion.
- Pass.DeadInsts.insert(&II);
+ Pass.DeadInsts.push_back(&II);
Type *AllocaTy = NewAI.getAllocatedType();
Type *ScalarTy = AllocaTy->getScalarType();
-
+
const bool CanContinue = [&]() {
if (VecTy || IntTy)
return true;
@@ -2944,7 +2956,7 @@ private:
return false;
}
// Record this instruction for deletion.
- Pass.DeadInsts.insert(&II);
+ Pass.DeadInsts.push_back(&II);
// Strip all inbounds GEPs and pointer casts to try to dig out any root
// alloca that should be re-examined after rewriting this instruction.
@@ -3074,13 +3086,21 @@ private:
}
bool visitIntrinsicInst(IntrinsicInst &II) {
- assert(II.isLifetimeStartOrEnd());
+ assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
+ "Unexpected intrinsic!");
LLVM_DEBUG(dbgs() << " original: " << II << "\n");
- assert(II.getArgOperand(1) == OldPtr);
// Record this instruction for deletion.
- Pass.DeadInsts.insert(&II);
+ Pass.DeadInsts.push_back(&II);
+
+ if (II.isDroppable()) {
+ assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
+ // TODO: For now we forget assumed information; this can be improved.
+ OldPtr->dropDroppableUsesIn(II);
+ return true;
+ }
+ assert(II.getArgOperand(1) == OldPtr);
// Lifetime intrinsics are only promotable if they cover the whole alloca.
// Therefore, we drop lifetime intrinsics which don't cover the whole
// alloca.
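
The droppable-use plumbing above comes together in rewritePartition (see the later hunk): once a partition is known to be promotable, the recorded uses, currently llvm.assume operand bundles, are simply dropped and any pointer computation left dead is cleaned up. A compressed illustration with an invented helper name and an eager erase instead of SROA's deferred DeadInsts list:

static void dropUsesBlockingPromotion(ArrayRef<Use *> DeadUsesIfPromotable) {
  SmallVector<Instruction *, 4> NowDead;
  for (Use *U : DeadUsesIfPromotable) {
    auto *UsedPtr = dyn_cast<Instruction>(U->get());
    Value::dropDroppableUse(*U);        // detach the droppable (assume) use
    if (UsedPtr && isInstructionTriviallyDead(UsedPtr))
      NowDead.push_back(UsedPtr);       // e.g. a GEP that only fed the assume
  }
  for (Instruction *I : NowDead)
    I->eraseFromParent();
}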
@@ -3455,7 +3475,7 @@ private:
<< "\n " << GEPI);
IRBuilderTy Builder(&GEPI);
- SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
+ SmallVector<Value *, 4> Index(GEPI.indices());
bool IsInBounds = GEPI.isInBounds();
Value *True = Sel->getTrueValue();
@@ -3509,20 +3529,27 @@ private:
<< "\n " << GEPI
<< "\n to: ");
- SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
+ SmallVector<Value *, 4> Index(GEPI.indices());
bool IsInBounds = GEPI.isInBounds();
IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
PHI->getNumIncomingValues(),
PHI->getName() + ".sroa.phi");
for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
- Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
+ BasicBlock *B = PHI->getIncomingBlock(I);
+ Value *NewVal = nullptr;
+ int Idx = NewPN->getBasicBlockIndex(B);
+ if (Idx >= 0) {
+ NewVal = NewPN->getIncomingValue(Idx);
+ } else {
+ Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
- IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
- Value *NewVal = IsInBounds
- ? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
- : B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
- NewPN->addIncoming(NewVal, PHI->getIncomingBlock(I));
+ IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
+ NewVal = IsInBounds
+ ? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
+ : B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
+ }
+ NewPN->addIncoming(NewVal, B);
}
Visited.erase(&GEPI);
@@ -3864,63 +3891,53 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// such loads and stores, we can only pre-split them if their splits exactly
// match relative to their starting offset. We have to verify this prior to
// any rewriting.
- Stores.erase(
- llvm::remove_if(Stores,
- [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
- // Lookup the load we are storing in our map of split
- // offsets.
- auto *LI = cast<LoadInst>(SI->getValueOperand());
- // If it was completely unsplittable, then we're done,
- // and this store can't be pre-split.
- if (UnsplittableLoads.count(LI))
- return true;
-
- auto LoadOffsetsI = SplitOffsetsMap.find(LI);
- if (LoadOffsetsI == SplitOffsetsMap.end())
- return false; // Unrelated loads are definitely safe.
- auto &LoadOffsets = LoadOffsetsI->second;
-
- // Now lookup the store's offsets.
- auto &StoreOffsets = SplitOffsetsMap[SI];
-
- // If the relative offsets of each split in the load and
- // store match exactly, then we can split them and we
- // don't need to remove them here.
- if (LoadOffsets.Splits == StoreOffsets.Splits)
- return false;
-
- LLVM_DEBUG(
- dbgs()
- << " Mismatched splits for load and store:\n"
- << " " << *LI << "\n"
- << " " << *SI << "\n");
-
- // We've found a store and load that we need to split
- // with mismatched relative splits. Just give up on them
- // and remove both instructions from our list of
- // candidates.
- UnsplittableLoads.insert(LI);
- return true;
- }),
- Stores.end());
+ llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ });
// Now we have to go *back* through all the stores, because a later store may
// have caused an earlier store's load to become unsplittable and if it is
// unsplittable for the later store, then we can't rely on it being split in
// the earlier store either.
- Stores.erase(llvm::remove_if(Stores,
- [&UnsplittableLoads](StoreInst *SI) {
- auto *LI =
- cast<LoadInst>(SI->getValueOperand());
- return UnsplittableLoads.count(LI);
- }),
- Stores.end());
+ llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ });
// Once we've established all the loads that can't be split for some reason,
// filter any that made it into our list out.
- Loads.erase(llvm::remove_if(Loads,
- [&UnsplittableLoads](LoadInst *LI) {
- return UnsplittableLoads.count(LI);
- }),
- Loads.end());
+ llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ });
// If no loads or stores are left, there is no pre-splitting to be done for
// this alloca.
@@ -4057,7 +4074,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
// Mark the original store as dead.
- DeadInsts.insert(SI);
+ DeadInsts.push_back(SI);
}
// Save the split loads if there are deferred stores among the users.
@@ -4065,7 +4082,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
// Mark the original load as dead and kill the original slice.
- DeadInsts.insert(LI);
+ DeadInsts.push_back(LI);
Offsets.S->kill();
}
@@ -4187,15 +4204,14 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// trivial CSE, including instcombine.
if (LI->hasOneUse()) {
assert(*LI->user_begin() == SI && "Single use isn't this store!");
- DeadInsts.insert(LI);
+ DeadInsts.push_back(LI);
}
- DeadInsts.insert(SI);
+ DeadInsts.push_back(SI);
Offsets.S->kill();
}
  // Remove the killed slices that have been pre-split.
- AS.erase(llvm::remove_if(AS, [](const Slice &S) { return S.isDead(); }),
- AS.end());
+ llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
// Insert our new slices. This will sort and merge them into the sorted
// sequence.
@@ -4209,11 +4225,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  // Finally, don't try to promote any allocas that now require re-splitting.
// They have already been added to the worklist above.
- PromotableAllocas.erase(
- llvm::remove_if(
- PromotableAllocas,
- [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
- PromotableAllocas.end());
+ llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
+ return ResplitPromotableAllocas.count(AI);
+ });
return true;
}
@@ -4235,13 +4249,21 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// or an i8 array of an appropriate size.
Type *SliceTy = nullptr;
const DataLayout &DL = AI.getModule()->getDataLayout();
- if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
- if (DL.getTypeAllocSize(CommonUseTy).getFixedSize() >= P.size())
- SliceTy = CommonUseTy;
+ std::pair<Type *, IntegerType *> CommonUseTy =
+ findCommonType(P.begin(), P.end(), P.endOffset());
+ // Do all uses operate on the same type?
+ if (CommonUseTy.first)
+ if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
+ SliceTy = CommonUseTy.first;
+ // If not, can we find an appropriate subtype in the original allocated type?
if (!SliceTy)
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size()))
SliceTy = TypePartitionTy;
+ // If still not, can we use the largest bitwidth integer type used?
+ if (!SliceTy && CommonUseTy.second)
+ if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
+ SliceTy = CommonUseTy.second;
if ((!SliceTy || (SliceTy->isArrayTy() &&
SliceTy->getArrayElementType()->isIntegerTy())) &&
DL.isLegalInteger(P.size() * 8))
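
Taken together with the earlier findCommonType change, the slice-type choice above follows a three-step preference before the integer/array fallbacks. A rough restatement as a helper; Partition and getTypePartition are SROA-internal, so this is a sketch of the logic rather than reusable code.

static Type *pickSliceType(const DataLayout &DL, AllocaInst &AI, Partition &P,
                           std::pair<Type *, IntegerType *> CommonUseTy) {
  auto Fits = [&](Type *Ty) {
    return Ty && DL.getTypeAllocSize(Ty).getFixedSize() >= P.size();
  };
  if (Fits(CommonUseTy.first))              // 1. type used by every slice
    return CommonUseTy.first;
  if (Type *SubTy = getTypePartition(DL, AI.getAllocatedType(),
                                     P.beginOffset(), P.size()))
    return SubTy;                           // 2. matching subtype of the alloca
  if (Fits(CommonUseTy.second))             // 3. widest integer use type seen
    return CommonUseTy.second;
  return nullptr;                           // caller applies the fallbacks below
}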
@@ -4331,6 +4353,13 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
}
if (Promotable) {
+ for (Use *U : AS.getDeadUsesIfPromotable()) {
+ auto *OldInst = dyn_cast<Instruction>(U->get());
+ Value::dropDroppableUse(*U);
+ if (OldInst)
+ if (isInstructionTriviallyDead(OldInst))
+ DeadInsts.push_back(OldInst);
+ }
if (PHIUsers.empty() && SelectUsers.empty()) {
// Promote the alloca.
PromotableAllocas.push_back(NewAI);
@@ -4465,10 +4494,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
// Migrate debug information from the old alloca to the new alloca(s)
// and the individual partitions.
TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
- if (!DbgDeclares.empty()) {
- auto *Var = DbgDeclares.front()->getVariable();
- auto *Expr = DbgDeclares.front()->getExpression();
- auto VarSize = Var->getSizeInBits();
+ for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
+ auto *Expr = DbgDeclare->getExpression();
DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
uint64_t AllocaSize =
DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
@@ -4499,6 +4526,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
// The alloca may be larger than the variable.
+ auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
if (VarSize) {
if (Size > *VarSize)
Size = *VarSize;
@@ -4516,12 +4544,21 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
}
- // Remove any existing intrinsics describing the same alloca.
- for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca))
- OldDII->eraseFromParent();
+ // Remove any existing intrinsics on the new alloca describing
+ // the variable fragment.
+ for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
+ auto SameVariableFragment = [](const DbgVariableIntrinsic *LHS,
+ const DbgVariableIntrinsic *RHS) {
+ return LHS->getVariable() == RHS->getVariable() &&
+ LHS->getDebugLoc()->getInlinedAt() ==
+ RHS->getDebugLoc()->getInlinedAt();
+ };
+ if (SameVariableFragment(OldDII, DbgDeclare))
+ OldDII->eraseFromParent();
+ }
- DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr,
- DbgDeclares.front()->getDebugLoc(), &AI);
+ DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
+ DbgDeclare->getDebugLoc(), &AI);
}
}
return Changed;
@@ -4538,7 +4575,7 @@ void SROA::clobberUse(Use &U) {
// minimal.
if (Instruction *OldI = dyn_cast<Instruction>(OldV))
if (isInstructionTriviallyDead(OldI)) {
- DeadInsts.insert(OldI);
+ DeadInsts.push_back(OldI);
}
}
@@ -4587,7 +4624,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
// And mark it for deletion.
- DeadInsts.insert(DeadUser);
+ DeadInsts.push_back(DeadUser);
Changed = true;
}
for (Use *DeadOp : AS.getDeadOperands()) {
@@ -4625,7 +4662,8 @@ bool SROA::deleteDeadInstructions(
SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
bool Changed = false;
while (!DeadInsts.empty()) {
- Instruction *I = DeadInsts.pop_back_val();
+ Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
+ if (!I) continue;
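+    // DeadInsts no longer stores raw Instruction pointers (presumably weak
+    // value handles), so an entry may have been nulled out if its instruction
+    // was already erased elsewhere, and the same instruction can be queued
+    // more than once now that push_back replaces set insertion.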
LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
// If the instruction is an alloca, find the possible dbg.declare connected
@@ -4644,7 +4682,7 @@ bool SROA::deleteDeadInstructions(
// Zero out the operand and see if it becomes trivially dead.
Operand = nullptr;
if (isInstructionTriviallyDead(U))
- DeadInsts.insert(U);
+ DeadInsts.push_back(U);
}
++NumDeleted;
@@ -4707,8 +4745,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
Worklist.remove_if(IsInSet);
PostPromotionWorklist.remove_if(IsInSet);
- PromotableAllocas.erase(llvm::remove_if(PromotableAllocas, IsInSet),
- PromotableAllocas.end());
+ llvm::erase_if(PromotableAllocas, IsInSet);
DeletedAllocas.clear();
}
}
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index 9d088547b436..dba3dba24e25 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -34,14 +34,14 @@ using namespace llvm;
/// ScalarOpts library.
void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCELegacyPassPass(Registry);
+ initializeAnnotationRemarksLegacyPass(Registry);
initializeBDCELegacyPassPass(Registry);
initializeAlignmentFromAssumptionsPass(Registry);
initializeCallSiteSplittingLegacyPassPass(Registry);
initializeConstantHoistingLegacyPassPass(Registry);
- initializeConstantPropagationPass(Registry);
+ initializeConstraintEliminationPass(Registry);
initializeCorrelatedValuePropagationPass(Registry);
initializeDCELegacyPassPass(Registry);
- initializeDeadInstEliminationPass(Registry);
initializeDivRemPairsLegacyPassPass(Registry);
initializeScalarizerLegacyPassPass(Registry);
initializeDSELegacyPassPass(Registry);
@@ -67,22 +67,24 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopDeletionLegacyPassPass(Registry);
initializeLoopAccessLegacyAnalysisPass(Registry);
initializeLoopInstSimplifyLegacyPassPass(Registry);
- initializeLoopInterchangePass(Registry);
+ initializeLoopInterchangeLegacyPassPass(Registry);
+ initializeLoopFlattenLegacyPassPass(Registry);
initializeLoopPredicationLegacyPassPass(Registry);
initializeLoopRotateLegacyPassPass(Registry);
initializeLoopStrengthReducePass(Registry);
- initializeLoopRerollPass(Registry);
+ initializeLoopRerollLegacyPassPass(Registry);
initializeLoopUnrollPass(Registry);
initializeLoopUnrollAndJamPass(Registry);
initializeLoopUnswitchPass(Registry);
initializeWarnMissedTransformationsLegacyPass(Registry);
- initializeLoopVersioningLICMPass(Registry);
+ initializeLoopVersioningLICMLegacyPassPass(Registry);
initializeLoopIdiomRecognizeLegacyPassPass(Registry);
initializeLowerAtomicLegacyPassPass(Registry);
initializeLowerConstantIntrinsicsPass(Registry);
initializeLowerExpectIntrinsicPass(Registry);
initializeLowerGuardIntrinsicLegacyPassPass(Registry);
initializeLowerMatrixIntrinsicsLegacyPassPass(Registry);
+ initializeLowerMatrixIntrinsicsMinimalLegacyPassPass(Registry);
initializeLowerWidenableConditionLegacyPassPass(Registry);
initializeMemCpyOptLegacyPassPass(Registry);
initializeMergeICmpsLegacyPassPass(Registry);
@@ -91,25 +93,26 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
initializeReassociateLegacyPassPass(Registry);
initializeRedundantDbgInstEliminationPass(Registry);
- initializeRegToMemPass(Registry);
+ initializeRegToMemLegacyPass(Registry);
initializeRewriteStatepointsForGCLegacyPassPass(Registry);
+ initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
initializeSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
- initializeStructurizeCFGPass(Registry);
+ initializeStructurizeCFGLegacyPassPass(Registry);
initializeSimpleLoopUnswitchLegacyPassPass(Registry);
initializeSinkingLegacyPassPass(Registry);
initializeTailCallElimPass(Registry);
- initializeSeparateConstOffsetFromGEPPass(Registry);
+ initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry);
initializeSpeculativeExecutionLegacyPassPass(Registry);
- initializeStraightLineStrengthReducePass(Registry);
+ initializeStraightLineStrengthReduceLegacyPassPass(Registry);
initializePlaceBackedgeSafepointsImplPass(Registry);
initializePlaceSafepointsPass(Registry);
initializeFloat2IntLegacyPassPass(Registry);
initializeLoopDistributeLegacyPass(Registry);
initializeLoopLoadEliminationPass(Registry);
initializeLoopSimplifyCFGLegacyPassPass(Registry);
- initializeLoopVersioningPassPass(Registry);
+ initializeLoopVersioningLegacyPassPass(Registry);
initializeEntryExitInstrumenterPass(Registry);
initializePostInlineEntryExitInstrumenterPass(Registry);
}
@@ -139,7 +142,7 @@ void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
}
void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createCFGSimplificationPass(1, false, false, true));
+ unwrap(PM)->add(createCFGSimplificationPass());
}
void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
@@ -166,6 +169,10 @@ void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createIndVarSimplifyPass());
}
+void LLVMAddInstructionSimplifyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstSimplifyLegacyPass());
+}
+
void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createJumpThreadingPass());
}
@@ -182,6 +189,10 @@ void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopDeletionPass());
}
+void LLVMAddLoopFlattenPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopFlattenPass());
+}
+
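+// Minimal C API usage sketch (illustrative only; `M` is assumed to be an
+// existing LLVMModuleRef, not something defined here):
+//
+//   LLVMPassManagerRef FPM = LLVMCreateFunctionPassManagerForModule(M);
+//   LLVMAddInstructionSimplifyPass(FPM);
+//   LLVMAddLoopFlattenPass(FPM);
+//   LLVMInitializeFunctionPassManager(FPM);
+//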
void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopIdiomPass());
}
@@ -247,10 +258,6 @@ void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createTailCallEliminationPass());
}
-void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createConstantPropagationPass());
-}
-
void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createDemoteRegisterToMemoryPass());
}
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 000000000000..afa2d1bc7966
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,948 @@
+//===- ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ----===//
+// intrinsics
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics - when unsupported by the target
+// - with a chain of basic blocks that deal with the elements one-by-one if the
+// appropriate mask bit is set.
+//
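+// For illustration (a sketch; the 4-element width and value names are
+// assumptions, not taken from this file), a call this pass may scalarize when
+// the target reports it as unsupported looks like:
+//
+//   %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+//            <4 x i32>* %p, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+//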
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Scalar.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrinLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit ScalarizeMaskedMemIntrinLegacyPass() : FunctionPass(ID) {
+ initializeScalarizeMaskedMemIntrinLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "Scalarize Masked Memory Intrinsics";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT,
+ const TargetTransformInfo &TTI, const DataLayout &DL);
+static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL);
+
+char ScalarizeMaskedMemIntrinLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrinLegacyPass, DEBUG_TYPE,
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ScalarizeMaskedMemIntrinLegacyPass, DEBUG_TYPE,
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinLegacyPass() {
+ return new ScalarizeMaskedMemIntrinLegacyPass();
+}
+
+static bool isConstantIntVector(Value *Mask) {
+ Constant *C = dyn_cast<Constant>(Mask);
+ if (!C)
+ return false;
+
+ unsigned NumElts = cast<FixedVectorType>(Mask->getType())->getNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *CElt = C->getAggregateElement(i);
+ if (!CElt || !isa<ConstantInt>(CElt))
+ return false;
+ }
+
+ return true;
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32> @llvm.masked.load(<16 x i32>* %addr, i32 align,
+//                              <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, loading the elements one-by-one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// br i1 %2, label %cond.load, label %else
+//
+// cond.load: ; preds = %0
+// %3 = getelementptr i32* %1, i32 0
+// %4 = load i32* %3
+// %5 = insertelement <16 x i32> %passthru, i32 %4, i32 0
+// br label %else
+//
+// else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %5, %cond.load ], [ undef, %0 ]
+// %6 = extractelement <16 x i1> %mask, i32 1
+// br i1 %6, label %cond.load1, label %else2
+//
+// cond.load1: ; preds = %else
+// %7 = getelementptr i32* %1, i32 1
+// %8 = load i32* %7
+// %9 = insertelement <16 x i32> %res.phi.else, i32 %8, i32 1
+// br label %else2
+//
+// else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %9, %cond.load1 ], [ %res.phi.else, %else ]
+// %10 = extractelement <16 x i1> %mask, i32 2
+// br i1 %10, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ VectorType *VecType = cast<FixedVectorType>(CI->getType());
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
+ Value *NewI = Builder.CreateAlignedLoad(VecType, Ptr, AlignVal);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ const Align AdjustedAlignVal =
+ commonAlignment(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*
+ Type *NewPtrType =
+ EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = cast<FixedVectorType>(VecType)->getNumElements();
+
+ // The result vector
+ Value *VResult = Src0;
+
+ if (isConstantIntVector(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Gep, AdjustedAlignVal);
+ VResult = Builder.CreateInsertElement(VResult, Load, Idx);
+ }
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
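+  // For illustration (a sketch; the 4-wide mask and value names below are
+  // assumptions, not produced verbatim by this code): for a <4 x i1> mask the
+  // loop below emits roughly
+  //   %scalar_mask = bitcast <4 x i1> %mask to i4
+  //   %bit  = and i4 %scalar_mask, 2       ; tests lane Idx == 1
+  //   %pred = icmp ne i4 %bit, 0
+  // and branches on %pred instead of extracting lane Idx from the i1 vector.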
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %mask_1, 0
+ // br i1 %mask_1, label %cond.load, label %else
+ //
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx);
+ }
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+ "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Gep, AdjustedAlignVal);
+ Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ // Create the phi to join the new and previous value.
+ PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(NewVResult, CondBlock);
+ Phi->addIncoming(VResult, PrevIfBlock);
+ VResult = Phi;
+ }
+
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask)
+// to a chain of basic blocks that store the elements one-by-one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// br i1 %2, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %3 = extractelement <16 x i32> %val, i32 0
+// %4 = getelementptr i32* %1, i32 0
+// store i32 %3, i32* %4
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %5 = extractelement <16 x i1> %mask, i32 1
+// br i1 %5, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %6 = extractelement <16 x i32> %val, i32 1
+// %7 = getelementptr i32* %1, i32 1
+// store i32 %6, i32* %7
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ auto *VecType = cast<VectorType>(Src->getType());
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
+ Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ const Align AdjustedAlignVal =
+ commonAlignment(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*
+ Type *NewPtrType =
+ EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = cast<FixedVectorType>(VecType)->getNumElements();
+
+ if (isConstantIntVector(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Idx);
+ Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
+ Builder.CreateAlignedStore(OneElt, Gep, AdjustedAlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %mask_1, 0
+ // br i1 %mask_1, label %cond.store, label %else
+ //
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx);
+ }
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32* %1, i32 0
+  // store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Idx);
+ Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
+ Builder.CreateAlignedStore(OneElt, Gep, AdjustedAlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %Ptrs, i32 4,
+//                                       <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, loading the elements one-by-one if
+// the appropriate mask bit is set
+//
+// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// br i1 %Mask0, label %cond.load, label %else
+//
+// cond.load:
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// %Load0 = load i32, i32* %Ptr0, align 4
+// %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32>[%Res0, %cond.load], [undef, %0]
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// br i1 %Mask1, label %cond.load1, label %else2
+//
+// cond.load1:
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// %Load1 = load i32, i32* %Ptr1, align 4
+// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
+// br label %else2
+// . . .
+// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void scalarizeMaskedGather(CallInst *CI, bool &ModifiedDT) {
+ Value *Ptrs = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ auto *VecType = cast<FixedVectorType>(CI->getType());
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // The result vector
+ Value *VResult = Src0;
+ unsigned VectorWidth = VecType->getNumElements();
+
+  // Take the fast path if the mask is a vector of constants.
+ if (isConstantIntVector(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(EltTy, Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult =
+ Builder.CreateInsertElement(VResult, Load, Idx, "Res" + Twine(Idx));
+ }
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %mask_1, 0
+ // br i1 %Mask1, label %cond.load, label %else
+ //
+
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(EltTy, Ptr, AlignVal, "Load" + Twine(Idx));
+ Value *NewVResult =
+ Builder.CreateInsertElement(VResult, Load, Idx, "Res" + Twine(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(NewVResult, CondBlock);
+ Phi->addIncoming(VResult, PrevIfBlock);
+ VResult = Phi;
+ }
+
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
+//                                  <16 x i1> %Mask)
+// to a chain of basic blocks that store the elements one-by-one if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// br i1 %Mask0, label %cond.store, label %else
+//
+// cond.store:
+// %Elt0 = extractelement <16 x i32> %Src, i32 0
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* %Ptr0, align 4
+// br label %else
+//
+// else:
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// br i1 %Mask1, label %cond.store1, label %else2
+//
+// cond.store1:
+// %Elt1 = extractelement <16 x i32> %Src, i32 1
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 %Elt1, i32* %Ptr1, align 4
+// br label %else2
+// . . .
+static void scalarizeMaskedScatter(CallInst *CI, bool &ModifiedDT) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptrs = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ auto *SrcFVTy = cast<FixedVectorType>(Src->getType());
+
+ assert(
+ isa<VectorType>(Ptrs->getType()) &&
+ isa<PointerType>(cast<VectorType>(Ptrs->getType())->getElementType()) &&
+ "Vector of pointers is expected in masked scatter intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
+ unsigned VectorWidth = SrcFVTy->getNumElements();
+
+  // Take the fast path if the mask is a vector of constants.
+ if (isConstantIntVector(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *OneElt =
+ Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %mask_1, 0
+ // br i1 %Mask1, label %cond.store, label %else
+ //
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
+
+ // Create "cond" block
+ //
+ // %Elt1 = extractelement <16 x i32> %Src, i32 1
+ // %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+  // store i32 %Elt1, i32* %Ptr1
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
+static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Mask = CI->getArgOperand(1);
+ Value *PassThru = CI->getArgOperand(2);
+
+ auto *VecType = cast<FixedVectorType>(CI->getType());
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned VectorWidth = VecType->getNumElements();
+
+ // The result vector
+ Value *VResult = PassThru;
+
+  // Take the fast path if the mask is a vector of constants: create a
+  // build_vector pattern, with loads/undefs as necessary, and then
+  // shuffle-blend it with the pass-through value.
+ if (isConstantIntVector(Mask)) {
+ unsigned MemIndex = 0;
+ VResult = UndefValue::get(VecType);
+ SmallVector<int, 16> ShuffleMask(VectorWidth, UndefMaskElem);
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ Value *InsertElt;
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue()) {
+ InsertElt = UndefValue::get(EltTy);
+ ShuffleMask[Idx] = Idx + VectorWidth;
+ } else {
+ Value *NewPtr =
+ Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
+ InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1),
+ "Load" + Twine(Idx));
+ ShuffleMask[Idx] = Idx;
+ ++MemIndex;
+ }
+ VResult = Builder.CreateInsertElement(VResult, InsertElt, Idx,
+ "Res" + Twine(Idx));
+ }
+ VResult = Builder.CreateShuffleVector(VResult, PassThru, ShuffleMask);
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // br i1 %mask_1, label %cond.load, label %else
+ //
+
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+ "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, Align(1));
+ Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
+
+ // Move the pointer if there are more blocks to come.
+ Value *NewPtr;
+ if ((Idx + 1) != VectorWidth)
+ NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ // Create the phi to join the new and previous value.
+ PHINode *ResultPhi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ ResultPhi->addIncoming(NewVResult, CondBlock);
+ ResultPhi->addIncoming(VResult, PrevIfBlock);
+ VResult = ResultPhi;
+
+ // Add a PHI for the pointer if this isn't the last iteration.
+ if ((Idx + 1) != VectorWidth) {
+ PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+ PtrPhi->addIncoming(NewPtr, CondBlock);
+ PtrPhi->addIncoming(Ptr, PrevIfBlock);
+ Ptr = PtrPhi;
+ }
+ }
+
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
+static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+
+ auto *VecType = cast<FixedVectorType>(Src->getType());
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Type *EltTy = VecType->getElementType();
+
+ unsigned VectorWidth = VecType->getNumElements();
+
+  // Take the fast path if the mask is a vector of constants.
+ if (isConstantIntVector(Mask)) {
+ unsigned MemIndex = 0;
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *OneElt =
+ Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx));
+ Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
+ Builder.CreateAlignedStore(OneElt, NewPtr, Align(1));
+ ++MemIndex;
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // br i1 %mask_1, label %cond.store, label %else
+ //
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32* %1, i32 0
+  // store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Idx);
+ Builder.CreateAlignedStore(OneElt, Ptr, Align(1));
+
+ // Move the pointer if there are more blocks to come.
+ Value *NewPtr;
+ if ((Idx + 1) != VectorWidth)
+ NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ // Add a PHI for the pointer if this isn't the last iteration.
+ if ((Idx + 1) != VectorWidth) {
+ PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+ PtrPhi->addIncoming(NewPtr, CondBlock);
+ PtrPhi->addIncoming(Ptr, PrevIfBlock);
+ Ptr = PtrPhi;
+ }
+ }
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
+static bool runImpl(Function &F, const TargetTransformInfo &TTI) {
+ bool EverMadeChange = false;
+ bool MadeChange = true;
+ auto &DL = F.getParent()->getDataLayout();
+ while (MadeChange) {
+ MadeChange = false;
+ for (Function::iterator I = F.begin(); I != F.end();) {
+ BasicBlock *BB = &*I++;
+ bool ModifiedDTOnIteration = false;
+ MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration, TTI, DL);
+
+ // Restart BB iteration if the dominator tree of the Function was changed
+ if (ModifiedDTOnIteration)
+ break;
+ }
+
+ EverMadeChange |= MadeChange;
+ }
+ return EverMadeChange;
+}
+
+bool ScalarizeMaskedMemIntrinLegacyPass::runOnFunction(Function &F) {
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runImpl(F, TTI);
+}
+
+PreservedAnalyses
+ScalarizeMaskedMemIntrinPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!runImpl(F, TTI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<TargetIRAnalysis>();
+ return PA;
+}
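+// Minimal new-pass-manager usage sketch (illustrative only; `F` and `FAM` are
+// assumed to exist, with TargetIRAnalysis registered in `FAM`):
+//
+//   FunctionPassManager FPM;
+//   FPM.addPass(ScalarizeMaskedMemIntrinPass());
+//   FPM.run(F, FAM);
+//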
+
+static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL) {
+ bool MadeChange = false;
+
+ BasicBlock::iterator CurInstIterator = BB.begin();
+ while (CurInstIterator != BB.end()) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
+ MadeChange |= optimizeCallInst(CI, ModifiedDT, TTI, DL);
+ if (ModifiedDT)
+ return true;
+ }
+
+ return MadeChange;
+}
+
+static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (II) {
+ // The scalarization code below does not work for scalable vectors.
+ if (isa<ScalableVectorType>(II->getType()) ||
+ any_of(II->arg_operands(),
+ [](Value *V) { return isa<ScalableVectorType>(V->getType()); }))
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::masked_load:
+ // Scalarize unsupported vector masked load
+ if (TTI.isLegalMaskedLoad(
+ CI->getType(),
+ cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue()))
+ return false;
+ scalarizeMaskedLoad(CI, ModifiedDT);
+ return true;
+ case Intrinsic::masked_store:
+ if (TTI.isLegalMaskedStore(
+ CI->getArgOperand(0)->getType(),
+ cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue()))
+ return false;
+ scalarizeMaskedStore(CI, ModifiedDT);
+ return true;
+ case Intrinsic::masked_gather: {
+ unsigned AlignmentInt =
+ cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ Type *LoadTy = CI->getType();
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(MaybeAlign(AlignmentInt), LoadTy);
+ if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+ return false;
+ scalarizeMaskedGather(CI, ModifiedDT);
+ return true;
+ }
+ case Intrinsic::masked_scatter: {
+ unsigned AlignmentInt =
+ cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+ Type *StoreTy = CI->getArgOperand(0)->getType();
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(MaybeAlign(AlignmentInt), StoreTy);
+ if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+ return false;
+ scalarizeMaskedScatter(CI, ModifiedDT);
+ return true;
+ }
+ case Intrinsic::masked_expandload:
+ if (TTI.isLegalMaskedExpandLoad(CI->getType()))
+ return false;
+ scalarizeMaskedExpandLoad(CI, ModifiedDT);
+ return true;
+ case Intrinsic::masked_compressstore:
+ if (TTI.isLegalMaskedCompressStore(CI->getArgOperand(0)->getType()))
+ return false;
+ scalarizeMaskedCompressStore(CI, ModifiedDT);
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 851bd79cd6d8..c95984fe198f 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -398,7 +398,8 @@ void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
continue;
Instruction *Old = cast<Instruction>(V);
- CV[I]->takeName(Old);
+ if (isa<Instruction>(CV[I]))
+ CV[I]->takeName(Old);
Old->replaceAllUsesWith(CV[I]);
PotentiallyDeadInstrs.emplace_back(Old);
}
@@ -732,7 +733,7 @@ bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
auto *MidTy = FixedVectorType::get(SrcVT->getElementType(), FanIn);
unsigned Op0I = 0;
for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
- Value *V = UndefValue::get(MidTy);
+ Value *V = PoisonValue::get(MidTy);
for (unsigned MidI = 0; MidI < FanIn; ++MidI)
V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
BCI.getName() + ".i" + Twine(ResI)
@@ -931,7 +932,7 @@ bool ScalarizerVisitor::finish() {
if (!Op->use_empty()) {
// The value is still needed, so recreate it using a series of
// InsertElements.
- Value *Res = UndefValue::get(Op->getType());
+ Value *Res = PoisonValue::get(Op->getType());
if (auto *Ty = dyn_cast<VectorType>(Op->getType())) {
BasicBlock *BB = Op->getParent();
unsigned Count = cast<FixedVectorType>(Ty)->getNumElements();
@@ -941,13 +942,13 @@ bool ScalarizerVisitor::finish() {
for (unsigned I = 0; I < Count; ++I)
Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
Op->getName() + ".upto" + Twine(I));
+ Res->takeName(Op);
} else {
assert(CV.size() == 1 && Op->getType() == CV[0]->getType());
Res = CV[0];
if (Op == Res)
continue;
}
- Res->takeName(Op);
Op->replaceAllUsesWith(Res);
}
PotentiallyDeadInstrs.emplace_back(Op);
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index f1d2e3c1ecfa..f216956406b6 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -155,6 +155,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
@@ -177,6 +178,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
@@ -342,13 +344,14 @@ private:
/// A pass that tries to split every GEP in the function into a variadic
/// base and a constant offset. It is a FunctionPass because searching for the
/// constant offset may inspect other basic blocks.
-class SeparateConstOffsetFromGEP : public FunctionPass {
+class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
public:
static char ID;
- SeparateConstOffsetFromGEP(bool LowerGEP = false)
+ SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
: FunctionPass(ID), LowerGEP(LowerGEP) {
- initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
+ initializeSeparateConstOffsetFromGEPLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -360,14 +363,26 @@ public:
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
- bool doInitialization(Module &M) override {
- DL = &M.getDataLayout();
- return false;
- }
-
bool runOnFunction(Function &F) override;
private:
+ bool LowerGEP;
+};
+
+/// A pass that tries to split every GEP in the function into a variadic
+/// base and a constant offset. It is a FunctionPass because searching for the
+/// constant offset may inspect other basic blocks.
+class SeparateConstOffsetFromGEP {
+public:
+ SeparateConstOffsetFromGEP(
+ DominatorTree *DT, ScalarEvolution *SE, LoopInfo *LI,
+ TargetLibraryInfo *TLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LowerGEP)
+ : DT(DT), SE(SE), LI(LI), TLI(TLI), GetTTI(GetTTI), LowerGEP(LowerGEP) {}
+
+ bool run(Function &F);
+
+private:
/// Tries to split the given GEP into a variadic base and a constant offset,
/// and returns true if the splitting succeeds.
bool splitGEP(GetElementPtrInst *GEP);
@@ -450,9 +465,10 @@ private:
const DataLayout *DL = nullptr;
DominatorTree *DT = nullptr;
ScalarEvolution *SE;
-
LoopInfo *LI;
TargetLibraryInfo *TLI;
+ // Retrieved lazily since not always used.
+ function_ref<TargetTransformInfo &(Function &)> GetTTI;
/// Whether to lower a GEP with multiple indices into arithmetic operations or
/// multiple GEPs with a single index.
@@ -464,10 +480,10 @@ private:
} // end anonymous namespace
-char SeparateConstOffsetFromGEP::ID = 0;
+char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(
- SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
@@ -476,12 +492,12 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(
- SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
- return new SeparateConstOffsetFromGEP(LowerGEP);
+ return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
}
bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -886,8 +902,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
// If we created a GEP with constant index, and the base is loop invariant,
// then we swap the first one with it, so LICM can move constant GEP out
// later.
- GetElementPtrInst *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
- GetElementPtrInst *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr);
+ auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
+ auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
swapGEPOperand(FirstGEP, SecondGEP);
@@ -962,8 +978,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (!NeedsExtraction)
return Changed;
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*GEP->getFunction());
+ TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
// If LowerGEP is disabled, before really splitting the GEP, check whether the
// backend supports the addressing mode we are about to produce. If no, this
@@ -1128,17 +1143,25 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
return true;
}
-bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+ SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
+ return Impl.run(F);
+}
+bool SeparateConstOffsetFromGEP::run(Function &F) {
if (DisableSeparateConstOffsetFromGEP)
return false;
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ DL = &F.getParent()->getDataLayout();
bool Changed = false;
for (BasicBlock &B : F) {
for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
@@ -1345,3 +1368,20 @@ void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
} else
First->setIsInBounds(true);
}
+
+PreservedAnalyses
+SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ auto GetTTI = [&AM](Function &F) -> TargetTransformInfo & {
+ return AM.getResult<TargetIRAnalysis>(F);
+ };
+ SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
+ if (!Impl.run(F))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
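+
+// For illustration (a sketch with made-up value names; the exact output
+// depends on the lowering mode): this pass splits a GEP whose index hides a
+// constant offset, e.g.
+//   %idx = add i64 %i, 5
+//   %p   = getelementptr inbounds i32, i32* %base, i64 %idx
+// into roughly
+//   %p0  = getelementptr i32, i32* %base, i64 %i
+//   %p   = getelementptr i32, i32* %p0, i64 5
+// so the constant part can be folded into the addressing mode or CSE'd.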
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 6c6d6ca9cf65..9d3c8d0f3739 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -26,16 +26,18 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
@@ -49,6 +51,7 @@
#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -93,6 +96,11 @@ static cl::opt<bool> UnswitchGuards(
"simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
cl::desc("If enabled, simple loop unswitching will also consider "
"llvm.experimental.guard intrinsics as unswitch candidates."));
+static cl::opt<bool> DropNonTrivialImplicitNullChecks(
+ "simple-loop-unswitch-drop-non-trivial-implicit-null-checks",
+ cl::init(false), cl::Hidden,
+ cl::desc("If enabled, drop make.implicit metadata in unswitched implicit "
+ "null checks to save time analyzing if we can keep it."));
/// Collect all of the loop invariant input values transitively used by the
/// homogeneous instruction graph from a given root.
@@ -684,11 +692,9 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// successor.
BasicBlock *CommonSuccBB = nullptr;
if (SI.getNumCases() > 0 &&
- std::all_of(std::next(SI.case_begin()), SI.case_end(),
- [&SI](const SwitchInst::CaseHandle &Case) {
- return Case.getCaseSuccessor() ==
- SI.case_begin()->getCaseSuccessor();
- }))
+ all_of(drop_begin(SI.cases()), [&SI](const SwitchInst::CaseHandle &Case) {
+ return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor();
+ }))
CommonSuccBB = SI.case_begin()->getCaseSuccessor();
if (!DefaultExitBB) {
// If we're not unswitching the default, we need it to match any cases to
@@ -847,12 +853,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
DTUpdates.push_back({DT.Delete, ParentBB, SplitUnswitchedPair.first});
DTUpdates.push_back({DT.Insert, OldPH, SplitUnswitchedPair.second});
}
- DT.applyUpdates(DTUpdates);
if (MSSAU) {
- MSSAU->applyUpdates(DTUpdates, DT);
+ MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
if (VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
+ } else {
+ DT.applyUpdates(DTUpdates);
}
assert(DT.verify(DominatorTree::VerificationLevel::Fast));
@@ -1133,9 +1140,22 @@ static BasicBlock *buildClonedLoopBlocks(
// Replace the cloned branch with an unconditional branch to the cloned
// unswitched successor.
auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
- ClonedParentBB->getTerminator()->eraseFromParent();
+ Instruction *ClonedTerminator = ClonedParentBB->getTerminator();
+  // Trivial simplification: if the terminator was a conditional branch or
+  // switch whose condition becomes dead after erasing it, delete the condition
+  // as well.
+ Value *ClonedConditionToErase = nullptr;
+ if (auto *BI = dyn_cast<BranchInst>(ClonedTerminator))
+ ClonedConditionToErase = BI->getCondition();
+ else if (auto *SI = dyn_cast<SwitchInst>(ClonedTerminator))
+ ClonedConditionToErase = SI->getCondition();
+
+ ClonedTerminator->eraseFromParent();
BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+ if (ClonedConditionToErase)
+ RecursivelyDeleteTriviallyDeadInstructions(ClonedConditionToErase, nullptr,
+ MSSAU);
+
// If there are duplicate entries in the PHI nodes because of multiple edges
// to the unswitched successor, we need to nuke all but one as we replaced it
// with a direct branch.
@@ -1194,7 +1214,7 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
LI.addTopLevelLoop(ClonedRootL);
AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
- if (OrigRootL.empty())
+ if (OrigRootL.isInnermost())
return ClonedRootL;
// If we have a nest, we can quickly clone the entire loop nest using an
@@ -2070,6 +2090,23 @@ static void unswitchNontrivialInvariants(
DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU);
}
+ // Drop metadata if we may break its semantics by moving this instr into the
+ // split block.
+ if (TI.getMetadata(LLVMContext::MD_make_implicit)) {
+ if (DropNonTrivialImplicitNullChecks)
+ // Do not spend time trying to understand if we can keep it, just drop it
+ // to save compile time.
+ TI.setMetadata(LLVMContext::MD_make_implicit, nullptr);
+ else {
+      // It is only legal to preserve make.implicit metadata if we are
+      // guaranteed not to reach an implicit null check after following this
+      // branch.
+ ICFLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(&L);
+ if (!SafetyInfo.isGuaranteedToExecute(TI, &DT, &L))
+ TI.setMetadata(LLVMContext::MD_make_implicit, nullptr);
+ }
+ }
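+  // For illustration (the IR below is a sketch, not taken from this pass):
+  // make.implicit marks a null-check branch such as
+  //   %c = icmp eq i32* %p, null
+  //   br i1 %c, label %is.null, label %not.null, !make.implicit !0
+  // as a candidate for later folding into a faulting load; keeping the
+  // metadata is only sound while the check is still guaranteed to be reached,
+  // which the isGuaranteedToExecute() call above verifies.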
+
// The stitching of the branched code back together depends on whether we're
// doing full unswitching or not with the exception that we always want to
// nuke the initial terminator placed in the split block.
@@ -2316,12 +2353,12 @@ static void unswitchNontrivialInvariants(
for (Loop *UpdatedL :
llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
UpdateLoop(*UpdatedL);
- if (!UpdatedL->getParentLoop())
+ if (UpdatedL->isOutermost())
OuterExitL = nullptr;
}
if (IsStillLoop) {
UpdateLoop(L);
- if (!L.getParentLoop())
+ if (L.isOutermost())
OuterExitL = nullptr;
}
@@ -2669,6 +2706,10 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
// (convergent, noduplicate, or cross-basic-block tokens).
// FIXME: We might be able to safely handle some of these in non-duplicated
// regions.
+ TargetTransformInfo::TargetCostKind CostKind =
+ L.getHeader()->getParent()->hasMinSize()
+ ? TargetTransformInfo::TCK_CodeSize
+ : TargetTransformInfo::TCK_SizeAndLatency;
int LoopCost = 0;
for (auto *BB : L.blocks()) {
int Cost = 0;
@@ -2682,7 +2723,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
if (CB->isConvergent() || CB->cannotDuplicate())
return false;
- Cost += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
+ Cost += TTI.getUserCost(&I, CostKind);
}
assert(Cost >= 0 && "Must not have negative costs!");
LoopCost += Cost;
@@ -2844,7 +2885,6 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
assert(L.isRecursivelyLCSSAForm(DT, LI) &&
"Loops must be in LCSSA form before unswitching.");
- bool Changed = false;
// Must be in loop simplified form: we need a preheader and dedicated exits.
if (!L.isLoopSimplifyForm())
@@ -2864,6 +2904,10 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
if (!NonTrivial && !EnableNonTrivialUnswitch)
return false;
+ // Skip non-trivial unswitching for optsize functions.
+ if (L.getHeader()->getParent()->hasOptSize())
+ return false;
+
// For non-trivial unswitching, because it often creates new loops, we rely on
// the pass manager to iterate on the loops rather than trying to immediately
// reach a fixed point. There is no substantial advantage to iterating
@@ -2876,7 +2920,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
return true;
// No other opportunities to unswitch.
- return Changed;
+ return false;
}
PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 2e459c9a64d4..38e7109ead57 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -25,21 +25,25 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
#include <utility>
using namespace llvm;
@@ -61,6 +65,10 @@ static cl::opt<bool> UserForwardSwitchCond(
"forward-switch-cond", cl::Hidden, cl::init(false),
cl::desc("Forward switch condition to phi ops (default = false)"));
+static cl::opt<bool> UserHoistCommonInsts(
+ "hoist-common-insts", cl::Hidden, cl::init(false),
+ cl::desc("hoist common instructions (default = false)"));
+
static cl::opt<bool> UserSinkCommonInsts(
"sink-common-insts", cl::Hidden, cl::init(false),
cl::desc("Sink common instructions (default = false)"));
@@ -70,14 +78,18 @@ STATISTIC(NumSimpl, "Number of blocks simplified");
/// If we have more than one empty (other than phi node) return block,
/// merge them together to promote recursive block merging.
-static bool mergeEmptyReturnBlocks(Function &F) {
+static bool mergeEmptyReturnBlocks(Function &F, DomTreeUpdater *DTU) {
bool Changed = false;
+ std::vector<DominatorTree::UpdateType> Updates;
+ SmallVector<BasicBlock *, 8> DeadBlocks;
+
BasicBlock *RetBlock = nullptr;
// Scan all the blocks in the function, looking for empty return blocks.
- for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
- BasicBlock &BB = *BBI++;
+ for (BasicBlock &BB : make_early_inc_range(F)) {
+ if (DTU && DTU->isBBPendingDeletion(&BB))
+ continue;
// Only look at return blocks.
ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
@@ -128,8 +140,18 @@ static bool mergeEmptyReturnBlocks(Function &F) {
if (Ret->getNumOperands() == 0 ||
Ret->getOperand(0) ==
cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+ // All predecessors of BB should now branch to RetBlock instead.
+ if (DTU) {
+ for (auto *Predecessor : predecessors(&BB)) {
+ // But, iff Predecessor already branches to RetBlock,
+ // don't (re-)add DomTree edge, because it already exists.
+ if (!is_contained(successors(Predecessor), RetBlock))
+ Updates.push_back({DominatorTree::Insert, Predecessor, RetBlock});
+ Updates.push_back({DominatorTree::Delete, Predecessor, &BB});
+ }
+ }
BB.replaceAllUsesWith(RetBlock);
- BB.eraseFromParent();
+ DeadBlocks.emplace_back(&BB);
continue;
}
@@ -153,6 +175,17 @@ static bool mergeEmptyReturnBlocks(Function &F) {
RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
BB.getTerminator()->eraseFromParent();
BranchInst::Create(RetBlock, &BB);
+ if (DTU)
+ Updates.push_back({DominatorTree::Insert, &BB, RetBlock});
+ }
+
+ if (DTU) {
+ DTU->applyUpdates(Updates);
+ for (auto *BB : DeadBlocks)
+ DTU->deleteBB(BB);
+ } else {
+ for (auto *BB : DeadBlocks)
+ BB->eraseFromParent();
}
return Changed;
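The update bookkeeping above reduces to a small pattern; a sketch with a hypothetical helper (DomTreeUpdater, applyUpdates and deleteBB are the real APIs used in the hunk; DomTreeUpdater.h, CFG.h and STLExtras.h assumed):

// Redirect every predecessor edge of From to To, recording the corresponding
// dominator-tree updates, then apply them in one batch and delete From via the
// updater (or directly when no updater is available).
static void replaceAndRemoveBlock(BasicBlock *From, BasicBlock *To,
                                  DomTreeUpdater *DTU) {
  std::vector<DominatorTree::UpdateType> Updates;
  if (DTU) {
    for (BasicBlock *Pred : predecessors(From)) {
      if (!is_contained(successors(Pred), To))
        Updates.push_back({DominatorTree::Insert, Pred, To});
      Updates.push_back({DominatorTree::Delete, Pred, From});
    }
  }
  From->replaceAllUsesWith(To);
  if (DTU) {
    DTU->applyUpdates(Updates);
    DTU->deleteBB(From); // deletion may be deferred by the updater
  } else {
    From->eraseFromParent();
  }
}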
@@ -161,22 +194,36 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+ DomTreeUpdater *DTU,
const SimplifyCFGOptions &Options) {
bool Changed = false;
bool LocalChange = true;
SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
FindFunctionBackedges(F, Edges);
- SmallPtrSet<BasicBlock *, 16> LoopHeaders;
+ SmallPtrSet<BasicBlock *, 16> UniqueLoopHeaders;
for (unsigned i = 0, e = Edges.size(); i != e; ++i)
- LoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second));
+ UniqueLoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second));
+
+ SmallVector<WeakVH, 16> LoopHeaders(UniqueLoopHeaders.begin(),
+ UniqueLoopHeaders.end());
while (LocalChange) {
LocalChange = false;
// Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (simplifyCFG(&*BBIt++, TTI, Options, &LoopHeaders)) {
+ BasicBlock &BB = *BBIt++;
+ if (DTU) {
+ assert(
+ !DTU->isBBPendingDeletion(&BB) &&
+ "Should not end up trying to simplify blocks marked for removal.");
+ // Make sure that the advanced iterator does not point at a block that is
+ // marked for removal; skip over all such blocks.
+ while (BBIt != F.end() && DTU->isBBPendingDeletion(&*BBIt))
+ ++BBIt;
+ }
+ if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders)) {
LocalChange = true;
++NumSimpl;
}
@@ -186,11 +233,14 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
return Changed;
}
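A condensed sketch of the header-tracking change above (hypothetical helper): backedge destinations are collected once, then stored as WeakVH so that headers erased during simplification become null handles instead of dangling pointers.

static SmallVector<WeakVH, 16> collectLoopHeaders(Function &F) {
  SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
  FindFunctionBackedges(F, Edges); // each edge's second block is a loop header
  SmallPtrSet<BasicBlock *, 16> Unique;
  for (const auto &Edge : Edges)
    Unique.insert(const_cast<BasicBlock *>(Edge.second));
  // WeakVH tracks deletion: if simplifyCFG erases a header, the handle nulls
  // itself rather than leaving a stale pointer behind.
  return SmallVector<WeakVH, 16>(Unique.begin(), Unique.end());
}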
-static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
- const SimplifyCFGOptions &Options) {
- bool EverChanged = removeUnreachableBlocks(F);
- EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, Options);
+static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI,
+ DominatorTree *DT,
+ const SimplifyCFGOptions &Options) {
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+
+ bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr);
+ EverChanged |= mergeEmptyReturnBlocks(F, DT ? &DTU : nullptr);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -200,43 +250,75 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
// iterate between the two optimizations. We structure the code like this to
// avoid rerunning iterativelySimplifyCFG if the second pass of
// removeUnreachableBlocks doesn't do anything.
- if (!removeUnreachableBlocks(F))
+ if (!removeUnreachableBlocks(F, DT ? &DTU : nullptr))
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, Options);
- EverChanged |= removeUnreachableBlocks(F);
+ EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options);
+ EverChanged |= removeUnreachableBlocks(F, DT ? &DTU : nullptr);
} while (EverChanged);
return true;
}
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+ DominatorTree *DT,
+ const SimplifyCFGOptions &Options) {
+ assert((!RequireAndPreserveDomTree ||
+ (DT && DT->verify(DominatorTree::VerificationLevel::Full))) &&
+ "Original domtree is invalid?");
+
+ bool Changed = simplifyFunctionCFGImpl(F, TTI, DT, Options);
+
+ assert((!RequireAndPreserveDomTree ||
+ (DT && DT->verify(DominatorTree::VerificationLevel::Full))) &&
+ "Failed to maintain validity of domtree!");
+
+ return Changed;
+}
+
// Command-line settings override compile-time settings.
-SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts) {
- Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences()
- ? UserBonusInstThreshold
- : Opts.BonusInstThreshold;
- Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences()
- ? UserForwardSwitchCond
- : Opts.ForwardSwitchCondToPhi;
- Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences()
- ? UserSwitchToLookup
- : Opts.ConvertSwitchToLookupTable;
- Options.NeedCanonicalLoop = UserKeepLoops.getNumOccurrences()
- ? UserKeepLoops
- : Opts.NeedCanonicalLoop;
- Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences()
- ? UserSinkCommonInsts
- : Opts.SinkCommonInsts;
+static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
+ if (UserBonusInstThreshold.getNumOccurrences())
+ Options.BonusInstThreshold = UserBonusInstThreshold;
+ if (UserForwardSwitchCond.getNumOccurrences())
+ Options.ForwardSwitchCondToPhi = UserForwardSwitchCond;
+ if (UserSwitchToLookup.getNumOccurrences())
+ Options.ConvertSwitchToLookupTable = UserSwitchToLookup;
+ if (UserKeepLoops.getNumOccurrences())
+ Options.NeedCanonicalLoop = UserKeepLoops;
+ if (UserHoistCommonInsts.getNumOccurrences())
+ Options.HoistCommonInsts = UserHoistCommonInsts;
+ if (UserSinkCommonInsts.getNumOccurrences())
+ Options.SinkCommonInsts = UserSinkCommonInsts;
+}
+
+SimplifyCFGPass::SimplifyCFGPass() : Options() {
+ applyCommandLineOverridesToOptions(Options);
+}
+
+SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts)
+ : Options(Opts) {
+ applyCommandLineOverridesToOptions(Options);
}
PreservedAnalyses SimplifyCFGPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
Options.AC = &AM.getResult<AssumptionAnalysis>(F);
- if (!simplifyFunctionCFG(F, TTI, Options))
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ if (F.hasFnAttribute(Attribute::OptForFuzzing)) {
+ Options.setSimplifyCondBranch(false).setFoldTwoEntryPHINode(false);
+ } else {
+ Options.setSimplifyCondBranch(true).setFoldTwoEntryPHINode(true);
+ }
+ if (!simplifyFunctionCFG(F, TTI, DT, Options))
return PreservedAnalyses::all();
PreservedAnalyses PA;
+ if (RequireAndPreserveDomTree)
+ PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
return PA;
}
@@ -247,33 +329,14 @@ struct CFGSimplifyPass : public FunctionPass {
SimplifyCFGOptions Options;
std::function<bool(const Function &)> PredicateFtor;
- CFGSimplifyPass(unsigned Threshold = 1, bool ForwardSwitchCond = false,
- bool ConvertSwitch = false, bool KeepLoops = true,
- bool SinkCommon = false,
+ CFGSimplifyPass(SimplifyCFGOptions Options_ = SimplifyCFGOptions(),
std::function<bool(const Function &)> Ftor = nullptr)
- : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ : FunctionPass(ID), Options(Options_), PredicateFtor(std::move(Ftor)) {
initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
// Check for command-line overrides of options for debug/customization.
- Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences()
- ? UserBonusInstThreshold
- : Threshold;
-
- Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences()
- ? UserForwardSwitchCond
- : ForwardSwitchCond;
-
- Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences()
- ? UserSwitchToLookup
- : ConvertSwitch;
-
- Options.NeedCanonicalLoop =
- UserKeepLoops.getNumOccurrences() ? UserKeepLoops : KeepLoops;
-
- Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences()
- ? UserSinkCommonInsts
- : SinkCommon;
+ applyCommandLineOverridesToOptions(Options);
}
bool runOnFunction(Function &F) override {
@@ -281,6 +344,9 @@ struct CFGSimplifyPass : public FunctionPass {
return false;
Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
if (F.hasFnAttribute(Attribute::OptForFuzzing)) {
Options.setSimplifyCondBranch(false)
.setFoldTwoEntryPHINode(false);
@@ -290,11 +356,15 @@ struct CFGSimplifyPass : public FunctionPass {
}
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return simplifyFunctionCFG(F, TTI, Options);
+ return simplifyFunctionCFG(F, TTI, DT, Options);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ if (RequireAndPreserveDomTree)
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (RequireAndPreserveDomTree)
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
@@ -305,15 +375,13 @@ INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
// Public interface to the CFGSimplification pass
FunctionPass *
-llvm::createCFGSimplificationPass(unsigned Threshold, bool ForwardSwitchCond,
- bool ConvertSwitch, bool KeepLoops,
- bool SinkCommon,
+llvm::createCFGSimplificationPass(SimplifyCFGOptions Options,
std::function<bool(const Function &)> Ftor) {
- return new CFGSimplifyPass(Threshold, ForwardSwitchCond, ConvertSwitch,
- KeepLoops, SinkCommon, std::move(Ftor));
+ return new CFGSimplifyPass(Options, std::move(Ftor));
}
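With the knob-by-knob constructor gone, callers pass a SimplifyCFGOptions bundle instead; a hedged usage sketch, assuming the builder-style setters that SimplifyCFGOptions.h provides in this import:

// Build the pass with explicit options; any -simplifycfg command-line flags
// still take precedence via applyCommandLineOverridesToOptions().
FunctionPass *buildSimplifyCFG() {
  return createCFGSimplificationPass(SimplifyCFGOptions()
                                         .bonusInstThreshold(1)
                                         .forwardSwitchCondToPhi(false)
                                         .convertSwitchToLookupTable(false)
                                         .needCanonicalLoops(true)
                                         .hoistCommonInsts(true)
                                         .sinkCommonInsts(true));
}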
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 48f289c8f17d..89cfbe384be4 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -32,31 +32,6 @@ using namespace llvm;
STATISTIC(NumSunk, "Number of instructions sunk");
STATISTIC(NumSinkIter, "Number of sinking iterations");
-/// AllUsesDominatedByBlock - Return true if all uses of the specified value
-/// occur in blocks dominated by the specified block.
-static bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB,
- DominatorTree &DT) {
- // Ignoring debug uses is necessary so debug info doesn't affect the code.
- // This may leave a referencing dbg_value in the original block, before
- // the definition of the vreg. Dwarf generator handles this although the
- // user might not get the right info at runtime.
- for (Use &U : Inst->uses()) {
- // Determine the block of the use.
- Instruction *UseInst = cast<Instruction>(U.getUser());
- BasicBlock *UseBlock = UseInst->getParent();
- if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
- // PHI nodes use the operand in the predecessor block, not the block with
- // the PHI.
- unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
- UseBlock = PN->getIncomingBlock(Num);
- }
- // Check that it dominates.
- if (!DT.dominates(BB, UseBlock))
- return false;
- }
- return true;
-}
-
static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
SmallPtrSetImpl<Instruction *> &Stores) {
@@ -97,11 +72,6 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
assert(Inst && "Instruction to be sunk is null");
assert(SuccToSinkTo && "Candidate sink target is null");
- // It is not possible to sink an instruction into its own block. This can
- // happen with loops.
- if (Inst->getParent() == SuccToSinkTo)
- return false;
-
// It's never legal to sink an instruction into a block which terminates in an
// EH-pad.
if (SuccToSinkTo->getTerminator()->isExceptionalTerminator())
@@ -129,9 +99,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
return false;
}
- // Finally, check that all the uses of the instruction are actually
- // dominated by the candidate
- return AllUsesDominatedByBlock(Inst, SuccToSinkTo, DT);
+ return true;
}
/// SinkInstruction - Determine whether it is safe to sink the specified machine
@@ -162,25 +130,37 @@ static bool SinkInstruction(Instruction *Inst,
// decide.
BasicBlock *SuccToSinkTo = nullptr;
- // Instructions can only be sunk if all their uses are in blocks
- // dominated by one of the successors.
- // Look at all the dominated blocks and see if we can sink it in one.
- DomTreeNode *DTN = DT.getNode(Inst->getParent());
- for (auto I = DTN->begin(), E = DTN->end(); I != E && SuccToSinkTo == nullptr;
- ++I) {
- BasicBlock *Candidate = (*I)->getBlock();
- // A node always immediate-dominates its children on the dominator
- // tree.
- if (IsAcceptableTarget(Inst, Candidate, DT, LI))
- SuccToSinkTo = Candidate;
+ // Find the nearest common dominator of all users as the candidate.
+ BasicBlock *BB = Inst->getParent();
+ for (Use &U : Inst->uses()) {
+ Instruction *UseInst = cast<Instruction>(U.getUser());
+ BasicBlock *UseBlock = UseInst->getParent();
+ // Don't worry about dead users.
+ if (!DT.isReachableFromEntry(UseBlock))
+ continue;
+ if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
+ // PHI nodes use the operand in the predecessor block, not the block with
+ // the PHI.
+ unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBlock = PN->getIncomingBlock(Num);
+ }
+ if (SuccToSinkTo)
+ SuccToSinkTo = DT.findNearestCommonDominator(SuccToSinkTo, UseBlock);
+ else
+ SuccToSinkTo = UseBlock;
+ // The current basic block needs to dominate the candidate.
+ if (!DT.dominates(BB, SuccToSinkTo))
+ return false;
}
- // If no suitable postdominator was found, look at all the successors and
- // decide which one we should sink to, if any.
- for (succ_iterator I = succ_begin(Inst->getParent()),
- E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
- if (IsAcceptableTarget(Inst, *I, DT, LI))
- SuccToSinkTo = *I;
+ if (SuccToSinkTo) {
+ // The nearest common dominator may be in a parent loop of BB, which may not
+ // be beneficial. Walk up the dominator tree from it until an acceptable
+ // target (or BB itself) is reached.
+ while (SuccToSinkTo != BB &&
+ !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+ SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
+ if (SuccToSinkTo == BB)
+ SuccToSinkTo = nullptr;
}
// If we couldn't find a block to sink to, ignore this instruction.
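For readers unfamiliar with the pass, a hand-written illustration in plain C++ (not LLVM API, not the pass's output) of what sinking buys: a value used only on one path is recomputed there, so the other path never pays for it. The rewritten loop above generalizes the target choice to the nearest common dominator of all uses.

int before(int A, int B, bool Cold) {
  int Expensive = A / B; // computed on every call
  if (Cold)
    return Expensive;
  return A + B;
}

int after(int A, int B, bool Cold) {
  if (Cold) {
    int Expensive = A / B; // sunk: only computed when the use is reached
    return Expensive;
  }
  return A + B;
}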
diff --git a/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index 8258b92a716d..9b18c945d950 100644
--- a/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -756,13 +756,10 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
// For each PHI node in this block, check whether there are immediate folding
// opportunities from speculation, and whether that speculation will be
// valid. This determines the set of safe PHIs to speculate.
- PNs.erase(llvm::remove_if(PNs,
- [&](PHINode *PN) {
- return !isSafeAndProfitableToSpeculateAroundPHI(
- *PN, CostSavingsMap, PotentialSpecSet,
- UnsafeSet, DT, TTI);
- }),
- PNs.end());
+ llvm::erase_if(PNs, [&](PHINode *PN) {
+ return !isSafeAndProfitableToSpeculateAroundPHI(
+ *PN, CostSavingsMap, PotentialSpecSet, UnsafeSet, DT, TTI);
+ });
// If no PHIs were profitable, skip.
if (PNs.empty()) {
LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index f82a2936c762..c78185f2a6ad 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -245,6 +245,13 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
case Instruction::FNeg:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::Freeze:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
default:
@@ -274,7 +281,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
for (const Value *V : U->operand_values()) {
if (const Instruction *I = dyn_cast<Instruction>(V)) {
- if (NotHoisted.count(I) > 0)
+ if (NotHoisted.contains(I))
return false;
}
}
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 9f82b1263ebd..577992ccb5f4 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -55,6 +55,7 @@
// - When (i' - i) is constant but i and i' are not, we could still perform
// SLSR.
+#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
@@ -95,8 +96,39 @@ static const unsigned UnknownAddressSpace =
namespace {
-class StraightLineStrengthReduce : public FunctionPass {
+class StraightLineStrengthReduceLegacyPass : public FunctionPass {
+ const DataLayout *DL = nullptr;
+
public:
+ static char ID;
+
+ StraightLineStrengthReduceLegacyPass() : FunctionPass(ID) {
+ initializeStraightLineStrengthReduceLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // We do not modify the shape of the CFG.
+ AU.setPreservesCFG();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+class StraightLineStrengthReduce {
+public:
+ StraightLineStrengthReduce(const DataLayout *DL, DominatorTree *DT,
+ ScalarEvolution *SE, TargetTransformInfo *TTI)
+ : DL(DL), DT(DT), SE(SE), TTI(TTI) {}
+
// SLSR candidate. Such a candidate must be in one of the forms described in
// the header comments.
struct Candidate {
@@ -144,26 +176,7 @@ public:
Candidate *Basis = nullptr;
};
- static char ID;
-
- StraightLineStrengthReduce() : FunctionPass(ID) {
- initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- // We do not modify the shape of the CFG.
- AU.setPreservesCFG();
- }
-
- bool doInitialization(Module &M) override {
- DL = &M.getDataLayout();
- return false;
- }
-
- bool runOnFunction(Function &F) override;
+ bool runOnFunction(Function &F);
private:
// Returns true if Basis is a basis for C, i.e., Basis dominates C and they
@@ -243,18 +256,18 @@ private:
} // end anonymous namespace
-char StraightLineStrengthReduce::ID = 0;
+char StraightLineStrengthReduceLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduceLegacyPass, "slsr",
"Straight line strength reduction", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+INITIALIZE_PASS_END(StraightLineStrengthReduceLegacyPass, "slsr",
"Straight line strength reduction", false, false)
FunctionPass *llvm::createStraightLineStrengthReducePass() {
- return new StraightLineStrengthReduce();
+ return new StraightLineStrengthReduceLegacyPass();
}
bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
@@ -272,9 +285,7 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
static bool isGEPFoldable(GetElementPtrInst *GEP,
const TargetTransformInfo *TTI) {
- SmallVector<const Value*, 4> Indices;
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
- Indices.push_back(*I);
+ SmallVector<const Value *, 4> Indices(GEP->indices());
return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
Indices) == TargetTransformInfo::TCC_Free;
}
@@ -704,13 +715,17 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
UnlinkedInstructions.push_back(C.Ins);
}
-bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return StraightLineStrengthReduce(DL, DT, SE, TTI).runOnFunction(F);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
// Traverse the dominator tree in the depth-first order. This order makes sure
// all bases of a candidate are in Candidates when we process it.
for (const auto Node : depth_first(DT))
@@ -740,3 +755,25 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
UnlinkedInstructions.clear();
return Ret;
}
+
+namespace llvm {
+
+PreservedAnalyses
+StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DataLayout *DL = &F.getParent()->getDataLayout();
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ if (!StraightLineStrengthReduce(DL, DT, SE, TTI).runOnFunction(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<TargetIRAnalysis>();
+ return PA;
+}
+
+} // namespace llvm
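As a companion to the refactoring above, a hand-written illustration in plain C++ (not the pass's output) of the strength-reduction idea the file implements: later members of a candidate chain are rewritten relative to a dominating basis plus a constant bump.

#include <cstdint>

int64_t sumBefore(const int64_t *Base, int64_t I, int64_t S) {
  return Base[I * S] + Base[(I + 1) * S] + Base[(I + 2) * S];
}

int64_t sumAfter(const int64_t *Base, int64_t I, int64_t S) {
  const int64_t *P0 = Base + I * S; // basis
  const int64_t *P1 = P0 + S;       // basis + bump
  const int64_t *P2 = P1 + S;       // previous + bump
  return *P0 + *P1 + *P2;
}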
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index c20e57b02c1a..3e15cad5f3f3 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/StructurizeCFG.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SCCIterator.h"
@@ -28,6 +29,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
@@ -55,7 +57,7 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "structurizecfg"
// The name for newly created blocks.
-static const char *const FlowBlockName = "Flow";
+const char FlowBlockName[] = "Flow";
namespace {
@@ -233,9 +235,8 @@ public:
/// while the true side continues the general flow. So the loop condition
/// consists of a network of PHI nodes where the true incoming values express
/// breaks and the false values express continue states.
-class StructurizeCFG : public RegionPass {
- bool SkipUniformRegions;
+class StructurizeCFG {
Type *Boolean;
ConstantInt *BoolTrue;
ConstantInt *BoolFalse;
@@ -244,7 +245,7 @@ class StructurizeCFG : public RegionPass {
Function *Func;
Region *ParentRegion;
- LegacyDivergenceAnalysis *DA;
+ LegacyDivergenceAnalysis *DA = nullptr;
DominatorTree *DT;
SmallVector<RegionNode *, 8> Order;
@@ -309,19 +310,35 @@ class StructurizeCFG : public RegionPass {
void rebuildSSA();
public:
+ void init(Region *R);
+ bool run(Region *R, DominatorTree *DT);
+ bool makeUniformRegion(Region *R, LegacyDivergenceAnalysis *DA);
+};
+
+class StructurizeCFGLegacyPass : public RegionPass {
+ bool SkipUniformRegions;
+
+public:
static char ID;
- explicit StructurizeCFG(bool SkipUniformRegions_ = false)
- : RegionPass(ID),
- SkipUniformRegions(SkipUniformRegions_) {
+ explicit StructurizeCFGLegacyPass(bool SkipUniformRegions_ = false)
+ : RegionPass(ID), SkipUniformRegions(SkipUniformRegions_) {
if (ForceSkipUniformRegions.getNumOccurrences())
SkipUniformRegions = ForceSkipUniformRegions.getValue();
- initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
+ initializeStructurizeCFGLegacyPassPass(*PassRegistry::getPassRegistry());
}
- bool doInitialization(Region *R, RGPassManager &RGM) override;
-
- bool runOnRegion(Region *R, RGPassManager &RGM) override;
+ bool runOnRegion(Region *R, RGPassManager &RGM) override {
+ StructurizeCFG SCFG;
+ SCFG.init(R);
+ if (SkipUniformRegions) {
+ LegacyDivergenceAnalysis *DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ if (SCFG.makeUniformRegion(R, DA))
+ return false;
+ }
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return SCFG.run(R, DT);
+ }
StringRef getPassName() const override { return "Structurize control flow"; }
@@ -338,28 +355,16 @@ public:
} // end anonymous namespace
-char StructurizeCFG::ID = 0;
+char StructurizeCFGLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
- false, false)
+INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
+ "Structurize the CFG", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
-INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
- false, false)
-
-/// Initialize the types and constants used in the pass
-bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
- LLVMContext &Context = R->getEntry()->getContext();
-
- Boolean = Type::getInt1Ty(Context);
- BoolTrue = ConstantInt::getTrue(Context);
- BoolFalse = ConstantInt::getFalse(Context);
- BoolUndef = UndefValue::get(Boolean);
-
- return false;
-}
+INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
+ "Structurize the CFG", false, false)
/// Build up the general order of nodes, by performing a topological sort of the
/// parent region's nodes, while ensuring that there is no outer cycle node
@@ -1003,48 +1008,62 @@ static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
return SubRegionsAreUniform || (ConditionalDirectChildren <= 1);
}
-/// Run the transformation for each region found
-bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+void StructurizeCFG::init(Region *R) {
+ LLVMContext &Context = R->getEntry()->getContext();
+
+ Boolean = Type::getInt1Ty(Context);
+ BoolTrue = ConstantInt::getTrue(Context);
+ BoolFalse = ConstantInt::getFalse(Context);
+ BoolUndef = UndefValue::get(Boolean);
+
+ this->DA = nullptr;
+}
+
+bool StructurizeCFG::makeUniformRegion(Region *R,
+ LegacyDivergenceAnalysis *DA) {
if (R->isTopLevelRegion())
return false;
- DA = nullptr;
-
- if (SkipUniformRegions) {
- // TODO: We could probably be smarter here with how we handle sub-regions.
- // We currently rely on the fact that metadata is set by earlier invocations
- // of the pass on sub-regions, and that this metadata doesn't get lost --
- // but we shouldn't rely on metadata for correctness!
- unsigned UniformMDKindID =
- R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
-
- if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
- LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
- << '\n');
-
- // Mark all direct child block terminators as having been treated as
- // uniform. To account for a possible future in which non-uniform
- // sub-regions are treated more cleverly, indirect children are not
- // marked as uniform.
- MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {});
- for (RegionNode *E : R->elements()) {
- if (E->isSubRegion())
- continue;
-
- if (Instruction *Term = E->getEntry()->getTerminator())
- Term->setMetadata(UniformMDKindID, MD);
- }
+ this->DA = DA;
+ // TODO: We could probably be smarter here with how we handle sub-regions.
+ // We currently rely on the fact that metadata is set by earlier invocations
+ // of the pass on sub-regions, and that this metadata doesn't get lost --
+ // but we shouldn't rely on metadata for correctness!
+ unsigned UniformMDKindID =
+ R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
+
+ if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
+ LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
+ << '\n');
+
+ // Mark all direct child block terminators as having been treated as
+ // uniform. To account for a possible future in which non-uniform
+ // sub-regions are treated more cleverly, indirect children are not
+ // marked as uniform.
+ MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {});
+ for (RegionNode *E : R->elements()) {
+ if (E->isSubRegion())
+ continue;
- return false;
+ if (Instruction *Term = E->getEntry()->getTerminator())
+ Term->setMetadata(UniformMDKindID, MD);
}
+
+ return true;
}
+ return false;
+}
+
+/// Run the transformation for each region found
+bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
+ if (R->isTopLevelRegion())
+ return false;
+
+ this->DT = DT;
Func = R->getEntry()->getParent();
ParentRegion = R;
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
orderNodes();
collectInfos();
createFlow();
@@ -1069,5 +1088,33 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
}
Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
- return new StructurizeCFG(SkipUniformRegions);
+ return new StructurizeCFGLegacyPass(SkipUniformRegions);
+}
+
+static void addRegionIntoQueue(Region &R, std::vector<Region *> &Regions) {
+ Regions.push_back(&R);
+ for (const auto &E : R)
+ addRegionIntoQueue(*E, Regions);
+}
+
+PreservedAnalyses StructurizeCFGPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ bool Changed = false;
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto &RI = AM.getResult<RegionInfoAnalysis>(F);
+ std::vector<Region *> Regions;
+ addRegionIntoQueue(*RI.getTopLevelRegion(), Regions);
+ while (!Regions.empty()) {
+ Region *R = Regions.back();
+ StructurizeCFG SCFG;
+ SCFG.init(R);
+ Changed |= SCFG.run(R, DT);
+ Regions.pop_back();
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
}
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 5bb1d54d7d12..9e7cccc88412 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -92,7 +92,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
- // Because of PR962, we don't TRE dynamic allocas.
+ // FIXME: The code generator produces really bad code when an 'escaping
+ // alloca' is changed from being a static alloca to being a dynamic alloca.
+ // Until this is resolved, disable this transformation if that would ever
+ // happen. This bug is PR962.
return llvm::all_of(instructions(F), [](Instruction &I) {
auto *AI = dyn_cast<AllocaInst>(&I);
return !AI || AI->isStaticAlloca();
@@ -237,7 +240,11 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
Escaped = ESCAPED;
CallInst *CI = dyn_cast<CallInst>(&I);
- if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I))
+ // A PseudoProbeInst has the IntrInaccessibleMemOnly tag, hence it is
+ // considered to access memory and will be marked as a tail call if we
+ // don't bail out here.
+ if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
+ isa<PseudoProbeInst>(&I))
continue;
bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
@@ -279,7 +286,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
}
}
- for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
+ for (auto *SuccBB : successors(BB)) {
auto &State = Visited[SuccBB];
if (State < Escaped) {
State = Escaped;
@@ -419,7 +426,7 @@ class TailRecursionEliminator {
DomTreeUpdater &DTU)
: F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
- CallInst *findTRECandidate(Instruction *TI,
+ CallInst *findTRECandidate(BasicBlock *BB,
bool CannotTailCallElimCallsMarkedTail);
void createTailRecurseLoopHeader(CallInst *CI);
@@ -428,14 +435,10 @@ class TailRecursionEliminator {
bool eliminateCall(CallInst *CI);
- bool foldReturnAndProcessPred(ReturnInst *Ret,
- bool CannotTailCallElimCallsMarkedTail);
-
- bool processReturningBlock(ReturnInst *Ret,
- bool CannotTailCallElimCallsMarkedTail);
-
void cleanupAndFinalize();
+ bool processBlock(BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail);
+
public:
static bool eliminate(Function &F, const TargetTransformInfo *TTI,
AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
@@ -444,8 +447,8 @@ public:
} // namespace
CallInst *TailRecursionEliminator::findTRECandidate(
- Instruction *TI, bool CannotTailCallElimCallsMarkedTail) {
- BasicBlock *BB = TI->getParent();
+ BasicBlock *BB, bool CannotTailCallElimCallsMarkedTail) {
+ Instruction *TI = BB->getTerminator();
if (&BB->front() == TI) // Make sure there is something before the terminator.
return nullptr;
@@ -672,63 +675,6 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
return true;
}
-bool TailRecursionEliminator::foldReturnAndProcessPred(
- ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) {
- BasicBlock *BB = Ret->getParent();
-
- bool Change = false;
-
- // Make sure this block is a trivial return block.
- assert(BB->getFirstNonPHIOrDbg() == Ret &&
- "Trying to fold non-trivial return block");
-
- // If the return block contains nothing but the return and PHI's,
- // there might be an opportunity to duplicate the return in its
- // predecessors and perform TRE there. Look for predecessors that end
- // in unconditional branch and recursive call(s).
- SmallVector<BranchInst*, 8> UncondBranchPreds;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *Pred = *PI;
- Instruction *PTI = Pred->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
- if (BI->isUnconditional())
- UncondBranchPreds.push_back(BI);
- }
-
- while (!UncondBranchPreds.empty()) {
- BranchInst *BI = UncondBranchPreds.pop_back_val();
- BasicBlock *Pred = BI->getParent();
- if (CallInst *CI =
- findTRECandidate(BI, CannotTailCallElimCallsMarkedTail)) {
- LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
- FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU);
-
- // Cleanup: if all predecessors of BB have been eliminated by
- // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
- // because the ret instruction in there is still using a value which
- // eliminateRecursiveTailCall will attempt to remove.
- if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
- DTU.deleteBB(BB);
-
- eliminateCall(CI);
- ++NumRetDuped;
- Change = true;
- }
- }
-
- return Change;
-}
-
-bool TailRecursionEliminator::processReturningBlock(
- ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) {
- CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail);
- if (!CI)
- return false;
-
- return eliminateCall(CI);
-}
-
void TailRecursionEliminator::cleanupAndFinalize() {
// If we eliminated any tail recursions, it's possible that we inserted some
// silly PHI nodes which just merge an initial value (the incoming operand)
@@ -801,6 +747,50 @@ void TailRecursionEliminator::cleanupAndFinalize() {
}
}
+bool TailRecursionEliminator::processBlock(
+ BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail) {
+ Instruction *TI = BB.getTerminator();
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional())
+ return false;
+
+ BasicBlock *Succ = BI->getSuccessor(0);
+ ReturnInst *Ret = dyn_cast<ReturnInst>(Succ->getFirstNonPHIOrDbg(true));
+
+ if (!Ret)
+ return false;
+
+ CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+
+ if (!CI)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *Succ
+ << "INTO UNCOND BRANCH PRED: " << BB);
+ FoldReturnIntoUncondBranch(Ret, Succ, &BB, &DTU);
+ ++NumRetDuped;
+
+ // If all predecessors of Succ have been eliminated by
+ // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
+ // because the ret instruction in there is still using a value which
+ // eliminateCall will attempt to remove. This block can only contain
+ // instructions that can't have uses; therefore it is safe to remove.
+ if (pred_empty(Succ))
+ DTU.deleteBB(Succ);
+
+ eliminateCall(CI);
+ return true;
+ } else if (isa<ReturnInst>(TI)) {
+ CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+
+ if (CI)
+ return eliminateCall(CI);
+ }
+
+ return false;
+}
+
bool TailRecursionEliminator::eliminate(Function &F,
const TargetTransformInfo *TTI,
AliasAnalysis *AA,
@@ -825,23 +815,11 @@ bool TailRecursionEliminator::eliminate(Function &F,
// TRE would deallocate variable sized allocas, TRE doesn't).
bool CanTRETailMarkedCall = canTRE(F);
+ // Change any tail recursive calls to loops.
TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
- // Change any tail recursive calls to loops.
- //
- // FIXME: The code generator produces really bad code when an 'escaping
- // alloca' is changed from being a static alloca to being a dynamic alloca.
- // Until this is resolved, disable this transformation if that would ever
- // happen. This bug is PR962.
- for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
- BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
- if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
- bool Change = TRE.processReturningBlock(Ret, !CanTRETailMarkedCall);
- if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
- Change = TRE.foldReturnAndProcessPred(Ret, !CanTRETailMarkedCall);
- MadeChange |= Change;
- }
- }
+ for (BasicBlock &BB : F)
+ MadeChange |= TRE.processBlock(BB, !CanTRETailMarkedCall);
TRE.cleanupAndFinalize();
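To make the per-block driver above concrete, a hand-written illustration in plain C++ (not produced by the pass) of the transformation TRE performs on an accumulating self-recursive call:

static long factorialBefore(long N, long Acc) {
  if (N <= 1)
    return Acc;
  return factorialBefore(N - 1, Acc * N); // tail-recursive call
}

static long factorialAfter(long N, long Acc) {
  for (;;) { // recursion replaced by a loop over the mutated arguments
    if (N <= 1)
      return Acc;
    Acc *= N;
    --N;
  }
}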
diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
index 7c81e6352dec..80a7d3a43ad6 100644
--- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -48,12 +48,12 @@ static void warnAboutLeftoverTransformations(Loop *L,
if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
- Optional<int> VectorizeWidth =
- getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width");
+ Optional<ElementCount> VectorizeWidth =
+ getOptionalElementCountLoopAttribute(L);
Optional<int> InterleaveCount =
getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
- if (VectorizeWidth.getValueOr(0) != 1)
+ if (!VectorizeWidth || VectorizeWidth->isVector())
ORE->emit(
DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
"FailedRequestedVectorization",