Diffstat (limited to 'contrib/llvm/lib/Transforms')
154 files changed, 19960 insertions, 12172 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 4762011d63d8..0e05129b5261 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -34,8 +34,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
@@ -63,7 +66,8 @@ namespace {
   ///
   struct ArgPromotion : public CallGraphSCCPass {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
       CallGraphSCCPass::getAnalysisUsage(AU);
     }
 
@@ -81,7 +85,8 @@ namespace {
     bool isDenselyPacked(Type *type, const DataLayout &DL);
     bool canPaddingBeAccessed(Argument *Arg);
    CallGraphNode *PromoteArguments(CallGraphNode *CGN);
-    bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
+    bool isSafeToPromoteArgument(Argument *Arg, bool isByVal,
+                                 AAResults &AAR) const;
     CallGraphNode *DoPromotion(Function *F,
                                SmallPtrSetImpl<Argument*> &ArgsToPromote,
                                SmallPtrSetImpl<Argument*> &ByValArgsToTransform);
@@ -90,15 +95,15 @@ namespace {
     bool doInitialization(CallGraph &CG) override;
     /// The maximum number of elements to expand, or 0 for unlimited.
     unsigned maxElements;
-    DenseMap<const Function *, DISubprogram *> FunctionDIs;
   };
 }
 
 char ArgPromotion::ID = 0;
 INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
@@ -217,9 +222,9 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
 
   // First check: see if there are any pointer arguments!  If not, quick exit.
   SmallVector<Argument*, 16> PointerArgs;
-  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
-    if (I->getType()->isPointerTy())
-      PointerArgs.push_back(I);
+  for (Argument &I : F->args())
+    if (I.getType()->isPointerTy())
+      PointerArgs.push_back(&I);
   if (PointerArgs.empty()) return nullptr;
 
   // Second check: make sure that all callers are direct callers.  We can't
@@ -237,6 +242,14 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
 
   const DataLayout &DL = F->getParent()->getDataLayout();
 
+  // We need to manually construct BasicAA directly in order to disable its use
+  // of other function analyses.
+  BasicAAResult BAR(createLegacyPMBasicAAResult(*this, *F));
+
+  // Construct our own AA results for this function. We do this manually to
+  // work around the limitations of the legacy pass manager.
+  AAResults AAR(createLegacyPMAAResults(*this, *F, BAR));
+
   // Check to see which arguments are promotable. If an argument is promotable,
   // add it to ArgsToPromote.
   SmallPtrSet<Argument*, 8> ArgsToPromote;
@@ -281,8 +294,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
 
         // If all the elements are single-value types, we can promote it.
         bool AllSimple = true;
-        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-          if (!STy->getElementType(i)->isSingleValueType()) {
+        for (const auto *EltTy : STy->elements()) {
+          if (!EltTy->isSingleValueType()) {
             AllSimple = false;
             break;
           }
@@ -303,8 +316,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
     if (isSelfRecursive) {
       if (StructType *STy = dyn_cast<StructType>(AgTy)) {
         bool RecursiveType = false;
-        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-          if (STy->getElementType(i) == PtrArg->getType()) {
+        for (const auto *EltTy : STy->elements()) {
+          if (EltTy == PtrArg->getType()) {
             RecursiveType = true;
             break;
           }
@@ -315,7 +328,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
     }
 
     // Otherwise, see if we can promote the pointer to its value.
-    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr()))
+    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR))
       ArgsToPromote.insert(PtrArg);
   }
 
@@ -416,7 +429,8 @@ static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
 /// elements of the aggregate in order to avoid exploding the number of
 /// arguments passed in.
 bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
-                                           bool isByValOrInAlloca) const {
+                                           bool isByValOrInAlloca,
+                                           AAResults &AAR) const {
   typedef std::set<IndicesVector> GEPIndicesSet;
 
   // Quick exit for unused arguments
@@ -453,12 +467,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
 
   // First, iterate the entry block and mark loads of (geps of) arguments as
   // safe.
-  BasicBlock *EntryBlock = Arg->getParent()->begin();
+  BasicBlock &EntryBlock = Arg->getParent()->front();
   // Declare this here so we can reuse it
   IndicesVector Indices;
-  for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end();
-       I != E; ++I)
-    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+  for (Instruction &I : EntryBlock)
+    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
       Value *V = LI->getPointerOperand();
       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
         V = GEP->getPointerOperand();
@@ -501,12 +514,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
       if (GEP->use_empty()) {
         // Dead GEP's cause trouble later.  Just remove them if we run into
         // them.
-        getAnalysis<AliasAnalysis>().deleteValue(GEP);
         GEP->eraseFromParent();
         // TODO: This runs the above loop over and over again for dead GEPs
         // Couldn't we just do increment the UI iterator earlier and erase the
         // use?
-        return isSafeToPromoteArgument(Arg, isByValOrInAlloca);
+        return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR);
       }
 
       // Ensure that all of the indices are constants.
@@ -563,8 +575,6 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
   // blocks we know to be transparent to the load.
   SmallPtrSet<BasicBlock*, 16> TranspBlocks;
 
-  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-
   for (unsigned i = 0, e = Loads.size(); i != e; ++i) {
     // Check to see if the load is invalidated from the start of the block to
     // the load itself.
@@ -572,8 +582,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
     BasicBlock *BB = Load->getParent();
 
     MemoryLocation Loc = MemoryLocation::get(Load);
-    if (AA.canInstructionRangeModRef(BB->front(), *Load, Loc,
-        AliasAnalysis::Mod))
+    if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
       return false;  // Pointer is invalidated!
 
     // Now check every path from the entry block to the load for transparency.
@@ -581,7 +590,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
     // loading block.
     for (BasicBlock *P : predecessors(BB)) {
       for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
-        if (AA.canBasicBlockModify(*TranspBB, Loc))
+        if (AAR.canBasicBlockModify(*TranspBB, Loc))
           return false;
     }
   }
@@ -637,13 +646,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   unsigned ArgIndex = 1;
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
        ++I, ++ArgIndex) {
-    if (ByValArgsToTransform.count(I)) {
+    if (ByValArgsToTransform.count(&*I)) {
       // Simple byval argument? Just add all the struct element types.
       Type *AgTy = cast<PointerType>(I->getType())->getElementType();
       StructType *STy = cast<StructType>(AgTy);
       Params.insert(Params.end(), STy->element_begin(), STy->element_end());
       ++NumByValArgsPromoted;
-    } else if (!ArgsToPromote.count(I)) {
+    } else if (!ArgsToPromote.count(&*I)) {
       // Unchanged argument
       Params.push_back(I->getType());
       AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
@@ -661,7 +670,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
 
       // In this table, we will track which indices are loaded from the argument
       // (where direct loads are tracked as no indices).
-      ScalarizeTable &ArgIndices = ScalarizedElements[I];
+      ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
       for (User *U : I->users()) {
         Instruction *UI = cast<Instruction>(U);
         Type *SrcTy;
@@ -687,7 +696,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         else
           // Take any load, we will use it only to update Alias Analysis
           OrigLoad = cast<LoadInst>(UI->user_back());
-        OriginalLoads[std::make_pair(I, Indices)] = OrigLoad;
+        OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
       }
 
       // Add a parameter to the function for each element passed in.
@@ -722,15 +731,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   NF->copyAttributesFrom(F);
 
   // Patch the pointer to LLVM function in debug info descriptor.
-  auto DI = FunctionDIs.find(F);
-  if (DI != FunctionDIs.end()) {
-    DISubprogram *SP = DI->second;
-    SP->replaceFunction(NF);
-    // Ensure the map is updated so it can be reused on subsequent argument
-    // promotions of the same function.
-    FunctionDIs.erase(DI);
-    FunctionDIs[NF] = SP;
-  }
+  NF->setSubprogram(F->getSubprogram());
+  F->setSubprogram(nullptr);
 
   DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
                << "From: " << *F);
@@ -740,13 +742,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));
   AttributesVec.clear();
 
-  F->getParent()->getFunctionList().insert(F, NF);
+  F->getParent()->getFunctionList().insert(F->getIterator(), NF);
   NF->takeName(F);
 
-  // Get the alias analysis information that we need to update to reflect our
-  // changes.
-  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-
   // Get the callgraph information that we need to update to reflect our
   // changes.
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
@@ -775,7 +773,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     ArgIndex = 1;
     for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
          I != E; ++I, ++AI, ++ArgIndex)
-      if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+      if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
         Args.push_back(*AI);          // Unmodified argument
 
         if (CallPAL.hasAttributes(ArgIndex)) {
@@ -783,7 +781,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
           AttributesVec.
             push_back(AttributeSet::get(F->getContext(), Args.size(), B));
         }
-      } else if (ByValArgsToTransform.count(I)) {
+      } else if (ByValArgsToTransform.count(&*I)) {
         // Emit a GEP and load for each element of the struct.
         Type *AgTy = cast<PointerType>(I->getType())->getElementType();
         StructType *STy = cast<StructType>(AgTy);
@@ -798,14 +796,14 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         }
       } else if (!I->use_empty()) {
         // Non-dead argument: insert GEPs and loads as appropriate.
-        ScalarizeTable &ArgIndices = ScalarizedElements[I];
+        ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
         // Store the Value* version of the indices in here, but declare it now
         // for reuse.
         std::vector<Value*> Ops;
         for (ScalarizeTable::iterator SI = ArgIndices.begin(),
              E = ArgIndices.end(); SI != E; ++SI) {
           Value *V = *AI;
-          LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, SI->second)];
+          LoadInst *OrigLoad = OriginalLoads[std::make_pair(&*I, SI->second)];
           if (!SI->second.empty()) {
             Ops.reserve(SI->second.size());
             Type *ElTy = V->getType();
@@ -873,10 +871,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     Args.clear();
     AttributesVec.clear();
 
-    // Update the alias analysis implementation to know that we are replacing
-    // the old call with a new one.
-    AA.replaceWithNewValue(Call, New);
-
     // Update the callgraph to know that the callsite has been transformed.
     CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];
     CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN);
@@ -901,20 +895,19 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   //
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
        I2 = NF->arg_begin(); I != E; ++I) {
-    if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+    if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
       // If this is an unmodified argument, move the name and users over to the
       // new version.
-      I->replaceAllUsesWith(I2);
-      I2->takeName(I);
-      AA.replaceWithNewValue(I, I2);
+      I->replaceAllUsesWith(&*I2);
+      I2->takeName(&*I);
       ++I2;
       continue;
     }
 
-    if (ByValArgsToTransform.count(I)) {
+    if (ByValArgsToTransform.count(&*I)) {
       // In the callee, we create an alloca, and store each of the new incoming
       // arguments into the alloca.
-      Instruction *InsertPt = NF->begin()->begin();
+      Instruction *InsertPt = &NF->begin()->front();
 
       // Just add all the struct element types.
       Type *AgTy = cast<PointerType>(I->getType())->getElementType();
@@ -929,13 +922,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
             AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
             InsertPt);
         I2->setName(I->getName()+"."+Twine(i));
-        new StoreInst(I2++, Idx, InsertPt);
+        new StoreInst(&*I2++, Idx, InsertPt);
       }
 
       // Anything that used the arg should now use the alloca.
       I->replaceAllUsesWith(TheAlloca);
-      TheAlloca->takeName(I);
-      AA.replaceWithNewValue(I, TheAlloca);
+      TheAlloca->takeName(&*I);
 
       // If the alloca is used in a call, we must clear the tail flag since
       // the callee now uses an alloca from the caller.
@@ -948,23 +940,20 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       continue;
     }
 
-    if (I->use_empty()) {
-      AA.deleteValue(I);
+    if (I->use_empty())
       continue;
-    }
 
     // Otherwise, if we promoted this argument, then all users are load
     // instructions (or GEPs with only load users), and all loads should be
     // using the new argument that we added.
-    ScalarizeTable &ArgIndices = ScalarizedElements[I];
+    ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
 
     while (!I->use_empty()) {
       if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
         assert(ArgIndices.begin()->second.empty() &&
                "Load element should sort to front!");
         I2->setName(I->getName()+".val");
-        LI->replaceAllUsesWith(I2);
-        AA.replaceWithNewValue(LI, I2);
+        LI->replaceAllUsesWith(&*I2);
         LI->eraseFromParent();
         DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
               << "' in function '" << F->getName() << "'\n");
@@ -1000,11 +989,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         // the argument specified by ArgNo.
         while (!GEP->use_empty()) {
           LoadInst *L = cast<LoadInst>(GEP->user_back());
-          L->replaceAllUsesWith(TheArg);
-          AA.replaceWithNewValue(L, TheArg);
+          L->replaceAllUsesWith(&*TheArg);
           L->eraseFromParent();
         }
-        AA.deleteValue(GEP);
         GEP->eraseFromParent();
       }
     }
@@ -1013,10 +1000,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     std::advance(I2, ArgIndices.size());
   }
 
-  // Tell the alias analysis that the old function is about to disappear.
-  AA.replaceWithNewValue(F, NF);
-
-
   NF_CGN->stealCalledFunctionsFrom(CG[F]);
 
   // Now that the old function is dead, delete it.  If there is a dangling
@@ -1032,6 +1015,5 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
 }
 
 bool ArgPromotion::doInitialization(CallGraph &CG) {
-  FunctionDIs = makeSubprogramMap(CG.getModule());
   return CallGraphSCCPass::doInitialization(CG);
 }
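For orientation, a minimal LLVM IR sketch (illustrative only, not taken from this diff) of the rewrite argpromotion performs when an internal function's pointer argument is only ever loaded:

; Before: %p is only loaded, so it is safe to promote.
define internal i32 @callee(i32* %p) {
entry:
  %v = load i32, i32* %p
  ret i32 %v
}

; After: each caller performs the load and passes the value directly.
define internal i32 @callee(i32 %p.val) {
entry:
  ret i32 %p.val
}

The hand-built AAResults above is what isSafeToPromoteArgument consults to prove that no store between the entry block and each load can invalidate the promoted value.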
diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index 8ce7646621ff..0aa49d6fde01 100644
--- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -119,7 +119,7 @@ bool ConstantMerge::runOnModule(Module &M) {
   // First: Find the canonical constants others will be merged with.
   for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
        GVI != E; ) {
-    GlobalVariable *GV = GVI++;
+    GlobalVariable *GV = &*GVI++;
 
     // If this GV is dead, remove it.
     GV->removeDeadConstantUsers();
@@ -160,7 +160,7 @@ bool ConstantMerge::runOnModule(Module &M) {
   // invalidating the Constant* pointers in CMap.
   for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
        GVI != E; ) {
-    GlobalVariable *GV = GVI++;
+    GlobalVariable *GV = &*GVI++;
 
     // Only process constants with initializers in the default address space.
     if (!GV->isConstant() || !GV->hasDefinitiveInitializer() ||
diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
new file mode 100644
index 000000000000..5bbb7513005c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -0,0 +1,166 @@
+//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass exports all llvm.bitset's found in the module in the form of a
+// __cfi_check function, which can be used to verify cross-DSO call targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "cross-dso-cfi"
+
+STATISTIC(TypeIds, "Number of unique type identifiers");
+
+namespace {
+
+struct CrossDSOCFI : public ModulePass {
+  static char ID;
+  CrossDSOCFI() : ModulePass(ID) {
+    initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry());
+  }
+
+  Module *M;
+  MDNode *VeryLikelyWeights;
+
+  ConstantInt *extractBitSetTypeId(MDNode *MD);
+  void buildCFICheck();
+
+  bool doInitialization(Module &M) override;
+  bool runOnModule(Module &M) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false,
+                      false)
+INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false)
+char CrossDSOCFI::ID = 0;
+
+ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; }
+
+bool CrossDSOCFI::doInitialization(Module &Mod) {
+  M = &Mod;
+  VeryLikelyWeights =
+      MDBuilder(M->getContext()).createBranchWeights((1U << 20) - 1, 1);
+
+  return false;
+}
+
+/// extractBitSetTypeId - Extracts TypeId from a hash-based bitset MDNode.
+ConstantInt *CrossDSOCFI::extractBitSetTypeId(MDNode *MD) {
+  // This check excludes vtables for classes inside anonymous namespaces.
+  auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(0));
+  if (!TM)
+    return nullptr;
+  auto C = dyn_cast_or_null<ConstantInt>(TM->getValue());
+  if (!C) return nullptr;
+  // We are looking for i64 constants.
+  if (C->getBitWidth() != 64) return nullptr;
+
+  // Sanity check.
+  auto FM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(1));
+  // Can be null if a function was removed by an optimization.
+  if (FM) {
+    auto F = dyn_cast<Function>(FM->getValue());
+    // But can never be a function declaration.
+    assert(!F || !F->isDeclaration());
+    (void)F; // Suppress unused variable warning in the no-asserts build.
+  }
+  return C;
+}
+
+/// buildCFICheck - emits __cfi_check for the current module.
+void CrossDSOCFI::buildCFICheck() {
+  // FIXME: verify that __cfi_check ends up near the end of the code section,
+  // but before the jump slots created in LowerBitSets.
+  llvm::DenseSet<uint64_t> BitSetIds;
+  NamedMDNode *BitSetNM = M->getNamedMetadata("llvm.bitsets");
+
+  if (BitSetNM)
+    for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I)
+      if (ConstantInt *TypeId = extractBitSetTypeId(BitSetNM->getOperand(I)))
+        BitSetIds.insert(TypeId->getZExtValue());
+
+  LLVMContext &Ctx = M->getContext();
+  Constant *C = M->getOrInsertFunction(
+      "__cfi_check",
+      FunctionType::get(
+          Type::getVoidTy(Ctx),
+          {Type::getInt64Ty(Ctx), PointerType::getUnqual(Type::getInt8Ty(Ctx))},
+          false));
+  Function *F = dyn_cast<Function>(C);
+  F->setAlignment(4096);
+  auto args = F->arg_begin();
+  Argument &CallSiteTypeId = *(args++);
+  CallSiteTypeId.setName("CallSiteTypeId");
+  Argument &Addr = *(args++);
+  Addr.setName("Addr");
+  assert(args == F->arg_end());
+
+  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+
+  BasicBlock *TrapBB = BasicBlock::Create(Ctx, "trap", F);
+  IRBuilder<> IRBTrap(TrapBB);
+  Function *TrapFn = Intrinsic::getDeclaration(M, Intrinsic::trap);
+  llvm::CallInst *TrapCall = IRBTrap.CreateCall(TrapFn);
+  TrapCall->setDoesNotReturn();
+  TrapCall->setDoesNotThrow();
+  IRBTrap.CreateUnreachable();
+
+  BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
+  IRBuilder<> IRBExit(ExitBB);
+  IRBExit.CreateRetVoid();
+
+  IRBuilder<> IRB(BB);
+  SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, BitSetIds.size());
+  for (uint64_t TypeId : BitSetIds) {
+    ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
+    BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
+    IRBuilder<> IRBTest(TestBB);
+    Function *BitsetTestFn =
+        Intrinsic::getDeclaration(M, Intrinsic::bitset_test);
+
+    Value *Test = IRBTest.CreateCall(
+        BitsetTestFn, {&Addr, MetadataAsValue::get(
+                                  Ctx, ConstantAsMetadata::get(CaseTypeId))});
+    BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
+    BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
+
+    SI->addCase(CaseTypeId, TestBB);
+    ++TypeIds;
+  }
+}
+
+bool CrossDSOCFI::runOnModule(Module &M) {
+  if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
+    return false;
+  buildCFICheck();
+  return true;
+}
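For orientation, a rough sketch (illustrative; the type-id value and the exact metadata spelling are assumed, not taken from a real module) of the __cfi_check function buildCFICheck emits, with one switch case per exported bitset type id:

declare i1 @llvm.bitset.test(i8*, metadata)
declare void @llvm.trap()

define void @__cfi_check(i64 %CallSiteTypeId, i8* %Addr) align 4096 {
entry:
  switch i64 %CallSiteTypeId, label %trap [
    i64 1234, label %test
  ]
test:
  %ok = call i1 @llvm.bitset.test(i8* %Addr, metadata i64 1234)
  br i1 %ok, label %exit, label %trap
trap:
  call void @llvm.trap()
  unreachable
exit:
  ret void
}

Another DSO can call this exported entry point to ask whether an address is a valid call target for a given type id, which is what lets the CFI checks work across shared-object boundaries.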
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index d0447640259e..4de3d95ab11d 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <map>
 #include <set>
 #include <tuple>
@@ -121,14 +122,6 @@ namespace {
 
     typedef SmallVector<RetOrArg, 5> UseVector;
 
-    // Map each LLVM function to corresponding metadata with debug info. If
-    // the function is replaced with another one, we should patch the pointer
-    // to LLVM function in metadata.
-    // As the code generation for module is finished (and DIBuilder is
-    // finalized) we assume that subprogram descriptors won't be changed, and
-    // they are stored in map for short duration anyway.
-    DenseMap<const Function *, DISubprogram *> FunctionDIs;
-
   protected:
     // DAH uses this to specify a different ID.
     explicit DAE(char &ID) : ModulePass(ID) {}
@@ -198,6 +191,13 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   if (Fn.hasAddressTaken())
     return false;
 
+  // Don't touch naked functions. The assembly might be using an argument, or
+  // otherwise rely on the frame layout in a way that this analysis will not
+  // see.
+  if (Fn.hasFnAttribute(Attribute::Naked)) {
+    return false;
+  }
+
   // Okay, we know we can transform this function if safe.  Scan its body
   // looking for calls marked musttail or calls to llvm.vastart.
   for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
@@ -229,7 +229,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   // Create the new function body and insert it into the module...
   Function *NF = Function::Create(NFTy, Fn.getLinkage());
   NF->copyAttributesFrom(&Fn);
-  Fn.getParent()->getFunctionList().insert(&Fn, NF);
+  Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
   NF->takeName(&Fn);
 
   // Loop over all of the callers of the function, transforming the call sites
@@ -296,20 +296,12 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
        I2 = NF->arg_begin(); I != E; ++I, ++I2) {
     // Move the name and users over to the new version.
-    I->replaceAllUsesWith(I2);
-    I2->takeName(I);
+    I->replaceAllUsesWith(&*I2);
+    I2->takeName(&*I);
   }
 
   // Patch the pointer to LLVM function in debug info descriptor.
-  auto DI = FunctionDIs.find(&Fn);
-  if (DI != FunctionDIs.end()) {
-    DISubprogram *SP = DI->second;
-    SP->replaceFunction(NF);
-    // Ensure the map is updated so it can be reused on non-varargs argument
-    // eliminations of the same function.
-    FunctionDIs.erase(DI);
-    FunctionDIs[NF] = SP;
-  }
+  NF->setSubprogram(Fn.getSubprogram());
 
   // Fix up any BlockAddresses that refer to the function.
   Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
@@ -345,16 +337,19 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
   if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
     return false;
 
+  // Don't touch naked functions. The assembly might be using an argument, or
+  // otherwise rely on the frame layout in a way that this analysis will not
+  // see.
+  if (Fn.hasFnAttribute(Attribute::Naked))
+    return false;
+
   if (Fn.use_empty())
     return false;
 
   SmallVector<unsigned, 8> UnusedArgs;
-  for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end();
-       I != E; ++I) {
-    Argument *Arg = I;
-
-    if (Arg->use_empty() && !Arg->hasByValOrInAllocaAttr())
-      UnusedArgs.push_back(Arg->getArgNo());
+  for (Argument &Arg : Fn.args()) {
+    if (Arg.use_empty() && !Arg.hasByValOrInAllocaAttr())
+      UnusedArgs.push_back(Arg.getArgNo());
   }
 
   if (UnusedArgs.empty())
@@ -485,6 +480,10 @@ DAE::Liveness DAE::SurveyUse(const Use *U,
 
       if (F) {
         // Used in a direct call.
 
+        // The function argument is live if it is used as a bundle operand.
+        if (CS.isBundleOperand(U))
+          return Live;
+
        // Find the argument number. We know for sure that this use is an
        // argument, since if it was the function argument this would be an
        // indirect call and the we know can't be looking at a value of the
@@ -543,6 +542,14 @@ void DAE::SurveyFunction(const Function &F) {
     return;
   }
 
+  // Don't touch naked functions. The assembly might be using an argument, or
+  // otherwise rely on the frame layout in a way that this analysis will not
+  // see.
+  if (F.hasFnAttribute(Attribute::Naked)) {
+    MarkLive(F);
+    return;
+  }
+
   unsigned RetCount = NumRetVals(&F);
 
   // Assume all return values are dead
   typedef SmallVector<Liveness, 5> RetVals;
@@ -648,7 +655,7 @@ void DAE::SurveyFunction(const Function &F) {
     } else {
       // See what the effect of this use is (recording any uses that cause
      // MaybeLive in MaybeLiveArgUses).
-      Result = SurveyUses(AI, MaybeLiveArgUses);
+      Result = SurveyUses(&*AI, MaybeLiveArgUses);
     }
 
     // Mark the result.
@@ -878,7 +885,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   NF->setAttributes(NewPAL);
   // Insert the new function before the old function, so we won't be processing
   // it again.
-  F->getParent()->getFunctionList().insert(F, NF);
+  F->getParent()->getFunctionList().insert(F->getIterator(), NF);
   NF->takeName(F);
 
   // Loop over all of the callers of the function, transforming the call sites
@@ -946,7 +953,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
     Instruction *New;
     if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
       New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
-                               Args, "", Call);
+                               Args, "", Call->getParent());
       cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
       cast<InvokeInst>(New)->setAttributes(NewCallPAL);
     } else {
@@ -976,9 +983,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
                " must have been a struct or an array!");
         Instruction *InsertPt = Call;
         if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
-          BasicBlock::iterator IP = II->getNormalDest()->begin();
-          while (isa<PHINode>(IP)) ++IP;
-          InsertPt = IP;
+          BasicBlock *NewEdge = SplitEdge(New->getParent(), II->getNormalDest());
+          InsertPt = &*NewEdge->getFirstInsertionPt();
         }
 
         // We used to return a struct or array. Instead of doing smart stuff
@@ -1026,8 +1032,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
     if (ArgAlive[i]) {
       // If this is a live argument, move the name and users over to the new
       // version.
-      I->replaceAllUsesWith(I2);
-      I2->takeName(I);
+      I->replaceAllUsesWith(&*I2);
+      I2->takeName(&*I);
       ++I2;
     } else {
       // If this argument is dead, replace any uses of it with null constants
@@ -1079,9 +1085,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   }
 
   // Patch the pointer to LLVM function in debug info descriptor.
-  auto DI = FunctionDIs.find(F);
-  if (DI != FunctionDIs.end())
-    DI->second->replaceFunction(NF);
+  NF->setSubprogram(F->getSubprogram());
 
   // Now that the old function is dead, delete it.
   F->eraseFromParent();
@@ -1092,9 +1096,6 @@ bool DAE::runOnModule(Module &M) {
   bool Changed = false;
 
-  // Collect debug info descriptors for functions.
-  FunctionDIs = makeSubprogramMap(M);
-
   // First pass: Do a simple check to see if any functions can have their "..."
   // removed.  We can do this if they never call va_start.  This loop cannot be
   // fused with the next loop, because deleting a function invalidates
@@ -1119,7 +1120,7 @@ bool DAE::runOnModule(Module &M) {
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
     // Increment now, because the function will probably get removed (ie.
     // replaced by a new one).
-    Function *F = I++;
+    Function *F = &*I++;
     Changed |= RemoveDeadStuffFromFunction(F);
   }
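For orientation, a minimal LLVM IR sketch (illustrative only) of what deadargelim does once SurveyFunction proves an argument dead; the naked-function bail-outs added above exist precisely because inline assembly in such functions could read an argument without the IR showing any use of it:

; Before: %unused has no users.
define internal i32 @f(i32 %x, i32 %unused) {
entry:
  ret i32 %x
}

; After: the argument is dropped here and at every call site.
define internal i32 @f(i32 %x) {
entry:
  ret i32 %x
}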
diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
index 67ba72d6a360..af313a6b001d 100644
--- a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -1,4 +1,5 @@
-//===-- ElimAvailExtern.cpp - DCE unreachable internal functions ----------------===//
+//===-- ElimAvailExtern.cpp - DCE unreachable internal functions
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,9 +16,7 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/CtorUtils.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Pass.h"
 using namespace llvm;
@@ -28,18 +27,18 @@ STATISTIC(NumFunctions, "Number of functions removed");
 STATISTIC(NumVariables, "Number of global variables removed");
 
 namespace {
-  struct EliminateAvailableExternally : public ModulePass {
-    static char ID; // Pass identification, replacement for typeid
-    EliminateAvailableExternally() : ModulePass(ID) {
-      initializeEliminateAvailableExternallyPass(
-          *PassRegistry::getPassRegistry());
-    }
+struct EliminateAvailableExternally : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  EliminateAvailableExternally() : ModulePass(ID) {
+    initializeEliminateAvailableExternallyPass(
+        *PassRegistry::getPassRegistry());
+  }
 
-    // run - Do the EliminateAvailableExternally pass on the specified module,
-    // optionally updating the specified callgraph to reflect the changes.
-    //
-    bool runOnModule(Module &M) override;
-  };
+  // run - Do the EliminateAvailableExternally pass on the specified module,
+  // optionally updating the specified callgraph to reflect the changes.
+  //
+  bool runOnModule(Module &M) override;
+};
 }
 
 char EliminateAvailableExternally::ID = 0;
@@ -54,30 +53,31 @@ bool EliminateAvailableExternally::runOnModule(Module &M) {
   bool Changed = false;
 
   // Drop initializers of available externally global variables.
-  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-       I != E; ++I) {
-    if (!I->hasAvailableExternallyLinkage())
+  for (GlobalVariable &GV : M.globals()) {
+    if (!GV.hasAvailableExternallyLinkage())
       continue;
-    if (I->hasInitializer()) {
-      Constant *Init = I->getInitializer();
-      I->setInitializer(nullptr);
+    if (GV.hasInitializer()) {
+      Constant *Init = GV.getInitializer();
+      GV.setInitializer(nullptr);
       if (isSafeToDestroyConstant(Init))
         Init->destroyConstant();
     }
-    I->removeDeadConstantUsers();
-    I->setLinkage(GlobalValue::ExternalLinkage);
+    GV.removeDeadConstantUsers();
+    GV.setLinkage(GlobalValue::ExternalLinkage);
     NumVariables++;
+    Changed = true;
   }
 
   // Drop the bodies of available externally functions.
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
-    if (!I->hasAvailableExternallyLinkage())
+  for (Function &F : M) {
+    if (!F.hasAvailableExternallyLinkage())
       continue;
-    if (!I->isDeclaration())
+    if (!F.isDeclaration())
       // This will set the linkage to external
-      I->deleteBody();
-    I->removeDeadConstantUsers();
+      F.deleteBody();
+    F.removeDeadConstantUsers();
     NumFunctions++;
+    Changed = true;
   }
 
   return Changed;
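For orientation, an illustrative sketch (not from this diff) of the pass's effect. A definition with available_externally linkage, such as codegen emits for a C99 inline function, is guaranteed to have a definitive copy elsewhere, so dropping the body leaves a plain declaration:

; Before:
define available_externally i32 @inc(i32 %x) {
entry:
  %r = add i32 %x, 1
  ret i32 %r
}

; After EliminateAvailableExternally:
declare i32 @inc(i32)

Note the two "Changed = true;" lines added above: the previous code mutated the module but could still report "no change" to the pass manager; the new lines make the return value honest.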
This " + "option can be specified multiple times.")); + +static Attribute::AttrKind parseAttrKind(StringRef Kind) { + return StringSwitch<Attribute::AttrKind>(Kind) + .Case("alwaysinline", Attribute::AlwaysInline) + .Case("builtin", Attribute::Builtin) + .Case("cold", Attribute::Cold) + .Case("convergent", Attribute::Convergent) + .Case("inlinehint", Attribute::InlineHint) + .Case("jumptable", Attribute::JumpTable) + .Case("minsize", Attribute::MinSize) + .Case("naked", Attribute::Naked) + .Case("nobuiltin", Attribute::NoBuiltin) + .Case("noduplicate", Attribute::NoDuplicate) + .Case("noimplicitfloat", Attribute::NoImplicitFloat) + .Case("noinline", Attribute::NoInline) + .Case("nonlazybind", Attribute::NonLazyBind) + .Case("noredzone", Attribute::NoRedZone) + .Case("noreturn", Attribute::NoReturn) + .Case("norecurse", Attribute::NoRecurse) + .Case("nounwind", Attribute::NoUnwind) + .Case("optnone", Attribute::OptimizeNone) + .Case("optsize", Attribute::OptimizeForSize) + .Case("readnone", Attribute::ReadNone) + .Case("readonly", Attribute::ReadOnly) + .Case("argmemonly", Attribute::ArgMemOnly) + .Case("returns_twice", Attribute::ReturnsTwice) + .Case("safestack", Attribute::SafeStack) + .Case("sanitize_address", Attribute::SanitizeAddress) + .Case("sanitize_memory", Attribute::SanitizeMemory) + .Case("sanitize_thread", Attribute::SanitizeThread) + .Case("ssp", Attribute::StackProtect) + .Case("sspreq", Attribute::StackProtectReq) + .Case("sspstrong", Attribute::StackProtectStrong) + .Case("uwtable", Attribute::UWTable) + .Default(Attribute::None); +} + +/// If F has any forced attributes given on the command line, add them. +static void addForcedAttributes(Function &F) { + for (auto &S : ForceAttributes) { + auto KV = StringRef(S).split(':'); + if (KV.first != F.getName()) + continue; + + auto Kind = parseAttrKind(KV.second); + if (Kind == Attribute::None) { + DEBUG(dbgs() << "ForcedAttribute: " << KV.second + << " unknown or not handled!\n"); + continue; + } + if (F.hasFnAttribute(Kind)) + continue; + F.addFnAttr(Kind); + } +} + +PreservedAnalyses ForceFunctionAttrsPass::run(Module &M) { + if (ForceAttributes.empty()) + return PreservedAnalyses::all(); + + for (Function &F : M.functions()) + addForcedAttributes(F); + + // Just conservatively invalidate analyses, this isn't likely to be important. + return PreservedAnalyses::none(); +} + +namespace { +struct ForceFunctionAttrsLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ForceFunctionAttrsLegacyPass() : ModulePass(ID) { + initializeForceFunctionAttrsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (ForceAttributes.empty()) + return false; + + for (Function &F : M.functions()) + addForcedAttributes(F); + + // Conservatively assume we changed something. 
+    return true;
+  }
+};
+}
+
+char ForceFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs",
+                "Force set function attributes", false, false)
+
+Pass *llvm::createForceFunctionAttrsLegacyPass() {
+  return new ForceFunctionAttrsLegacyPass();
+}
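For orientation (illustrative, not from this diff): the option is registered as "force-attribute", although the help text's example spells it "-force-add-attribute" (kept verbatim above). Running something like opt -forceattrs -force-attribute=foo:noinline would rewrite

define void @foo() {
entry:
  ret void
}

into

define void @foo() #0 {
entry:
  ret void
}

attributes #0 = { noinline }

so a developer can test how the optimizer behaves with an attribute forced on, without editing the input IR.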
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index bb5e64aef338..6dcfb3f83004 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -23,14 +23,21 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 using namespace llvm;
 
@@ -42,230 +49,191 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
 STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
 STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
 STATISTIC(NumNoAlias, "Number of function returns marked noalias");
-STATISTIC(NumAnnotated, "Number of attributes added to library functions");
+STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
+STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
 
 namespace {
-  struct FunctionAttrs : public CallGraphSCCPass {
-    static char ID; // Pass identification, replacement for typeid
-    FunctionAttrs() : CallGraphSCCPass(ID), AA(nullptr) {
-      initializeFunctionAttrsPass(*PassRegistry::getPassRegistry());
-    }
-
-    // runOnSCC - Analyze the SCC, performing the transformation if possible.
-    bool runOnSCC(CallGraphSCC &SCC) override;
-
-    // AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
-    bool AddReadAttrs(const CallGraphSCC &SCC);
-
-    // AddArgumentAttrs - Deduce nocapture attributes for the SCC.
-    bool AddArgumentAttrs(const CallGraphSCC &SCC);
-
-    // IsFunctionMallocLike - Does this function allocate new memory?
-    bool IsFunctionMallocLike(Function *F,
-                              SmallPtrSet<Function*, 8> &) const;
-
-    // AddNoAliasAttrs - Deduce noalias attributes for the SCC.
-    bool AddNoAliasAttrs(const CallGraphSCC &SCC);
-
-    // Utility methods used by inferPrototypeAttributes to add attributes
-    // and maintain annotation statistics.
-
-    void setDoesNotAccessMemory(Function &F) {
-      if (!F.doesNotAccessMemory()) {
-        F.setDoesNotAccessMemory();
-        ++NumAnnotated;
-      }
-    }
-
-    void setOnlyReadsMemory(Function &F) {
-      if (!F.onlyReadsMemory()) {
-        F.setOnlyReadsMemory();
-        ++NumAnnotated;
-      }
-    }
-
-    void setDoesNotThrow(Function &F) {
-      if (!F.doesNotThrow()) {
-        F.setDoesNotThrow();
-        ++NumAnnotated;
-      }
-    }
-
-    void setDoesNotCapture(Function &F, unsigned n) {
-      if (!F.doesNotCapture(n)) {
-        F.setDoesNotCapture(n);
-        ++NumAnnotated;
-      }
-    }
-
-    void setOnlyReadsMemory(Function &F, unsigned n) {
-      if (!F.onlyReadsMemory(n)) {
-        F.setOnlyReadsMemory(n);
-        ++NumAnnotated;
-      }
-    }
-
-    void setDoesNotAlias(Function &F, unsigned n) {
-      if (!F.doesNotAlias(n)) {
-        F.setDoesNotAlias(n);
-        ++NumAnnotated;
-      }
-    }
-
-    // inferPrototypeAttributes - Analyze the name and prototype of the
-    // given function and set any applicable attributes.  Returns true
-    // if any attributes were set and false otherwise.
-    bool inferPrototypeAttributes(Function &F);
+typedef SmallSetVector<Function *, 8> SCCNodeSet;
+}
 
-    // annotateLibraryCalls - Adds attributes to well-known standard library
-    // call declarations.
-    bool annotateLibraryCalls(const CallGraphSCC &SCC);
+namespace {
+struct FunctionAttrs : public CallGraphSCCPass {
+  static char ID; // Pass identification, replacement for typeid
+  FunctionAttrs() : CallGraphSCCPass(ID) {
+    initializeFunctionAttrsPass(*PassRegistry::getPassRegistry());
+  }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.setPreservesCFG();
-      AU.addRequired<AliasAnalysis>();
-      AU.addRequired<TargetLibraryInfoWrapperPass>();
-      CallGraphSCCPass::getAnalysisUsage(AU);
-    }
+  bool runOnSCC(CallGraphSCC &SCC) override;
+  bool doInitialization(CallGraph &CG) override {
+    Revisit.clear();
+    return false;
+  }
+  bool doFinalization(CallGraph &CG) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    CallGraphSCCPass::getAnalysisUsage(AU);
+  }
 
-  private:
-    AliasAnalysis *AA;
-    TargetLibraryInfo *TLI;
-  };
+private:
+  TargetLibraryInfo *TLI;
+  SmallVector<WeakVH,16> Revisit;
+};
 }
 
 char FunctionAttrs::ID = 0;
 INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
-                "Deduce function attributes", false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+                      "Deduce function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
-                "Deduce function attributes", false, false)
+                    "Deduce function attributes", false, false)
 
 Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); }
 
+namespace {
+/// The three kinds of memory access relevant to 'readonly' and
+/// 'readnone' attributes.
+enum MemoryAccessKind {
+  MAK_ReadNone = 0,
+  MAK_ReadOnly = 1,
+  MAK_MayWrite = 2
+};
+}
 
-/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
-bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
-  SmallPtrSet<Function*, 8> SCCNodes;
-
-  // Fill SCCNodes with the elements of the SCC.  Used for quickly
-  // looking up whether a given CallGraphNode is in this SCC.
-  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
-    SCCNodes.insert((*I)->getFunction());
+static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
+                                                  const SCCNodeSet &SCCNodes) {
+  FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
+  if (MRB == FMRB_DoesNotAccessMemory)
+    // Already perfect!
+    return MAK_ReadNone;
+
+  // Definitions with weak linkage may be overridden at linktime with
+  // something that writes memory, so treat them like declarations.
+  if (F.isDeclaration() || F.mayBeOverridden()) {
+    if (AliasAnalysis::onlyReadsMemory(MRB))
+      return MAK_ReadOnly;
+
+    // Conservatively assume it writes to memory.
+    return MAK_MayWrite;
+  }
 
-  // Check if any of the functions in the SCC read or write memory.  If they
-  // write memory then they can't be marked readnone or readonly.
+  // Scan the function body for instructions that may read or write memory.
   bool ReadsMemory = false;
-  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
-    Function *F = (*I)->getFunction();
-
-    if (!F || F->hasFnAttribute(Attribute::OptimizeNone))
-      // External node or node we don't want to optimize - assume it may write
-      // memory and give up.
-      return false;
+  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+    Instruction *I = &*II;
+
+    // Some instructions can be ignored even if they read or write memory.
+    // Detect these now, skipping to the next instruction if one is found.
+    CallSite CS(cast<Value>(I));
+    if (CS) {
+      // Ignore calls to functions in the same SCC.
+      if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
+        continue;
+      FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS);
 
-    AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(F);
-    if (MRB == AliasAnalysis::DoesNotAccessMemory)
-      // Already perfect!
-      continue;
+      // If the call doesn't access memory, we're done.
+      if (!(MRB & MRI_ModRef))
+        continue;
 
-    // Definitions with weak linkage may be overridden at linktime with
-    // something that writes memory, so treat them like declarations.
-    if (F->isDeclaration() || F->mayBeOverridden()) {
-      if (!AliasAnalysis::onlyReadsMemory(MRB))
-        // May write memory.  Just give up.
-        return false;
+      if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+        // The call could access any memory. If that includes writes, give up.
+        if (MRB & MRI_Mod)
+          return MAK_MayWrite;
+        // If it reads, note it.
+        if (MRB & MRI_Ref)
+          ReadsMemory = true;
+        continue;
+      }
 
-      ReadsMemory = true;
-      continue;
-    }
+      // Check whether all pointer arguments point to local memory, and
+      // ignore calls that only access local memory.
+      for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+           CI != CE; ++CI) {
+        Value *Arg = *CI;
+        if (!Arg->getType()->isPtrOrPtrVectorTy())
+          continue;
 
-    // Scan the function body for instructions that may read or write memory.
-    for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
-      Instruction *I = &*II;
+        AAMDNodes AAInfo;
+        I->getAAMetadata(AAInfo);
+        MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo);
 
-      // Some instructions can be ignored even if they read or write memory.
-      // Detect these now, skipping to the next instruction if one is found.
-      CallSite CS(cast<Value>(I));
-      if (CS) {
-        // Ignore calls to functions in the same SCC.
-        if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
+        // Skip accesses to local or constant memory as they don't impact the
+        // externally visible mod/ref behavior.
+        if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
           continue;
-        AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(CS);
 
-        // If the call doesn't access arbitrary memory, we may be able to
-        // figure out something.
-        if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
-          // If the call does access argument pointees, check each argument.
-          if (AliasAnalysis::doesAccessArgPointees(MRB))
-            // Check whether all pointer arguments point to local memory, and
-            // ignore calls that only access local memory.
-            for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
-                 CI != CE; ++CI) {
-              Value *Arg = *CI;
-              if (Arg->getType()->isPointerTy()) {
-                AAMDNodes AAInfo;
-                I->getAAMetadata(AAInfo);
-
-                MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo);
-                if (!AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) {
-                  if (MRB & AliasAnalysis::Mod)
-                    // Writes non-local memory.  Give up.
-                    return false;
-                  if (MRB & AliasAnalysis::Ref)
-                    // Ok, it reads non-local memory.
-                    ReadsMemory = true;
-                }
-              }
-            }
-          continue;
-        }
-        // The call could access any memory.  If that includes writes, give up.
-        if (MRB & AliasAnalysis::Mod)
-          return false;
-        // If it reads, note it.
-        if (MRB & AliasAnalysis::Ref)
+
+        if (MRB & MRI_Mod)
+          // Writes non-local memory. Give up.
+          return MAK_MayWrite;
+        if (MRB & MRI_Ref)
+          // Ok, it reads non-local memory.
           ReadsMemory = true;
-        continue;
-      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        // Ignore non-volatile loads from local memory. (Atomic is okay here.)
-        if (!LI->isVolatile()) {
-          MemoryLocation Loc = MemoryLocation::get(LI);
-          if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
-            continue;
-        }
-      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        // Ignore non-volatile stores to local memory. (Atomic is okay here.)
-        if (!SI->isVolatile()) {
-          MemoryLocation Loc = MemoryLocation::get(SI);
-          if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
+      }
+      continue;
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      // Ignore non-volatile loads from local memory. (Atomic is okay here.)
+      if (!LI->isVolatile()) {
+        MemoryLocation Loc = MemoryLocation::get(LI);
+        if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+          continue;
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // Ignore non-volatile stores to local memory. (Atomic is okay here.)
+      if (!SI->isVolatile()) {
+        MemoryLocation Loc = MemoryLocation::get(SI);
+        if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
           continue;
       }
-      } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
-        // Ignore vaargs on local memory.
-        MemoryLocation Loc = MemoryLocation::get(VI);
-        if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
+    } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
+      // Ignore vaargs on local memory.
+      MemoryLocation Loc = MemoryLocation::get(VI);
+      if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+        continue;
+    }
 
-      // Any remaining instructions need to be taken seriously!  Check if they
-      // read or write memory.
-      if (I->mayWriteToMemory())
-        // Writes memory.  Just give up.
-        return false;
+    // Any remaining instructions need to be taken seriously!  Check if they
+    // read or write memory.
+    if (I->mayWriteToMemory())
+      // Writes memory.  Just give up.
+      return MAK_MayWrite;
+
+    // If this instruction may read memory, remember that.
+    ReadsMemory |= I->mayReadFromMemory();
+  }
+
+  return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
+}
 
-      // If this instruction may read memory, remember that.
-      ReadsMemory |= I->mayReadFromMemory();
+/// Deduce readonly/readnone attributes for the SCC.
+template <typename AARGetterT>
+static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
+  // Check if any of the functions in the SCC read or write memory.  If they
+  // write memory then they can't be marked readnone or readonly.
+  bool ReadsMemory = false;
+  for (Function *F : SCCNodes) {
+    // Call the callable parameter to look up AA results for this function.
+    AAResults &AAR = AARGetter(*F);
+
+    switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) {
+    case MAK_MayWrite:
+      return false;
+    case MAK_ReadOnly:
+      ReadsMemory = true;
+      break;
+    case MAK_ReadNone:
+      // Nothing to do!
+      break;
     }
   }
 
   // Success!  Functions in this SCC do not access memory, or only read memory.
   // Give them the appropriate attribute.
   bool MadeChange = false;
-  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
-    Function *F = (*I)->getFunction();
-
+  for (Function *F : SCCNodes) {
     if (F->doesNotAccessMemory())
       // Already perfect!
       continue;
@@ -278,11 +246,10 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
 
     // Clear out any existing attributes.
     AttrBuilder B;
-    B.addAttribute(Attribute::ReadOnly)
-      .addAttribute(Attribute::ReadNone);
-    F->removeAttributes(AttributeSet::FunctionIndex,
-                        AttributeSet::get(F->getContext(),
-                                          AttributeSet::FunctionIndex, B));
+    B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
+    F->removeAttributes(
+        AttributeSet::FunctionIndex,
+        AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B));
 
     // Add in the new attribute.
     F->addAttribute(AttributeSet::FunctionIndex,
@@ -298,124 +265,140 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
 }
 
 namespace {
-  // For a given pointer Argument, this retains a list of Arguments of functions
-  // in the same SCC that the pointer data flows into. We use this to build an
-  // SCC of the arguments.
-  struct ArgumentGraphNode {
-    Argument *Definition;
-    SmallVector<ArgumentGraphNode*, 4> Uses;
-  };
-
-  class ArgumentGraph {
-    // We store pointers to ArgumentGraphNode objects, so it's important that
-    // that they not move around upon insert.
-    typedef std::map<Argument*, ArgumentGraphNode> ArgumentMapTy;
+/// For a given pointer Argument, this retains a list of Arguments of functions
+/// in the same SCC that the pointer data flows into. We use this to build an
+/// SCC of the arguments.
+struct ArgumentGraphNode {
+  Argument *Definition;
+  SmallVector<ArgumentGraphNode *, 4> Uses;
+};
+
+class ArgumentGraph {
+  // We store pointers to ArgumentGraphNode objects, so it's important that
+  // that they not move around upon insert.
+  typedef std::map<Argument *, ArgumentGraphNode> ArgumentMapTy;
+
+  ArgumentMapTy ArgumentMap;
+
+  // There is no root node for the argument graph, in fact:
+  //   void f(int *x, int *y) { if (...) f(x, y); }
+  // is an example where the graph is disconnected. The SCCIterator requires a
+  // single entry point, so we maintain a fake ("synthetic") root node that
+  // uses every node. Because the graph is directed and nothing points into
+  // the root, it will not participate in any SCCs (except for its own).
+  ArgumentGraphNode SyntheticRoot;
+
+public:
+  ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
+
+  typedef SmallVectorImpl<ArgumentGraphNode *>::iterator iterator;
+
+  iterator begin() { return SyntheticRoot.Uses.begin(); }
+  iterator end() { return SyntheticRoot.Uses.end(); }
+  ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
+
+  ArgumentGraphNode *operator[](Argument *A) {
+    ArgumentGraphNode &Node = ArgumentMap[A];
+    Node.Definition = A;
+    SyntheticRoot.Uses.push_back(&Node);
+    return &Node;
+  }
+};
 
-    ArgumentMapTy ArgumentMap;
+/// This tracker checks whether callees are in the SCC, and if so it does not
+/// consider that a capture, instead adding it to the "Uses" list and
+/// continuing with the analysis.
+struct ArgumentUsesTracker : public CaptureTracker {
+  ArgumentUsesTracker(const SCCNodeSet &SCCNodes)
+      : Captured(false), SCCNodes(SCCNodes) {}
 
-    // There is no root node for the argument graph, in fact:
-    //   void f(int *x, int *y) { if (...) f(x, y); }
-    // is an example where the graph is disconnected. The SCCIterator requires a
-    // single entry point, so we maintain a fake ("synthetic") root node that
-    // uses every node. Because the graph is directed and nothing points into
-    // the root, it will not participate in any SCCs (except for its own).
-    ArgumentGraphNode SyntheticRoot;
+  void tooManyUses() override { Captured = true; }
 
-  public:
-    ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
+  bool captured(const Use *U) override {
+    CallSite CS(U->getUser());
+    if (!CS.getInstruction()) {
+      Captured = true;
+      return true;
+    }
 
-    typedef SmallVectorImpl<ArgumentGraphNode*>::iterator iterator;
+    Function *F = CS.getCalledFunction();
+    if (!F || F->isDeclaration() || F->mayBeOverridden() ||
+        !SCCNodes.count(F)) {
+      Captured = true;
+      return true;
+    }
 
-    iterator begin() { return SyntheticRoot.Uses.begin(); }
-    iterator end() { return SyntheticRoot.Uses.end(); }
-    ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
+    // Note: the callee and the two successor blocks *follow* the argument
+    // operands. This means there is no need to adjust UseIndex to account for
+    // these.
 
-    ArgumentGraphNode *operator[](Argument *A) {
-      ArgumentGraphNode &Node = ArgumentMap[A];
-      Node.Definition = A;
-      SyntheticRoot.Uses.push_back(&Node);
-      return &Node;
-    }
-  };
+    unsigned UseIndex =
+        std::distance(const_cast<const Use *>(CS.arg_begin()), U);
 
-  // This tracker checks whether callees are in the SCC, and if so it does not
-  // consider that a capture, instead adding it to the "Uses" list and
-  // continuing with the analysis.
-  struct ArgumentUsesTracker : public CaptureTracker {
-    ArgumentUsesTracker(const SmallPtrSet<Function*, 8> &SCCNodes)
-      : Captured(false), SCCNodes(SCCNodes) {}
+    assert(UseIndex < CS.data_operands_size() &&
+           "Indirect function calls should have been filtered above!");
 
-    void tooManyUses() override { Captured = true; }
+    if (UseIndex >= CS.getNumArgOperands()) {
+      // Data operand, but not a argument operand -- must be a bundle operand
+      assert(CS.hasOperandBundles() && "Must be!");
 
-    bool captured(const Use *U) override {
-      CallSite CS(U->getUser());
-      if (!CS.getInstruction()) { Captured = true; return true; }
+      // CaptureTracking told us that we're being captured by an operand bundle
+      // use. In this case it does not matter if the callee is within our SCC
+      // or not -- we've been captured in some unknown way, and we have to be
+      // conservative.
+ Captured = true; + return true; + } - Function *F = CS.getCalledFunction(); - if (!F || !SCCNodes.count(F)) { Captured = true; return true; } - - bool Found = false; - Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end(); - PI != PE; ++PI, ++AI) { - if (AI == AE) { - assert(F->isVarArg() && "More params than args in non-varargs call"); - Captured = true; - return true; - } - if (PI == U) { - Uses.push_back(AI); - Found = true; - break; - } - } - assert(Found && "Capturing call-site captured nothing?"); - (void)Found; - return false; + if (UseIndex >= F->arg_size()) { + assert(F->isVarArg() && "More params than args in non-varargs call"); + Captured = true; + return true; } - bool Captured; // True only if certainly captured (used outside our SCC). - SmallVector<Argument*, 4> Uses; // Uses within our SCC. + Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); + return false; + } - const SmallPtrSet<Function*, 8> &SCCNodes; - }; + bool Captured; // True only if certainly captured (used outside our SCC). + SmallVector<Argument *, 4> Uses; // Uses within our SCC. + + const SCCNodeSet &SCCNodes; +}; } namespace llvm { - template<> struct GraphTraits<ArgumentGraphNode*> { - typedef ArgumentGraphNode NodeType; - typedef SmallVectorImpl<ArgumentGraphNode*>::iterator ChildIteratorType; +template <> struct GraphTraits<ArgumentGraphNode *> { + typedef ArgumentGraphNode NodeType; + typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType; - static inline NodeType *getEntryNode(NodeType *A) { return A; } - static inline ChildIteratorType child_begin(NodeType *N) { - return N->Uses.begin(); - } - static inline ChildIteratorType child_end(NodeType *N) { - return N->Uses.end(); - } - }; - template<> struct GraphTraits<ArgumentGraph*> - : public GraphTraits<ArgumentGraphNode*> { - static NodeType *getEntryNode(ArgumentGraph *AG) { - return AG->getEntryNode(); - } - static ChildIteratorType nodes_begin(ArgumentGraph *AG) { - return AG->begin(); - } - static ChildIteratorType nodes_end(ArgumentGraph *AG) { - return AG->end(); - } - }; + static inline NodeType *getEntryNode(NodeType *A) { return A; } + static inline ChildIteratorType child_begin(NodeType *N) { + return N->Uses.begin(); + } + static inline ChildIteratorType child_end(NodeType *N) { + return N->Uses.end(); + } +}; +template <> +struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { + static NodeType *getEntryNode(ArgumentGraph *AG) { + return AG->getEntryNode(); + } + static ChildIteratorType nodes_begin(ArgumentGraph *AG) { + return AG->begin(); + } + static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); } +}; } -// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. +/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. static Attribute::AttrKind determinePointerReadAttrs(Argument *A, - const SmallPtrSet<Argument*, 8> &SCCNodes) { - - SmallVector<Use*, 32> Worklist; - SmallSet<Use*, 32> Visited; - int Count = 0; + const SmallPtrSet<Argument *, 8> &SCCNodes) { + + SmallVector<Use *, 32> Worklist; + SmallSet<Use *, 32> Visited; // inalloca arguments are always clobbered by the call. if (A->hasInAllocaAttr()) @@ -425,9 +408,6 @@ determinePointerReadAttrs(Argument *A, // We don't need to track IsWritten. If A is written to, return immediately. 
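The GraphTraits specializations in the hunk above are what let the generic scc_iterator walk an ArgumentGraph at all: the algorithm is written purely against the traits interface. A toy version of the pattern, assuming nothing from LLVM (Traits, Node, and printChildren are invented here):

    #include <iostream>
    #include <vector>

    template <typename GraphT> struct Traits; // primary template, no body

    struct Node {
      int Id;
      std::vector<Node *> Uses; // edges, named after ArgumentGraphNode::Uses
    };

    template <> struct Traits<Node *> {
      using ChildIt = std::vector<Node *>::iterator;
      static ChildIt child_begin(Node *N) { return N->Uses.begin(); }
      static ChildIt child_end(Node *N) { return N->Uses.end(); }
    };

    // Any algorithm written against Traits<> works on any specialized graph.
    template <typename NodeT> void printChildren(NodeT N) {
      for (auto It = Traits<NodeT>::child_begin(N),
                E = Traits<NodeT>::child_end(N);
           It != E; ++It)
        std::cout << (*It)->Id << ' ';
      std::cout << '\n';
    }

    int main() {
      Node A{0, {}}, B{1, {}};
      A.Uses = {&B};
      printChildren(&A); // prints "1"
    }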
for (Use &U : A->uses()) { - if (Count++ >= 20) - return Attribute::None; - Visited.insert(&U); Worklist.push_back(&U); } @@ -435,7 +415,6 @@ determinePointerReadAttrs(Argument *A, while (!Worklist.empty()) { Use *U = Worklist.pop_back_val(); Instruction *I = cast<Instruction>(U->getUser()); - Value *V = U->get(); switch (I->getOpcode()) { case Instruction::BitCast: @@ -479,24 +458,44 @@ determinePointerReadAttrs(Argument *A, return Attribute::None; } - Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (CallSite::arg_iterator A = B; A != E; ++A, ++AI) { - if (A->get() == V) { - if (AI == AE) { - assert(F->isVarArg() && - "More params than args in non-varargs call."); - return Attribute::None; - } - Captures &= !CS.doesNotCapture(A - B); - if (SCCNodes.count(AI)) - continue; - if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B)) - return Attribute::None; - if (!CS.doesNotAccessMemory(A - B)) - IsRead = true; - } + // Note: the callee and the two successor blocks *follow* the argument + // operands. This means there is no need to adjust UseIndex to account + // for these. + + unsigned UseIndex = std::distance(CS.arg_begin(), U); + + // U cannot be the callee operand use: since we're exploring the + // transitive uses of an Argument, having such a use be a callee would + // imply the CallSite is an indirect call or invoke; and we'd take the + // early exit above. + assert(UseIndex < CS.data_operands_size() && + "Data operand use expected!"); + + bool IsOperandBundleUse = UseIndex >= CS.getNumArgOperands(); + + if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { + assert(F->isVarArg() && "More params than args in non-varargs call"); + return Attribute::None; } + + Captures &= !CS.doesNotCapture(UseIndex); + + // Since the optimizer (by design) cannot see the data flow corresponding + // to a operand bundle use, these cannot participate in the optimistic SCC + // analysis. Instead, we model the operand bundle uses as arguments in + // call to a function external to the SCC. + if (!SCCNodes.count(&*std::next(F->arg_begin(), UseIndex)) || + IsOperandBundleUse) { + + // The accessors used on CallSite here do the right thing for calls and + // invokes with operand bundles. + + if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(UseIndex)) + return Attribute::None; + if (!CS.doesNotAccessMemory(UseIndex)) + IsRead = true; + } + AddUsersToWorklistIfCapturing(); break; } @@ -517,21 +516,10 @@ determinePointerReadAttrs(Argument *A, return IsRead ? Attribute::ReadOnly : Attribute::ReadNone; } -/// AddArgumentAttrs - Deduce nocapture attributes for the SCC. -bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { +/// Deduce nocapture attributes for the SCC. +static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; - SmallPtrSet<Function*, 8> SCCNodes; - - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - if (F && !F->isDeclaration() && !F->mayBeOverridden() && - !F->hasFnAttribute(Attribute::OptimizeNone)) - SCCNodes.insert(F); - } - ArgumentGraph AG; AttrBuilder B; @@ -539,14 +527,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // Check each function in turn, determining which pointer arguments are not // captured. 
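determinePointerReadAttrs drives the same visited-set worklist as before, but the patch drops the arbitrary 20-use cutoff: every transitive use is now visited exactly once. The shape of that traversal, reduced to a toy use-graph (standalone sketch; Use here is a stand-in struct, not llvm::Use):

    #include <cstddef>
    #include <unordered_set>
    #include <vector>

    struct Use { int Id; std::vector<Use *> Users; };

    // Visits every transitive use exactly once; unlike the pre-patch
    // code, there is no cutoff after N uses.
    std::size_t countTransitiveUses(Use &Root) {
      std::vector<Use *> Worklist;
      std::unordered_set<Use *> Visited;
      for (Use *U : Root.Users)
        if (Visited.insert(U).second)
          Worklist.push_back(U);
      std::size_t N = 0;
      while (!Worklist.empty()) {
        Use *U = Worklist.back();
        Worklist.pop_back();
        ++N;
        for (Use *Next : U->Users)
          if (Visited.insert(Next).second) // each use enqueued at most once
            Worklist.push_back(Next);
      }
      return N;
    }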
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or function we're trying not to optimize - only a problem - // for arguments that we pass to it. - continue; - + for (Function *F : SCCNodes) { // Definitions with weak linkage may be overridden at linktime with // something that captures pointers, so treat them like declarations. if (F->isDeclaration() || F->mayBeOverridden()) @@ -556,8 +537,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // a value can't capture arguments. Don't analyze them. if (F->onlyReadsMemory() && F->doesNotThrow() && F->getReturnType()->isVoidTy()) { - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); - A != E; ++A) { + for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; + ++A) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; @@ -567,26 +548,30 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { continue; } - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); - A != E; ++A) { - if (!A->getType()->isPointerTy()) continue; + for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; + ++A) { + if (!A->getType()->isPointerTy()) + continue; bool HasNonLocalUses = false; if (!A->hasNoCaptureAttr()) { ArgumentUsesTracker Tracker(SCCNodes); - PointerMayBeCaptured(A, &Tracker); + PointerMayBeCaptured(&*A, &Tracker); if (!Tracker.Captured) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. - A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B)); + A->addAttr( + AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. - ArgumentGraphNode *Node = AG[A]; - for (SmallVectorImpl<Argument*>::iterator UI = Tracker.Uses.begin(), - UE = Tracker.Uses.end(); UI != UE; ++UI) { + ArgumentGraphNode *Node = AG[&*A]; + for (SmallVectorImpl<Argument *>::iterator + UI = Tracker.Uses.begin(), + UE = Tracker.Uses.end(); + UI != UE; ++UI) { Node->Uses.push_back(AG[*UI]); if (*UI != A) HasNonLocalUses = true; @@ -600,9 +585,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // Note that we don't allow any calls at all here, or else our result // will be dependent on the iteration order through the functions in the // SCC. - SmallPtrSet<Argument*, 8> Self; - Self.insert(A); - Attribute::AttrKind R = determinePointerReadAttrs(A, Self); + SmallPtrSet<Argument *, 8> Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); if (R != Attribute::None) { AttrBuilder B; B.addAttribute(R); @@ -621,10 +606,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // made. If the definition doesn't have a 'nocapture' attribute by now, it // captures. - for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG); !I.isAtEnd(); ++I) { + for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) { const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I; if (ArgumentSCC.size() == 1) { - if (!ArgumentSCC[0]->Definition) continue; // synthetic root node + if (!ArgumentSCC[0]->Definition) + continue; // synthetic root node // eg. 
"void f(int* x) { if (...) f(x); }" if (ArgumentSCC[0]->Uses.size() == 1 && @@ -646,9 +632,10 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { SCCCaptured = true; } } - if (SCCCaptured) continue; + if (SCCCaptured) + continue; - SmallPtrSet<Argument*, 8> ArgumentSCCNodes; + SmallPtrSet<Argument *, 8> ArgumentSCCNodes; // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for // quickly looking up whether a given Argument is in this ArgumentSCC. for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E; ++I) { @@ -658,8 +645,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) { ArgumentGraphNode *N = *I; - for (SmallVectorImpl<ArgumentGraphNode*>::iterator UI = N->Uses.begin(), - UE = N->Uses.end(); UI != UE; ++UI) { + for (SmallVectorImpl<ArgumentGraphNode *>::iterator UI = N->Uses.begin(), + UE = N->Uses.end(); + UI != UE; ++UI) { Argument *A = (*UI)->Definition; if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) continue; @@ -667,7 +655,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { break; } } - if (SCCCaptured) continue; + if (SCCCaptured) + continue; for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; @@ -704,8 +693,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { if (ReadAttr != Attribute::None) { AttrBuilder B, R; B.addAttribute(ReadAttr); - R.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + R.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; // Clear out existing readonly/readnone attributes @@ -720,10 +708,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { return Changed; } -/// IsFunctionMallocLike - A function is malloc-like if it returns either null -/// or a pointer that doesn't alias any other pointer visible to the caller. -bool FunctionAttrs::IsFunctionMallocLike(Function *F, - SmallPtrSet<Function*, 8> &SCCNodes) const { +/// Tests whether a function is "malloc-like". +/// +/// A function is "malloc-like" if it returns either null or a pointer that +/// doesn't alias any other pointer visible to the caller. +static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { SmallSetVector<Value *, 8> FlowsToReturn; for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator())) @@ -744,39 +733,38 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, if (Instruction *RVI = dyn_cast<Instruction>(RetVal)) switch (RVI->getOpcode()) { - // Extend the analysis by looking upwards. - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::AddrSpaceCast: - FlowsToReturn.insert(RVI->getOperand(0)); - continue; - case Instruction::Select: { - SelectInst *SI = cast<SelectInst>(RVI); - FlowsToReturn.insert(SI->getTrueValue()); - FlowsToReturn.insert(SI->getFalseValue()); - continue; - } - case Instruction::PHI: { - PHINode *PN = cast<PHINode>(RVI); - for (Value *IncValue : PN->incoming_values()) - FlowsToReturn.insert(IncValue); - continue; - } + // Extend the analysis by looking upwards. 
+ case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::AddrSpaceCast: + FlowsToReturn.insert(RVI->getOperand(0)); + continue; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(RVI); + FlowsToReturn.insert(SI->getTrueValue()); + FlowsToReturn.insert(SI->getFalseValue()); + continue; + } + case Instruction::PHI: { + PHINode *PN = cast<PHINode>(RVI); + for (Value *IncValue : PN->incoming_values()) + FlowsToReturn.insert(IncValue); + continue; + } - // Check whether the pointer came from an allocation. - case Instruction::Alloca: + // Check whether the pointer came from an allocation. + case Instruction::Alloca: + break; + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(RVI); + if (CS.paramHasAttr(0, Attribute::NoAlias)) + break; + if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) break; - case Instruction::Call: - case Instruction::Invoke: { - CallSite CS(RVI); - if (CS.paramHasAttr(0, Attribute::NoAlias)) - break; - if (CS.getCalledFunction() && - SCCNodes.count(CS.getCalledFunction())) - break; - } // fall-through - default: - return false; // Did not come from an allocation. + } // fall-through + default: + return false; // Did not come from an allocation. } if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false)) @@ -786,24 +774,11 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, return true; } -/// AddNoAliasAttrs - Deduce noalias attributes for the SCC. -bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { - SmallPtrSet<Function*, 8> SCCNodes; - - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) - SCCNodes.insert((*I)->getFunction()); - +/// Deduce noalias attributes for the SCC. +static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { // Check each function in turn, determining which functions return noalias // pointers. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or node we don't want to optimize - skip it; - return false; - + for (Function *F : SCCNodes) { // Already noalias. if (F->doesNotAlias(0)) continue; @@ -813,18 +788,17 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { if (F->isDeclaration() || F->mayBeOverridden()) return false; - // We annotate noalias return values, which are only applicable to + // We annotate noalias return values, which are only applicable to // pointer types. if (!F->getReturnType()->isPointerTy()) continue; - if (!IsFunctionMallocLike(F, SCCNodes)) + if (!isFunctionMallocLike(F, SCCNodes)) return false; } bool MadeChange = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + for (Function *F : SCCNodes) { if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy()) continue; @@ -836,880 +810,249 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { return MadeChange; } -/// inferPrototypeAttributes - Analyze the name and prototype of the -/// given function and set any applicable attributes. Returns true -/// if any attributes were set and false otherwise. -bool FunctionAttrs::inferPrototypeAttributes(Function &F) { - if (F.hasFnAttribute(Attribute::OptimizeNone)) - return false; +/// Tests whether this function is known to not return null. 
+/// +/// Requires that the function returns a pointer. +/// +/// Returns true if it believes the function will not return a null, and sets +/// \p Speculative based on whether the returned conclusion is a speculative +/// conclusion due to SCC calls. +static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, + const TargetLibraryInfo &TLI, bool &Speculative) { + assert(F->getReturnType()->isPointerTy() && + "nonnull only meaningful on pointer types"); + Speculative = false; - FunctionType *FTy = F.getFunctionType(); - LibFunc::Func TheLibFunc; - if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc))) - return false; + SmallSetVector<Value *, 8> FlowsToReturn; + for (BasicBlock &BB : *F) + if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) + FlowsToReturn.insert(Ret->getReturnValue()); - switch (TheLibFunc) { - case LibFunc::strlen: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::strchr: - case LibFunc::strrchr: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isIntegerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - break; - case LibFunc::strtol: - case LibFunc::strtod: - case LibFunc::strtof: - case LibFunc::strtoul: - case LibFunc::strtoll: - case LibFunc::strtold: - case LibFunc::strtoull: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::strcpy: - case LibFunc::stpcpy: - case LibFunc::strcat: - case LibFunc::strncat: - case LibFunc::strncpy: - case LibFunc::stpncpy: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::strxfrm: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::strcmp: //0,1 - case LibFunc::strspn: // 0,1 - case LibFunc::strncmp: // 0,1 - case LibFunc::strcspn: //0,1 - case LibFunc::strcoll: //0,1 - case LibFunc::strcasecmp: // 0,1 - case LibFunc::strncasecmp: // - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::strstr: - case LibFunc::strpbrk: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::strtok: - case LibFunc::strtok_r: - if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::scanf: - if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::setbuf: - case LibFunc::setvbuf: - if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::strdup: - case 
LibFunc::strndup: - if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::stat: - case LibFunc::statvfs: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::sscanf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::sprintf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::snprintf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 3); - break; - case LibFunc::setitimer: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::system: - if (FTy->getNumParams() != 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - // May throw; "system" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::malloc: - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::memcmp: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::memchr: - case LibFunc::memrchr: - if (FTy->getNumParams() != 3) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - break; - case LibFunc::modf: - case LibFunc::modff: - case LibFunc::modfl: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::memcpy: - case LibFunc::memccpy: - case LibFunc::memmove: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::memalign: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotAlias(F, 0); - break; - case LibFunc::mkdir: - if (FTy->getNumParams() == 0 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::mktime: - if (FTy->getNumParams() == 0 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::realloc: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - break; - case LibFunc::read: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "read" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - break; - case LibFunc::rewind: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::rmdir: - case LibFunc::remove: - case LibFunc::realpath: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::rename: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::readlink: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::write: - if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "write" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::bcopy: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::bcmp: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::bzero: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::calloc: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::chmod: - case LibFunc::chown: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::ctermid: - case LibFunc::clearerr: - case LibFunc::closedir: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::atoi: - case LibFunc::atol: - case LibFunc::atof: - case LibFunc::atoll: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::access: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::fopen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fdopen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::feof: - case LibFunc::free: - case LibFunc::fseek: - case LibFunc::ftell: - case LibFunc::fgetc: - case LibFunc::fseeko: - case LibFunc::ftello: - case LibFunc::fileno: - case LibFunc::fflush: - case LibFunc::fclose: - case LibFunc::fsetpos: - case LibFunc::flockfile: - case LibFunc::funlockfile: - case LibFunc::ftrylockfile: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::ferror: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F); - break; - case LibFunc::fputc: - case LibFunc::fstat: - case LibFunc::frexp: - case LibFunc::frexpf: - case LibFunc::frexpl: - case LibFunc::fstatvfs: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::fgets: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - 
!FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 3); - break; - case LibFunc::fread: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(3)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 4); - break; - case LibFunc::fwrite: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(3)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 4); - break; - case LibFunc::fputs: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::fscanf: - case LibFunc::fprintf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fgetpos: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::getc: - case LibFunc::getlogin_r: - case LibFunc::getc_unlocked: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::getenv: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::gets: - case LibFunc::getchar: - setDoesNotThrow(F); - break; - case LibFunc::getitimer: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::getpwnam: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::ungetc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::uname: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::unlink: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::unsetenv: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::utime: - case LibFunc::utimes: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::putc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::puts: - case LibFunc::printf: - case 
LibFunc::perror: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::pread: - if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "pread" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - break; - case LibFunc::pwrite: - if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "pwrite" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::putchar: - setDoesNotThrow(F); - break; - case LibFunc::popen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::pclose: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::vscanf: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::vsscanf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::vfscanf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::valloc: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::vprintf: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::vfprintf: - case LibFunc::vsprintf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::vsnprintf: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 3); - break; - case LibFunc::open: - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - // May throw; "open" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::opendir: - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::tmpfile: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::times: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::htonl: - case LibFunc::htons: - case LibFunc::ntohl: - case LibFunc::ntohs: - setDoesNotThrow(F); - setDoesNotAccessMemory(F); - break; - case LibFunc::lstat: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::lchown: - if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::qsort: - if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) - return false; - // May throw; places call through function pointer. - setDoesNotCapture(F, 4); - break; - case LibFunc::dunder_strdup: - case LibFunc::dunder_strndup: - if (FTy->getNumParams() < 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::dunder_strtok_r: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::under_IO_getc: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::under_IO_putc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::dunder_isoc99_scanf: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::stat64: - case LibFunc::lstat64: - case LibFunc::statvfs64: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::dunder_isoc99_sscanf: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fopen64: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case 
LibFunc::fseeko64: - case LibFunc::ftello64: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::tmpfile64: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::fstat64: - case LibFunc::fstatvfs64: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::open64: - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + for (unsigned i = 0; i != FlowsToReturn.size(); ++i) { + Value *RetVal = FlowsToReturn[i]; + + // If this value is locally known to be non-null, we're good + if (isKnownNonNull(RetVal, &TLI)) + continue; + + // Otherwise, we need to look upwards since we can't make any local + // conclusions. + Instruction *RVI = dyn_cast<Instruction>(RetVal); + if (!RVI) return false; - // May throw; "open" is a valid pthread cancellation point. - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::gettimeofday: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) + switch (RVI->getOpcode()) { + // Extend the analysis by looking upwards. + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::AddrSpaceCast: + FlowsToReturn.insert(RVI->getOperand(0)); + continue; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(RVI); + FlowsToReturn.insert(SI->getTrueValue()); + FlowsToReturn.insert(SI->getFalseValue()); + continue; + } + case Instruction::PHI: { + PHINode *PN = cast<PHINode>(RVI); + for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + FlowsToReturn.insert(PN->getIncomingValue(i)); + continue; + } + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(RVI); + Function *Callee = CS.getCalledFunction(); + // A call to a node within the SCC is assumed to return null until + // proven otherwise + if (Callee && SCCNodes.count(Callee)) { + Speculative = true; + continue; + } return false; - // Currently some platforms have the restrict keyword on the arguments to - // gettimeofday. To be conservative, do not add noalias to gettimeofday's - // arguments. - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - default: - // Didn't mark any attributes. - return false; + } + default: + return false; // Unknown source, may be null + }; + llvm_unreachable("should have either continued or returned"); } return true; } -/// annotateLibraryCalls - Adds attributes to well-known standard library -/// call declarations. -bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { +/// Deduce nonnull attributes for the SCC. +static bool addNonNullAttrs(const SCCNodeSet &SCCNodes, + const TargetLibraryInfo &TLI) { + // Speculative that all functions in the SCC return only nonnull + // pointers. We may refute this as we analyze functions. + bool SCCReturnsNonNull = true; + bool MadeChange = false; - // Check each function in turn annotating well-known library function - // declarations with attributes. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + // Check each function in turn, determining which functions return nonnull + // pointers. + for (Function *F : SCCNodes) { + // Already nonnull. 
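isReturnNonNull and addNonNullAttrs use the optimistic SCC idiom: calls back into the SCC are assumed to return nonnull and merely flagged Speculative; non-speculative proofs are committed eagerly, and the speculative ones are committed only if no function in the SCC refutes the assumption. A standalone sketch of that commit step (Result, Marked, and the function name are invented):

    #include <cstddef>
    #include <vector>

    struct Result { bool Proven; bool Speculative; };

    bool commitSpeculativeFacts(const std::vector<Result> &PerFunction,
                                std::vector<bool> &Marked) {
      bool AllHold = true, Changed = false;
      for (std::size_t i = 0; i != PerFunction.size(); ++i) {
        if (PerFunction[i].Proven && !PerFunction[i].Speculative) {
          Marked[i] = true; // eagerly mark facts that stand on their own
          Changed = true;
        } else if (!PerFunction[i].Proven) {
          AllHold = false;  // one refutation sinks the whole SCC
        }
      }
      if (AllHold)          // speculation survived: commit the rest
        for (std::size_t i = 0; i != PerFunction.size(); ++i)
          if (!Marked[i]) {
            Marked[i] = true;
            Changed = true;
          }
      return Changed;
    }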
+ if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + Attribute::NonNull)) + continue; + + // Definitions with weak linkage may be overridden at linktime, so + // treat them like declarations. + if (F->isDeclaration() || F->mayBeOverridden()) + return false; + + // We annotate nonnull return values, which are only applicable to + // pointer types. + if (!F->getReturnType()->isPointerTy()) + continue; - if (F && F->isDeclaration()) - MadeChange |= inferPrototypeAttributes(*F); + bool Speculative = false; + if (isReturnNonNull(F, SCCNodes, TLI, Speculative)) { + if (!Speculative) { + // Mark the function eagerly since we may discover a function + // which prevents us from speculating about the entire SCC + DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n"); + F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + ++NumNonNullReturn; + MadeChange = true; + } + continue; + } + // At least one function returns something which could be null, can't + // speculate any more. + SCCReturnsNonNull = false; + } + + if (SCCReturnsNonNull) { + for (Function *F : SCCNodes) { + if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + Attribute::NonNull) || + !F->getReturnType()->isPointerTy()) + continue; + + DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n"); + F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + ++NumNonNullReturn; + MadeChange = true; + } } return MadeChange; } +static bool setDoesNotRecurse(Function &F) { + if (F.doesNotRecurse()) + return false; + F.setDoesNotRecurse(); + ++NumNoRecurse; + return true; +} + +static bool addNoRecurseAttrs(const CallGraphSCC &SCC, + SmallVectorImpl<WeakVH> &Revisit) { + // Try and identify functions that do not recurse. + + // If the SCC contains multiple nodes we know for sure there is recursion. + if (!SCC.isSingular()) + return false; + + const CallGraphNode *CGN = *SCC.begin(); + Function *F = CGN->getFunction(); + if (!F || F->isDeclaration() || F->doesNotRecurse()) + return false; + + // If all of the calls in F are identifiable and are to norecurse functions, F + // is norecurse. This check also detects self-recursion as F is not currently + // marked norecurse, so any called from F to F will not be marked norecurse. + if (std::all_of(CGN->begin(), CGN->end(), + [](const CallGraphNode::CallRecord &CR) { + Function *F = CR.second->getFunction(); + return F && F->doesNotRecurse(); + })) + // Function calls a potentially recursive function. + return setDoesNotRecurse(*F); + + // We know that F is not obviously recursive, but we haven't been able to + // prove that it doesn't actually recurse. Add it to the Revisit list to try + // again top-down later. + Revisit.push_back(F); + return false; +} + +static bool addNoRecurseAttrsTopDownOnly(Function *F) { + // If F is internal and all uses are in norecurse functions, then F is also + // norecurse. 
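addNoRecurseAttrs above handles only singleton SCCs (a multi-node SCC is recursive by construction) and relies on F not yet being marked norecurse, so a self-call fails the std::all_of test naturally. The same shape on a toy call graph (standalone sketch; Fn is a stand-in type):

    #include <algorithm>
    #include <vector>

    struct Fn {
      bool NoRecurse = false;
      std::vector<Fn *> Callees; // nullptr models an unidentifiable callee
    };

    bool deduceNoRecurse(Fn &F) {
      if (F.NoRecurse)
        return false; // already known
      // A self-call fails automatically: F is not yet marked norecurse.
      if (std::all_of(F.Callees.begin(), F.Callees.end(),
                      [](const Fn *C) { return C && C->NoRecurse; })) {
        F.NoRecurse = true;
        return true;
      }
      return false; // revisit later with the top-down rule
    }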
+ if (F->doesNotRecurse()) + return false; + if (F->hasInternalLinkage()) { + for (auto *U : F->users()) + if (auto *I = dyn_cast<Instruction>(U)) { + if (!I->getParent()->getParent()->doesNotRecurse()) + return false; + } else { + return false; + } + return setDoesNotRecurse(*F); + } + return false; +} + bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) { - AA = &getAnalysis<AliasAnalysis>(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + bool Changed = false; - bool Changed = annotateLibraryCalls(SCC); - Changed |= AddReadAttrs(SCC); - Changed |= AddArgumentAttrs(SCC); - Changed |= AddNoAliasAttrs(SCC); + // We compute dedicated AA results for each function in the SCC as needed. We + // use a lambda referencing external objects so that they live long enough to + // be queried, but we re-use them each time. + Optional<BasicAAResult> BAR; + Optional<AAResults> AAR; + auto AARGetter = [&](Function &F) -> AAResults & { + BAR.emplace(createLegacyPMBasicAAResult(*this, F)); + AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); + return *AAR; + }; + + // Fill SCCNodes with the elements of the SCC. Used for quickly looking up + // whether a given CallGraphNode is in this SCC. Also track whether there are + // any external or opt-none nodes that will prevent us from optimizing any + // part of the SCC. + SCCNodeSet SCCNodes; + bool ExternalNode = false; + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) { + // External node or function we're trying not to optimize - we both avoid + // transform them and avoid leveraging information they provide. + ExternalNode = true; + continue; + } + + SCCNodes.insert(F); + } + + Changed |= addReadAttrs(SCCNodes, AARGetter); + Changed |= addArgumentAttrs(SCCNodes); + + // If we have no external nodes participating in the SCC, we can deduce some + // more precise attributes as well. + if (!ExternalNode) { + Changed |= addNoAliasAttrs(SCCNodes); + Changed |= addNonNullAttrs(SCCNodes, *TLI); + } + + Changed |= addNoRecurseAttrs(SCC, Revisit); + return Changed; +} + +bool FunctionAttrs::doFinalization(CallGraph &CG) { + bool Changed = false; + // When iterating over SCCs we visit functions in a bottom-up fashion. Some of + // the rules we have for identifying norecurse functions work best with a + // top-down walk, so look again at all the functions we previously marked as + // worth revisiting, in top-down order. + for (auto &F : reverse(Revisit)) + if (F) + Changed |= addNoRecurseAttrsTopDownOnly(cast<Function>((Value*)F)); return Changed; } diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp new file mode 100644 index 000000000000..d8b677b966f2 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -0,0 +1,433 @@ +//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements Function import based on summaries. 
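The AARGetter lambda introduced above keeps its BasicAAResult and AAResults alive in Optionals owned by the enclosing frame, so the reference it returns stays valid after the lambda returns, while each call re-emplaces fresh results for the next function. The lifetime trick in miniature, using std::optional in place of llvm::Optional (AnalysisResult is invented):

    #include <iostream>
    #include <optional>
    #include <string>

    struct AnalysisResult { std::string ForFunction; };

    int main() {
      std::optional<AnalysisResult> Cached; // outlives every lambda call
      auto Getter = [&](const std::string &F) -> AnalysisResult & {
        Cached.emplace(AnalysisResult{F}); // destroy old, build in place
        return *Cached;                    // reference into stable storage
      };
      AnalysisResult &R1 = Getter("f");
      std::cout << R1.ForFunction << '\n'; // "f"
      AnalysisResult &R2 = Getter("g");
      std::cout << R2.ForFunction << '\n'; // "g"; R1 now views the
                                           // re-emplaced value, by design
    }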
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/FunctionImport.h" + +#include "llvm/ADT/StringSet.h" +#include "llvm/IR/AutoUpgrade.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Object/FunctionIndexObjectFile.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/SourceMgr.h" + +#include <map> + +using namespace llvm; + +#define DEBUG_TYPE "function-import" + +/// Limit on instruction count of imported functions. +static cl::opt<unsigned> ImportInstrLimit( + "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"), + cl::desc("Only import functions with less than N instructions")); + +// Load lazily a module from \p FileName in \p Context. +static std::unique_ptr<Module> loadFile(const std::string &FileName, + LLVMContext &Context) { + SMDiagnostic Err; + DEBUG(dbgs() << "Loading '" << FileName << "'\n"); + std::unique_ptr<Module> Result = getLazyIRFileModule(FileName, Err, Context); + if (!Result) { + Err.print("function-import", errs()); + return nullptr; + } + + Result->materializeMetadata(); + UpgradeDebugInfo(*Result); + + return Result; +} + +namespace { +/// Helper to load on demand a Module from file and cache it for subsequent +/// queries. It can be used with the FunctionImporter. +class ModuleLazyLoaderCache { + /// Cache of lazily loaded module for import. + StringMap<std::unique_ptr<Module>> ModuleMap; + + /// Retrieve a Module from the cache or lazily load it on demand. + std::function<std::unique_ptr<Module>(StringRef FileName)> createLazyModule; + +public: + /// Create the loader, Module will be initialized in \p Context. + ModuleLazyLoaderCache(std::function< + std::unique_ptr<Module>(StringRef FileName)> createLazyModule) + : createLazyModule(createLazyModule) {} + + /// Retrieve a Module from the cache or lazily load it on demand. + Module &operator()(StringRef FileName); + + std::unique_ptr<Module> takeModule(StringRef FileName) { + auto I = ModuleMap.find(FileName); + assert(I != ModuleMap.end()); + std::unique_ptr<Module> Ret = std::move(I->second); + ModuleMap.erase(I); + return Ret; + } +}; + +// Get a Module for \p FileName from the cache, or load it lazily. +Module &ModuleLazyLoaderCache::operator()(StringRef Identifier) { + auto &Module = ModuleMap[Identifier]; + if (!Module) + Module = createLazyModule(Identifier); + return *Module; +} +} // anonymous namespace + +/// Walk through the instructions in \p F looking for external +/// calls not already in the \p CalledFunctions set. If any are +/// found they are added to the \p Worklist for importing. +static void findExternalCalls(const Module &DestModule, Function &F, + const FunctionInfoIndex &Index, + StringSet<> &CalledFunctions, + SmallVector<StringRef, 64> &Worklist) { + // We need to suffix internal function calls imported from other modules, + // prepare the suffix ahead of time. + std::string Suffix; + if (F.getParent() != &DestModule) + Suffix = + (Twine(".llvm.") + + Twine(Index.getModuleId(F.getParent()->getModuleIdentifier()))).str(); + + for (auto &BB : F) { + for (auto &I : BB) { + if (isa<CallInst>(I)) { + auto CalledFunction = cast<CallInst>(I).getCalledFunction(); + // Insert any new external calls that have not already been + // added to set/worklist. 
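ModuleLazyLoaderCache above wraps the lazy loader so each source module is materialized at most once and can later be handed off wholesale via takeModule(). The same load-once cache with std::map and std::function standing in for StringMap and getLazyIRFileModule (standalone sketch, invented types):

    #include <cassert>
    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    struct Module { std::string Name; };

    class LazyLoaderCache {
      std::map<std::string, std::unique_ptr<Module>> Cache;
      std::function<std::unique_ptr<Module>(const std::string &)> Load;

    public:
      explicit LazyLoaderCache(
          std::function<std::unique_ptr<Module>(const std::string &)> L)
          : Load(std::move(L)) {}

      // Loads at most once per file; later lookups hit the cache.
      Module &operator()(const std::string &File) {
        auto &Slot = Cache[File];
        if (!Slot)
          Slot = Load(File);
        return *Slot;
      }

      // Hands the module off (e.g. to a linker) and drops it from the cache.
      std::unique_ptr<Module> take(const std::string &File) {
        auto It = Cache.find(File);
        assert(It != Cache.end() && "taking a module that was never loaded");
        auto Ret = std::move(It->second);
        Cache.erase(It);
        return Ret;
      }
    };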
+ if (!CalledFunction || !CalledFunction->hasName()) + continue; + // Ignore intrinsics early + if (CalledFunction->isIntrinsic()) { + assert(CalledFunction->getIntrinsicID() != 0); + continue; + } + auto ImportedName = CalledFunction->getName(); + auto Renamed = (ImportedName + Suffix).str(); + // Rename internal functions + if (CalledFunction->hasInternalLinkage()) { + ImportedName = Renamed; + } + auto It = CalledFunctions.insert(ImportedName); + if (!It.second) { + // This is a call to a function we already considered, skip. + continue; + } + // Ignore functions already present in the destination module + auto *SrcGV = DestModule.getNamedValue(ImportedName); + if (SrcGV) { + assert(isa<Function>(SrcGV) && "Name collision during import"); + if (!cast<Function>(SrcGV)->isDeclaration()) { + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Ignoring " + << ImportedName << " already in DestinationModule\n"); + continue; + } + } + + Worklist.push_back(It.first->getKey()); + DEBUG(dbgs() << DestModule.getModuleIdentifier() + << ": Adding callee for : " << ImportedName << " : " + << F.getName() << "\n"); + } + } + } +} + +// Helper function: given a worklist and an index, will process all the worklist +// and decide what to import based on the summary information. +// +// Nothing is actually imported, functions are materialized in their source +// module and analyzed there. +// +// \p ModuleToFunctionsToImportMap is filled with the set of Function to import +// per Module. +static void GetImportList(Module &DestModule, + SmallVector<StringRef, 64> &Worklist, + StringSet<> &CalledFunctions, + std::map<StringRef, DenseSet<const GlobalValue *>> + &ModuleToFunctionsToImportMap, + const FunctionInfoIndex &Index, + ModuleLazyLoaderCache &ModuleLoaderCache) { + while (!Worklist.empty()) { + auto CalledFunctionName = Worklist.pop_back_val(); + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Process import for " + << CalledFunctionName << "\n"); + + // Try to get a summary for this function call. + auto InfoList = Index.findFunctionInfoList(CalledFunctionName); + if (InfoList == Index.end()) { + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": No summary for " + << CalledFunctionName << " Ignoring.\n"); + continue; + } + assert(!InfoList->second.empty() && "No summary, error at import?"); + + // Comdat can have multiple entries, FIXME: what do we do with them? + auto &Info = InfoList->second[0]; + assert(Info && "Nullptr in list, error importing summaries?\n"); + + auto *Summary = Info->functionSummary(); + if (!Summary) { + // FIXME: in case we are lazyloading summaries, we can do it now. + DEBUG(dbgs() << DestModule.getModuleIdentifier() + << ": Missing summary for " << CalledFunctionName + << ", error at import?\n"); + llvm_unreachable("Missing summary"); + } + + if (Summary->instCount() > ImportInstrLimit) { + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Skip import of " + << CalledFunctionName << " with " << Summary->instCount() + << " instructions (limit " << ImportInstrLimit << ")\n"); + continue; + } + + // Get the module path from the summary. + auto ModuleIdentifier = Summary->modulePath(); + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Importing " + << CalledFunctionName << " from " << ModuleIdentifier << "\n"); + + auto &SrcModule = ModuleLoaderCache(ModuleIdentifier); + + // The function that we will import! 
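findExternalCalls and GetImportList together form a classic seen-set worklist: a callee name is enqueued only on first insertion into CalledFunctions, and importing a function may surface new callees, so the list grows while it is being drained. The control shape, independent of any LLVM types (names invented):

    #include <functional>
    #include <set>
    #include <string>
    #include <vector>

    void discoverCallees(
        const std::vector<std::string> &Roots,
        const std::function<std::vector<std::string>(const std::string &)>
            &CalleesOf) {
      std::set<std::string> Seen;
      std::vector<std::string> Worklist;
      for (const auto &R : Roots)
        for (const auto &C : CalleesOf(R))
          if (Seen.insert(C).second)
            Worklist.push_back(C);
      while (!Worklist.empty()) {
        std::string F = Worklist.back();
        Worklist.pop_back();
        // ...decide here whether to import F (summary checks, size limit)...
        for (const auto &C : CalleesOf(F))
          if (Seen.insert(C).second) // set membership prevents reprocessing
            Worklist.push_back(C);
      }
    }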
+ GlobalValue *SGV = SrcModule.getNamedValue(CalledFunctionName); + + if (!SGV) { + // The destination module is referencing function using their renamed name + // when importing a function that was originally local in the source + // module. The source module we have might not have been renamed so we try + // to remove the suffix added during the renaming to recover the original + // name in the source module. + std::pair<StringRef, StringRef> Split = + CalledFunctionName.split(".llvm."); + SGV = SrcModule.getNamedValue(Split.first); + assert(SGV && "Can't find function to import in source module"); + } + if (!SGV) { + report_fatal_error(Twine("Can't load function '") + CalledFunctionName + + "' in Module '" + SrcModule.getModuleIdentifier() + + "', error in the summary?\n"); + } + + Function *F = dyn_cast<Function>(SGV); + if (!F && isa<GlobalAlias>(SGV)) { + auto *SGA = dyn_cast<GlobalAlias>(SGV); + F = dyn_cast<Function>(SGA->getBaseObject()); + CalledFunctionName = F->getName(); + } + assert(F && "Imported Function is ... not a Function"); + + // We cannot import weak_any functions/aliases without possibly affecting + // the order they are seen and selected by the linker, changing program + // semantics. + if (SGV->hasWeakAnyLinkage()) { + DEBUG(dbgs() << DestModule.getModuleIdentifier() + << ": Ignoring import request for weak-any " + << (isa<Function>(SGV) ? "function " : "alias ") + << CalledFunctionName << " from " + << SrcModule.getModuleIdentifier() << "\n"); + continue; + } + + // Add the function to the import list + auto &Entry = ModuleToFunctionsToImportMap[SrcModule.getModuleIdentifier()]; + Entry.insert(F); + + // Process the newly imported functions and add callees to the worklist. + F->materialize(); + findExternalCalls(DestModule, *F, Index, CalledFunctions, Worklist); + } +} + +// Automatically import functions in Module \p DestModule based on the summaries +// index. +// +// The current implementation imports every called functions that exists in the +// summaries index. +bool FunctionImporter::importFunctions(Module &DestModule) { + DEBUG(dbgs() << "Starting import for Module " + << DestModule.getModuleIdentifier() << "\n"); + unsigned ImportedCount = 0; + + /// First step is collecting the called external functions. + StringSet<> CalledFunctions; + SmallVector<StringRef, 64> Worklist; + for (auto &F : DestModule) { + if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone)) + continue; + findExternalCalls(DestModule, F, Index, CalledFunctions, Worklist); + } + if (Worklist.empty()) + return false; + + /// Second step: for every call to an external function, try to import it. 
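When a local function was renamed during import, the summary refers to it by its promoted name, so the lookup above falls back to splitting off the ".llvm." suffix to recover the name used in the unrenamed source module. A standalone sketch of that fallback (originalName is an invented helper):

    #include <iostream>
    #include <string>

    std::string originalName(const std::string &Imported) {
      // "foo.llvm.1234" -> "foo"; names without the marker pass through.
      auto Pos = Imported.find(".llvm.");
      return Pos == std::string::npos ? Imported : Imported.substr(0, Pos);
    }

    int main() {
      std::cout << originalName("foo.llvm.42") << '\n'; // foo
      std::cout << originalName("bar") << '\n';         // bar
    }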
+
+ // Linker that will be used for importing functions
+ Linker TheLinker(DestModule);
+
+ // Map of Module -> List of Functions to import from the Module
+ std::map<StringRef, DenseSet<const GlobalValue *>>
+ ModuleToFunctionsToImportMap;
+
+ // Analyze the summaries and get the list of functions to import by
+ // populating ModuleToFunctionsToImportMap
+ ModuleLazyLoaderCache ModuleLoaderCache(ModuleLoader);
+ GetImportList(DestModule, Worklist, CalledFunctions,
+ ModuleToFunctionsToImportMap, Index, ModuleLoaderCache);
+ assert(Worklist.empty() && "Worklist hasn't been flushed in GetImportList");
+
+ StringMap<std::unique_ptr<DenseMap<unsigned, MDNode *>>>
+ ModuleToTempMDValsMap;
+
+ // Do the actual import of functions now, one Module at a time
+ for (auto &FunctionsToImportPerModule : ModuleToFunctionsToImportMap) {
+ // Get the module for the import
+ auto &FunctionsToImport = FunctionsToImportPerModule.second;
+ std::unique_ptr<Module> SrcModule =
+ ModuleLoaderCache.takeModule(FunctionsToImportPerModule.first);
+ assert(&DestModule.getContext() == &SrcModule->getContext() &&
+ "Context mismatch");
+
+ // Save the mapping of value ids to temporary metadata created when
+ // importing this function. If we have already imported from this module,
+ // add new temporary metadata to the existing mapping.
+ auto &TempMDVals = ModuleToTempMDValsMap[SrcModule->getModuleIdentifier()];
+ if (!TempMDVals)
+ TempMDVals = llvm::make_unique<DenseMap<unsigned, MDNode *>>();
+
+ // Link in the specified functions.
+ if (TheLinker.linkInModule(std::move(SrcModule), Linker::Flags::None,
+ &Index, &FunctionsToImport, TempMDVals.get()))
+ report_fatal_error("Function Import: link error");
+
+ ImportedCount += FunctionsToImport.size();
+ }
+
+ // Now link in metadata for all modules from which we imported functions.
+ for (StringMapEntry<std::unique_ptr<DenseMap<unsigned, MDNode *>>> &SME :
+ ModuleToTempMDValsMap) {
+ // Load the specified source module.
+ auto &SrcModule = ModuleLoaderCache(SME.getKey());
+
+ // Link in all necessary metadata from this module.
+ if (TheLinker.linkInMetadata(SrcModule, SME.getValue().get()))
+ return false;
+ }
+
+ DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ return ImportedCount;
+}
+
+/// Summary file to use for function importing when using -function-import from
+/// the command line.
+static cl::opt<std::string>
+ SummaryFile("summary-file",
+ cl::desc("The summary file to use for function importing."));
+
+static void diagnosticHandler(const DiagnosticInfo &DI) {
+ raw_ostream &OS = errs();
+ DiagnosticPrinterRawOStream DP(OS);
+ DI.print(DP);
+ OS << '\n';
+}
+
+/// Parse the function index out of an IR file and return the function
+/// index object if found, or nullptr if not.
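As a usage illustration only: a client could schedule the pass defined in the next chunk through the legacy pass manager roughly like this. The sketch assumes the createFunctionImportPass declaration is visible (the patch does not show which header declares it); "dest.ll" is a placeholder input, and passing a null index defers to the -summary-file option handled in runOnModule:

  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IRReader/IRReader.h"
  #include "llvm/Support/SourceMgr.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    SMDiagnostic Err;
    // Placeholder: the module that functions will be imported into.
    std::unique_ptr<Module> M = parseIRFile("dest.ll", Err, Ctx);
    if (!M)
      return 1;

    legacy::PassManager PM;
    // Null index: the pass reads the file named by -summary-file instead.
    PM.add(createFunctionImportPass(/*Index=*/nullptr));
    PM.run(*M);
    return 0;
  }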
+static std::unique_ptr<FunctionInfoIndex>
+getFunctionIndexForFile(StringRef Path, std::string &Error,
+ DiagnosticHandlerFunction DiagnosticHandler) {
+ std::unique_ptr<MemoryBuffer> Buffer;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getFile(Path);
+ if (std::error_code EC = BufferOrErr.getError()) {
+ Error = EC.message();
+ return nullptr;
+ }
+ Buffer = std::move(BufferOrErr.get());
+ ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr =
+ object::FunctionIndexObjectFile::create(Buffer->getMemBufferRef(),
+ DiagnosticHandler);
+ if (std::error_code EC = ObjOrErr.getError()) {
+ Error = EC.message();
+ return nullptr;
+ }
+ return (*ObjOrErr)->takeIndex();
+}
+
+namespace {
+/// Pass that performs cross-module function import provided a summary file.
+class FunctionImportPass : public ModulePass {
+ /// Optional function summary index to use for importing, otherwise
+ /// the summary-file option must be specified.
+ const FunctionInfoIndex *Index;
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ const char *getPassName() const override {
+ return "Function Importing";
+ }
+
+ explicit FunctionImportPass(const FunctionInfoIndex *Index = nullptr)
+ : ModulePass(ID), Index(Index) {}
+
+ bool runOnModule(Module &M) override {
+ if (SummaryFile.empty() && !Index)
+ report_fatal_error("error: -function-import requires -summary-file or "
+ "file from frontend\n");
+ std::unique_ptr<FunctionInfoIndex> IndexPtr;
+ if (!SummaryFile.empty()) {
+ if (Index)
+ report_fatal_error("error: -summary-file and index from frontend\n");
+ std::string Error;
+ IndexPtr = getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler);
+ if (!IndexPtr) {
+ errs() << "Error loading file '" << SummaryFile << "': " << Error
+ << "\n";
+ return false;
+ }
+ Index = IndexPtr.get();
+ }
+
+ // Perform the import now.
+ auto ModuleLoader = [&M](StringRef Identifier) {
+ return loadFile(Identifier, M.getContext());
+ };
+ FunctionImporter Importer(*Index, ModuleLoader);
+ return Importer.importFunctions(M);
+ }
+};
+} // anonymous namespace
+
+char FunctionImportPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FunctionImportPass, "function-import",
+ "Summary Based Function Import", false, false)
+INITIALIZE_PASS_END(FunctionImportPass, "function-import",
+ "Summary Based Function Import", false, false)
+
+namespace llvm {
+Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr) {
+ return new FunctionImportPass(Index);
+}
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 61d0ff94a343..9b276ed28e2e 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -92,33 +92,28 @@ bool GlobalDCE::runOnModule(Module &M) {
 ComdatMembers.insert(std::make_pair(C, &GA));
 
 // Loop over the module, adding globals which are obviously necessary.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (Function &F : M) { + Changed |= RemoveUnusedGlobalValue(F); // Functions with external linkage are needed if they have a body - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { - if (!I->isDiscardableIfUnused()) - GlobalIsNeeded(I); - } + if (!F.isDeclaration() && !F.hasAvailableExternallyLinkage()) + if (!F.isDiscardableIfUnused()) + GlobalIsNeeded(&F); } - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (GlobalVariable &GV : M.globals()) { + Changed |= RemoveUnusedGlobalValue(GV); // Externally visible & appending globals are needed, if they have an // initializer. - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { - if (!I->isDiscardableIfUnused()) - GlobalIsNeeded(I); - } + if (!GV.isDeclaration() && !GV.hasAvailableExternallyLinkage()) + if (!GV.isDiscardableIfUnused()) + GlobalIsNeeded(&GV); } - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (GlobalAlias &GA : M.aliases()) { + Changed |= RemoveUnusedGlobalValue(GA); // Externally visible aliases are needed. - if (!I->isDiscardableIfUnused()) { - GlobalIsNeeded(I); - } + if (!GA.isDiscardableIfUnused()) + GlobalIsNeeded(&GA); } // Now that all globals which are needed are in the AliveGlobals set, we loop @@ -126,52 +121,50 @@ bool GlobalDCE::runOnModule(Module &M) { // // The first pass is to drop initializers of global variables which are dead. - std::vector<GlobalVariable*> DeadGlobalVars; // Keep track of dead globals - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - if (!AliveGlobals.count(I)) { - DeadGlobalVars.push_back(I); // Keep track of dead globals - if (I->hasInitializer()) { - Constant *Init = I->getInitializer(); - I->setInitializer(nullptr); + std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals + for (GlobalVariable &GV : M.globals()) + if (!AliveGlobals.count(&GV)) { + DeadGlobalVars.push_back(&GV); // Keep track of dead globals + if (GV.hasInitializer()) { + Constant *Init = GV.getInitializer(); + GV.setInitializer(nullptr); if (isSafeToDestroyConstant(Init)) Init->destroyConstant(); } } // The second pass drops the bodies of functions which are dead... - std::vector<Function*> DeadFunctions; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (!AliveGlobals.count(I)) { - DeadFunctions.push_back(I); // Keep track of dead globals - if (!I->isDeclaration()) - I->deleteBody(); + std::vector<Function *> DeadFunctions; + for (Function &F : M) + if (!AliveGlobals.count(&F)) { + DeadFunctions.push_back(&F); // Keep track of dead globals + if (!F.isDeclaration()) + F.deleteBody(); } // The third pass drops targets of aliases which are dead... std::vector<GlobalAlias*> DeadAliases; - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; - ++I) - if (!AliveGlobals.count(I)) { - DeadAliases.push_back(I); - I->setAliasee(nullptr); + for (GlobalAlias &GA : M.aliases()) + if (!AliveGlobals.count(&GA)) { + DeadAliases.push_back(&GA); + GA.setAliasee(nullptr); } if (!DeadFunctions.empty()) { // Now that all interferences have been dropped, delete the actual objects // themselves. 
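The control flow above is a classic mark-and-sweep over module globals: mark everything reachable from the required roots, then collect the rest; the deletion loops that follow erase the collected objects. A toy version of that shape for comparison (Node is a stand-in for GlobalValue, not LLVM API):

  #include <set>
  #include <vector>

  struct Node {
    bool Root = false;            // e.g. externally visible with a body
    std::vector<Node *> Operands; // what this value references
  };

  static void markNeeded(Node *N, std::set<Node *> &Alive) {
    if (!Alive.insert(N).second)
      return;                     // already visited
    for (Node *Op : N->Operands)  // anything a live value uses is live
      markNeeded(Op, Alive);
  }

  // Returns the dead nodes; mirrors how runOnModule first drops the
  // initializers/bodies of everything outside AliveGlobals, then erases them.
  std::vector<Node *> sweep(const std::vector<Node *> &All) {
    std::set<Node *> Alive;
    for (Node *N : All)
      if (N->Root)
        markNeeded(N, Alive);
    std::vector<Node *> Dead;
    for (Node *N : All)
      if (!Alive.count(N))
        Dead.push_back(N);
    return Dead;
  }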
- for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadFunctions[i]); - M.getFunctionList().erase(DeadFunctions[i]); + for (Function *F : DeadFunctions) { + RemoveUnusedGlobalValue(*F); + M.getFunctionList().erase(F); } NumFunctions += DeadFunctions.size(); Changed = true; } if (!DeadGlobalVars.empty()) { - for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadGlobalVars[i]); - M.getGlobalList().erase(DeadGlobalVars[i]); + for (GlobalVariable *GV : DeadGlobalVars) { + RemoveUnusedGlobalValue(*GV); + M.getGlobalList().erase(GV); } NumVariables += DeadGlobalVars.size(); Changed = true; @@ -179,9 +172,9 @@ bool GlobalDCE::runOnModule(Module &M) { // Now delete any dead aliases. if (!DeadAliases.empty()) { - for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadAliases[i]); - M.getAliasList().erase(DeadAliases[i]); + for (GlobalAlias *GA : DeadAliases) { + RemoveUnusedGlobalValue(*GA); + M.getAliasList().erase(GA); } NumAliases += DeadAliases.size(); Changed = true; @@ -222,21 +215,15 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) { // any globals used will be marked as needed. Function *F = cast<Function>(G); - if (F->hasPrefixData()) - MarkUsedGlobalsAsNeeded(F->getPrefixData()); - - if (F->hasPrologueData()) - MarkUsedGlobalsAsNeeded(F->getPrologueData()); + for (Use &U : F->operands()) + MarkUsedGlobalsAsNeeded(cast<Constant>(U.get())); - if (F->hasPersonalityFn()) - MarkUsedGlobalsAsNeeded(F->getPersonalityFn()); - - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U) - if (GlobalValue *GV = dyn_cast<GlobalValue>(*U)) + for (BasicBlock &BB : *F) + for (Instruction &I : BB) + for (Use &U : I.operands()) + if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) GlobalIsNeeded(GV); - else if (Constant *C = dyn_cast<Constant>(*U)) + else if (Constant *C = dyn_cast<Constant>(U)) MarkUsedGlobalsAsNeeded(C); } } @@ -247,9 +234,9 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) { // Loop over all of the operands of the constant, adding any globals they // use to the list of needed globals. - for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) { + for (Use &U : C->operands()) { // If we've already processed this constant there's no need to do it again. - Constant *Op = dyn_cast<Constant>(*I); + Constant *Op = dyn_cast<Constant>(U); if (Op && SeenConstants.insert(Op).second) MarkUsedGlobalsAsNeeded(Op); } @@ -262,7 +249,8 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) { // might make it deader. 
// bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) { - if (GV.use_empty()) return false; + if (GV.use_empty()) + return false; GV.removeDeadConstantUsers(); return GV.use_empty(); } diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 5ffe15dbd31d..fd7736905fe8 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -54,7 +55,6 @@ STATISTIC(NumSRA , "Number of aggregate globals broken into scalars"); STATISTIC(NumHeapSRA , "Number of heap objects SRA'd"); STATISTIC(NumSubstitute,"Number of globals with initializers stored into them"); STATISTIC(NumDeleted , "Number of globals deleted"); -STATISTIC(NumFnDeleted , "Number of functions deleted"); STATISTIC(NumGlobUses , "Number of global uses devirtualized"); STATISTIC(NumLocalized , "Number of globals localized"); STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans"); @@ -69,6 +69,7 @@ namespace { struct GlobalOpt : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { @@ -81,11 +82,14 @@ namespace { bool OptimizeFunctions(Module &M); bool OptimizeGlobalVars(Module &M); bool OptimizeGlobalAliases(Module &M); - bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI); - bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI, - const GlobalStatus &GS); + bool deleteIfDead(GlobalValue &GV); + bool processGlobal(GlobalValue &GV); + bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS); bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); + bool isPointerValueDeadOnEntryToFunction(const Function *F, + GlobalValue *GV); + TargetLibraryInfo *TLI; SmallSet<const Comdat *, 8> NotDiscardableComdats; }; @@ -95,13 +99,14 @@ char GlobalOpt::ID = 0; INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } -/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker -/// as a root? If so, we might not really want to eliminate the stores to it. +/// Is this global variable possibly used by a leak checker as a root? If so, +/// we might not really want to eliminate the stores to it. static bool isLeakCheckerRoot(GlobalVariable *GV) { // A global variable is a root if it is a pointer, or could plausibly contain // a pointer. There are two challenges; one is that we could have a struct @@ -176,10 +181,9 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { } while (1); } -/// CleanupPointerRootUsers - This GV is a pointer root. Loop over all users -/// of the global and clean up any that obviously don't assign the global a -/// value that isn't dynamically allocated. -/// +/// This GV is a pointer root. 
Loop over all users of the global and clean up +/// any that obviously don't assign the global a value that isn't dynamically +/// allocated. static bool CleanupPointerRootUsers(GlobalVariable *GV, const TargetLibraryInfo *TLI) { // A brief explanation of leak checkers. The goal is to find bugs where @@ -263,10 +267,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, return Changed; } -/// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all -/// users of the global, cleaning up the obvious ones. This is largely just a -/// quick scan over the use list to clean up the easy and obvious cruft. This -/// returns true if it made a change. +/// We just marked GV constant. Loop over all users of the global, cleaning up +/// the obvious ones. This is largely just a quick scan over the use list to +/// clean up the easy and obvious cruft. This returns true if it made a change. static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, const DataLayout &DL, TargetLibraryInfo *TLI) { @@ -353,8 +356,8 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, return Changed; } -/// isSafeSROAElementUse - Return true if the specified instruction is a safe -/// user of a derived expression from a global that we want to SROA. +/// Return true if the specified instruction is a safe user of a derived +/// expression from a global that we want to SROA. static bool isSafeSROAElementUse(Value *V) { // We might have a dead and dangling constant hanging off of here. if (Constant *C = dyn_cast<Constant>(V)) @@ -385,9 +388,8 @@ static bool isSafeSROAElementUse(Value *V) { } -/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value. -/// Look at it and its uses and decide whether it is safe to SROA this global. -/// +/// U is a direct user of the specified global value. Look at it and its uses +/// and decide whether it is safe to SROA this global. static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { // The user of the global must be a GEP Inst or a ConstantExpr GEP. if (!isa<GetElementPtrInst>(U) && @@ -452,9 +454,8 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { return true; } -/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it -/// is safe for us to perform this transformation. -/// +/// Look at all uses of the global and decide whether it is safe for us to +/// perform this transformation. static bool GlobalUsersSafeToSRA(GlobalValue *GV) { for (User *U : GV->users()) if (!IsUserOfGlobalSafeForSRA(U, GV)) @@ -464,10 +465,10 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { } -/// SRAGlobal - Perform scalar replacement of aggregates on the specified global -/// variable. This opens the door for other optimizations by exposing the -/// behavior of the program in a more fine-grained way. We have determined that -/// this transformation is safe already. We return the first global variable we +/// Perform scalar replacement of aggregates on the specified global variable. +/// This opens the door for other optimizations by exposing the behavior of the +/// program in a more fine-grained way. We have determined that this +/// transformation is safe already. We return the first global variable we /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. 
@@ -497,7 +498,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
 In, GV->getName()+"."+Twine(i),
 GV->getThreadLocalMode(),
 GV->getType()->getAddressSpace());
- Globals.insert(GV, NGV);
+ NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ Globals.push_back(NGV);
 NewGlobals.push_back(NGV);
 
 // Calculate the known alignment of the field. If the original aggregate
@@ -530,7 +532,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
 In, GV->getName()+"."+Twine(i),
 GV->getThreadLocalMode(),
 GV->getType()->getAddressSpace());
- Globals.insert(GV, NGV);
+ NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ Globals.push_back(NGV);
 NewGlobals.push_back(NGV);
 
 // Calculate the known alignment of the field. If the original aggregate
@@ -545,7 +548,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
 if (NewGlobals.empty())
 return nullptr;
 
- DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV);
+ DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
 
 Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
 
@@ -610,9 +613,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
 return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr;
 }
 
-/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified
-/// value will trap if the value is dynamically null. PHIs keeps track of any
-/// phi nodes we've seen to avoid reprocessing them.
+/// Return true if all users of the specified value will trap if the value is
+/// dynamically null. PHIs keeps track of any phi nodes we've seen to avoid
+/// reprocessing them.
 static bool AllUsesOfValueWillTrapIfNull(const Value *V,
 SmallPtrSetImpl<const PHINode*> &PHIs) {
 for (const User *U : V->users())
@@ -653,9 +656,9 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
 return true;
 }
 
-/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads
-/// from GV will trap if the loaded value is null. Note that this also permits
-/// comparisons of the loaded value against null, as a special case.
+/// Return true if all uses of any loads from GV will trap if the loaded value
+/// is null. Note that this also permits comparisons of the loaded value
+/// against null, as a special case.
 static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
 for (const User *U : GV->users())
 if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
@@ -735,10 +738,10 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
 }
 
-/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null
-/// value stored into it. If there are uses of the loaded value that would trap
-/// if the loaded value is dynamically null, then we know that they cannot be
-/// reachable with a null optimize away the load.
+/// The specified global has only one non-null value stored into it. If there
+/// are uses of the loaded value that would trap if the loaded value is
+/// dynamically null, then we know that they cannot be reachable with a null
+/// value, so we can optimize away the load.
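Before the implementation that follows, a hypothetical source-level illustration of that situation (illustrative only, not from the patch): the stored-once value below is a known non-null address, and the only load feeds a dereference that would trap on null, so the load's result can be forwarded.

  static int X;
  static int *G;            // internal, stored-once pointer global

  void init() { G = &X; }   // the single stored value, &X, is non-null
  int use() { return *G; }  // *G would trap if the load produced null, so
                            // GlobalOpt may fold the load to &X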
static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
 const DataLayout &DL,
 TargetLibraryInfo *TLI) {
@@ -778,7 +781,7 @@
 }
 
 if (Changed) {
- DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV);
+ DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV << "\n");
 ++NumGlobUses;
 }
 
@@ -801,8 +804,8 @@
 return Changed;
 }
 
-/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
-/// instructions that are foldable.
+/// Walk the use list of V, constant folding all of the instructions that are
+/// foldable.
 static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
 TargetLibraryInfo *TLI) {
 for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
@@ -818,11 +821,11 @@
 }
 }
 
-/// OptimizeGlobalAddressOfMalloc - This function takes the specified global
-/// variable, and transforms the program as if it always contained the result of
-/// the specified malloc. Because it is always the result of the specified
-/// malloc, there is no reason to actually DO the malloc. Instead, turn the
-/// malloc into a global, and any loads of GV as uses of the new global.
+/// This function takes the specified global variable, and transforms the
+/// program as if it always contained the result of the specified malloc.
+/// Because it is always the result of the specified malloc, there is no reason
+/// to actually DO the malloc. Instead, turn the malloc into a global, and
+/// rewrite any loads of GV as uses of the new global.
 static GlobalVariable *
 OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
 ConstantInt *NElements, const DataLayout &DL,
@@ -838,13 +841,10 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
 
 // Create the new global variable. The contents of the malloc'd memory is
 // undefined, so initialize with an undef value.
- GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(),
- GlobalType, false,
- GlobalValue::InternalLinkage,
- UndefValue::get(GlobalType),
- GV->getName()+".body",
- GV,
- GV->getThreadLocalMode());
+ GlobalVariable *NewGV = new GlobalVariable(
+ *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
+ UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
+ GV->getThreadLocalMode());
 
 // If there are bitcast users of the malloc (which is typical, usually we have
 // a malloc + bitcast) then replace them with uses of the new global. Update
@@ -935,7 +935,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
 cast<StoreInst>(InitBool->user_back())->eraseFromParent();
 delete InitBool;
 } else
- GV->getParent()->getGlobalList().insert(GV, InitBool);
+ GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
 
 // Now the GV is dead, nuke it and the malloc..
 GV->eraseFromParent();
@@ -951,10 +951,9 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
 return NewGV;
 }
 
-/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking
-/// to make sure that there are no complex uses of V. We permit simple things
-/// like dereferencing the pointer, but not storing through the address, unless
-/// it is to the specified global.
+/// Scan the use-list of V checking to make sure that there are no complex uses
+/// of V. We permit simple things like dereferencing the pointer, but not
+/// storing through the address, unless it is to the specified global.
 static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
 const GlobalVariable *GV,
 SmallPtrSetImpl<const PHINode*> &PHIs) {
@@ -998,10 +997,9 @@ static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
 return true;
 }
 
-/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV
-/// somewhere. Transform all uses of the allocation into loads from the
-/// global and uses of the resultant pointer. Further, delete the store into
-/// GV. This assumes that these value pass the
+/// The Alloc pointer is stored into GV somewhere. Transform all uses of the
+/// allocation into loads from the global and uses of the resultant pointer.
+/// Further, delete the store into GV. This assumes that these values pass the
 /// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
 static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
 GlobalVariable *GV) {
@@ -1043,9 +1041,9 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
 }
 }
 
-/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi
-/// of a load) are simple enough to perform heap SRA on. This permits GEP's
-/// that index through the array and struct field, icmps of null, and PHIs.
+/// Verify that all uses of V (a load, or a phi of a load) are simple enough to
+/// perform heap SRA on. This permits GEP's that index through the array and
+/// struct field, icmps of null, and PHIs.
 static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
 SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
 SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
@@ -1096,8 +1094,8 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
 }
 
-/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from
-/// GV are simple enough to perform HeapSRA, return true.
+/// If all users of values loaded from GV are simple enough to perform HeapSRA,
+/// return true.
 static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
 Instruction *StoredVal) {
 SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
@@ -1186,8 +1184,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
 return FieldVals[FieldNo] = Result;
 }
 
-/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from
-/// the load, rewrite the derived value to use the HeapSRoA'd load.
+/// Given a load instruction and a value derived from the load, rewrite the
+/// derived value to use the HeapSRoA'd load.
 static void RewriteHeapSROALoadUser(Instruction *LoadUser,
 DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
 std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
@@ -1248,10 +1246,9 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
 }
 }
 
-/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global. Ptr
-/// is a value loaded from the global. Eliminate all uses of Ptr, making them
-/// use FieldGlobals instead. All uses of loaded values satisfy
-/// AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+/// We are performing Heap SRoA on a global. Ptr is a value loaded from the
+/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead.
+/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
 DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
 std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
@@ -1266,8 +1263,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
 }
 }
 
-/// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break
-/// it up into multiple allocations of arrays of the fields.
+/// CI is an allocation of an array of structures. Break it up into multiple
+/// allocations of arrays of the fields.
 static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 Value *NElems, const DataLayout &DL,
 const TargetLibraryInfo *TLI) {
@@ -1291,12 +1288,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 Type *FieldTy = STy->getElementType(FieldNo);
 PointerType *PFieldTy = PointerType::get(FieldTy, AS);
 
- GlobalVariable *NGV =
- new GlobalVariable(*GV->getParent(),
- PFieldTy, false, GlobalValue::InternalLinkage,
- Constant::getNullValue(PFieldTy),
- GV->getName() + ".f" + Twine(FieldNo), GV,
- GV->getThreadLocalMode());
+ GlobalVariable *NGV = new GlobalVariable(
+ *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
+ nullptr, GV->getThreadLocalMode());
 FieldGlobals.push_back(NGV);
 
 unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
@@ -1336,7 +1331,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 
 // Split the basic block at the old malloc.
 BasicBlock *OrigBB = CI->getParent();
- BasicBlock *ContBB = OrigBB->splitBasicBlock(CI, "malloc_cont");
+ BasicBlock *ContBB =
+ OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");
 
 // Create the block to check the first condition. Put all these blocks at the
 // end of the function as they are unlikely to be executed.
@@ -1376,9 +1372,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 // CI is no longer needed, remove it.
 CI->eraseFromParent();
 
- /// InsertedScalarizedLoads - As we process loads, if we can't immediately
- /// update all uses of the load, keep track of what scalarized loads are
- /// inserted for a given load.
+ /// As we process loads, if we can't immediately update all uses of the load,
+ /// keep track of what scalarized loads are inserted for a given load.
 DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
 InsertedScalarizedValues[GV] = FieldGlobals;
 
@@ -1454,13 +1449,11 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 return cast<GlobalVariable>(FieldGlobals[0]);
 }
 
-/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a
-/// pointer global variable with a single value stored it that is a malloc or
-/// cast of malloc.
-static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
+/// This function is called when we see a pointer global variable with a single
+/// value stored into it that is a malloc or cast of malloc.
+static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
 Type *AllocTy,
 AtomicOrdering Ordering,
- Module::global_iterator &GVI,
 const DataLayout &DL,
 TargetLibraryInfo *TLI) {
 // If this is a malloc of an abstract type, don't touch it.
@@ -1499,7 +1492,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
 // (2048 bytes currently), as we don't want to introduce a 16M global or
 // something.
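For intuition, a hypothetical C-level pattern that the heap-SRoA path above targets (illustrative only): a stored-once global pointing at a malloc'd array of structs is split into one global pointer per field, each owning its own smaller allocation.

  #include <cstdlib>

  struct Pair { int A; int B; };
  static Pair *P;  // conceptually becomes two globals, P.f0 and P.f1

  void init(unsigned N) { P = (Pair *)malloc(N * sizeof(Pair)); }
  int get(unsigned I) { return P[I].A; }  // rewritten to index only the
                                          // A-field allocation (via P.f0)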
if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) { - GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); + OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); return true; } @@ -1544,19 +1537,18 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, CI = cast<CallInst>(Malloc); } - GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), - DL, TLI); + PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL, + TLI); return true; } return false; } -// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge -// that only one value (besides its initializer) is ever stored to the global. -static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, +// Try to optimize globals based on the knowledge that only one value (besides +// its initializer) is ever stored to the global. +static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, AtomicOrdering Ordering, - Module::global_iterator &GVI, const DataLayout &DL, TargetLibraryInfo *TLI) { // Ignore no-op GEPs and bitcasts. @@ -1577,9 +1569,8 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, return true; } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { Type *MallocType = getMallocAllocatedType(CI, TLI); - if (MallocType && - TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI, - DL, TLI)) + if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, + Ordering, DL, TLI)) return true; } } @@ -1587,10 +1578,10 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, return false; } -/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only -/// two values ever stored into GV are its initializer and OtherVal. See if we -/// can shrink the global into a boolean and select between the two values -/// whenever it is used. This exposes the values to other scalar optimizations. +/// At this point, we have learned that the only two values ever stored into GV +/// are its initializer and OtherVal. See if we can shrink the global into a +/// boolean and select between the two values whenever it is used. This exposes +/// the values to other scalar optimizations. static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { Type *GVElType = GV->getType()->getElementType(); @@ -1610,7 +1601,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { if (!isa<LoadInst>(U) && !isa<StoreInst>(U)) return false; - DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV); + DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n"); // Create the new global, initializing it to false. 
GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()), @@ -1620,7 +1611,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GV->getName()+".b", GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - GV->getParent()->getGlobalList().insert(GV, NewGV); + GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV); Constant *InitVal = GV->getInitializer(); assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) && @@ -1688,61 +1679,213 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { return true; } +bool GlobalOpt::deleteIfDead(GlobalValue &GV) { + GV.removeDeadConstantUsers(); -/// ProcessGlobal - Analyze the specified global variable and optimize it if -/// possible. If we make a change, return true. -bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, - Module::global_iterator &GVI) { - // Do more involved optimizations if the global is internal. - GV->removeDeadConstantUsers(); + if (!GV.isDiscardableIfUnused()) + return false; - if (GV->use_empty()) { - DEBUG(dbgs() << "GLOBAL DEAD: " << *GV); - GV->eraseFromParent(); - ++NumDeleted; - return true; - } + if (const Comdat *C = GV.getComdat()) + if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C)) + return false; - if (!GV->hasLocalLinkage()) + bool Dead; + if (auto *F = dyn_cast<Function>(&GV)) + Dead = F->isDefTriviallyDead(); + else + Dead = GV.use_empty(); + if (!Dead) + return false; + + DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n"); + GV.eraseFromParent(); + ++NumDeleted; + return true; +} + +/// Analyze the specified global variable and optimize it if possible. If we +/// make a change, return true. +bool GlobalOpt::processGlobal(GlobalValue &GV) { + // Do more involved optimizations if the global is internal. + if (!GV.hasLocalLinkage()) return false; GlobalStatus GS; - if (GlobalStatus::analyzeGlobal(GV, GS)) + if (GlobalStatus::analyzeGlobal(&GV, GS)) return false; - if (!GS.IsCompared && !GV->hasUnnamedAddr()) { - GV->setUnnamedAddr(true); + bool Changed = false; + if (!GS.IsCompared && !GV.hasUnnamedAddr()) { + GV.setUnnamedAddr(true); NumUnnamed++; + Changed = true; } - if (GV->isConstant() || !GV->hasInitializer()) + auto *GVar = dyn_cast<GlobalVariable>(&GV); + if (!GVar) + return Changed; + + if (GVar->isConstant() || !GVar->hasInitializer()) + return Changed; + + return processInternalGlobal(GVar, GS) || Changed; +} + +bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) { + // Find all uses of GV. We expect them all to be in F, and if we can't + // identify any of the uses we bail out. + // + // On each of these uses, identify if the memory that GV points to is + // used/required/live at the start of the function. If it is not, for example + // if the first thing the function does is store to the GV, the GV can + // possibly be demoted. + // + // We don't do an exhaustive search for memory operations - simply look + // through bitcasts as they're quite common and benign. 
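A hypothetical illustration of the property being established, ahead of the dominance check implemented in the next hunk: in f the global's memory is dead on entry (every load is dominated by a store of at least as many bits), so demoting it to an alloca is sound; in g the first access is a load, so it is not.

  static int G1, G2;  // assume each is accessed only by the function shown

  int f(int X) {
    G1 = X;           // this store dominates the only load...
    return G1 + 1;    // ...so G1's value on entry to f is irrelevant
  }

  int g() {
    return G2;        // G2 is live on entry to g: not demotable
  }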
+ const DataLayout &DL = GV->getParent()->getDataLayout(); + SmallVector<LoadInst *, 4> Loads; + SmallVector<StoreInst *, 4> Stores; + for (auto *U : GV->users()) { + if (Operator::getOpcode(U) == Instruction::BitCast) { + for (auto *UU : U->users()) { + if (auto *LI = dyn_cast<LoadInst>(UU)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast<StoreInst>(UU)) + Stores.push_back(SI); + else + return false; + } + continue; + } + + Instruction *I = dyn_cast<Instruction>(U); + if (!I) + return false; + assert(I->getParent()->getParent() == F); + + if (auto *LI = dyn_cast<LoadInst>(I)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast<StoreInst>(I)) + Stores.push_back(SI); + else + return false; + } + + // We have identified all uses of GV into loads and stores. Now check if all + // of them are known not to depend on the value of the global at the function + // entry point. We do this by ensuring that every load is dominated by at + // least one store. + auto &DT = getAnalysis<DominatorTreeWrapperPass>(*const_cast<Function *>(F)) + .getDomTree(); + + // The below check is quadratic. Check we're not going to do too many tests. + // FIXME: Even though this will always have worst-case quadratic time, we + // could put effort into minimizing the average time by putting stores that + // have been shown to dominate at least one load at the beginning of the + // Stores array, making subsequent dominance checks more likely to succeed + // early. + // + // The threshold here is fairly large because global->local demotion is a + // very powerful optimization should it fire. + const unsigned Threshold = 100; + if (Loads.size() * Stores.size() > Threshold) return false; - return ProcessInternalGlobal(GV, GVI, GS); + for (auto *L : Loads) { + auto *LTy = L->getType(); + if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) { + auto *STy = S->getValueOperand()->getType(); + // The load is only dominated by the store if DomTree says so + // and the number of bits loaded in L is less than or equal to + // the number of bits stored in S. + return DT.dominates(S, L) && + DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy); + })) + return false; + } + // All loads have known dependences inside F, so the global can be localized. + return true; +} + +/// C may have non-instruction users. Can all of those users be turned into +/// instructions? +static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) { + // We don't do this exhaustively. The most common pattern that we really need + // to care about is a constant GEP or constant bitcast - so just looking + // through one single ConstantExpr. + // + // The set of constants that this function returns true for must be able to be + // handled by makeAllConstantUsesInstructions. + for (auto *U : C->users()) { + if (isa<Instruction>(U)) + continue; + if (!isa<ConstantExpr>(U)) + // Non instruction, non-constantexpr user; cannot convert this. + return false; + for (auto *UU : U->users()) + if (!isa<Instruction>(UU)) + // A constantexpr used by another constant. We don't try and recurse any + // further but just bail out at this point. + return false; + } + + return true; +} + +/// C may have non-instruction users, and +/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the +/// non-instruction users to instructions. 
+static void makeAllConstantUsesInstructions(Constant *C) { + SmallVector<ConstantExpr*,4> Users; + for (auto *U : C->users()) { + if (isa<ConstantExpr>(U)) + Users.push_back(cast<ConstantExpr>(U)); + else + // We should never get here; allNonInstructionUsersCanBeMadeInstructions + // should not have returned true for C. + assert( + isa<Instruction>(U) && + "Can't transform non-constantexpr non-instruction to instruction!"); + } + + SmallVector<Value*,4> UUsers; + for (auto *U : Users) { + UUsers.clear(); + for (auto *UU : U->users()) + UUsers.push_back(UU); + for (auto *UU : UUsers) { + Instruction *UI = cast<Instruction>(UU); + Instruction *NewU = U->getAsInstruction(); + NewU->insertBefore(UI); + UI->replaceUsesOfWith(U, NewU); + } + U->dropAllReferences(); + } } -/// ProcessInternalGlobal - Analyze the specified global variable and optimize +/// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. -bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, - Module::global_iterator &GVI, +bool GlobalOpt::processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS) { auto &DL = GV->getParent()->getDataLayout(); - // If this is a first class global and has only one accessing function - // and this function is main (which we know is not recursive), we replace - // the global with a local alloca in this function. + // If this is a first class global and has only one accessing function and + // this function is non-recursive, we replace the global with a local alloca + // in this function. // // NOTE: It doesn't make sense to promote non-single-value types since we // are just replacing static memory to stack memory. // // If the global is in different address space, don't bring it to stack. if (!GS.HasMultipleAccessingFunctions && - GS.AccessingFunction && !GS.HasNonInstructionUser && + GS.AccessingFunction && GV->getType()->getElementType()->isSingleValueType() && - GS.AccessingFunction->getName() == "main" && - GS.AccessingFunction->hasExternalLinkage() && - GV->getType()->getAddressSpace() == 0) { - DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV); + GV->getType()->getAddressSpace() == 0 && + !GV->isExternallyInitialized() && + allNonInstructionUsersCanBeMadeInstructions(GV) && + GS.AccessingFunction->doesNotRecurse() && + isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) { + DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n"); Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction ->getEntryBlock().begin()); Type *ElemTy = GV->getType()->getElementType(); @@ -1752,6 +1895,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (!isa<UndefValue>(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); + makeAllConstantUsesInstructions(GV); + GV->replaceAllUsesWith(Alloca); GV->eraseFromParent(); ++NumLocalized; @@ -1761,7 +1906,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // If the global is never loaded (but may be stored to), it is dead. // Delete it now. if (!GS.IsLoaded) { - DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); + DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n"); bool Changed; if (isLeakCheckerRoot(GV)) { @@ -1800,11 +1945,9 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return true; } else if (!GV->getInitializer()->getType()->isSingleValueType()) { const DataLayout &DL = GV->getParent()->getDataLayout(); - if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) { - GVI = FirstNewGV; // Don't skip the newly produced globals! 
+ if (SRAGlobal(GV, DL)) return true; - } - } else if (GS.StoredType == GlobalStatus::StoredOnce) { + } else if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) { // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the @@ -1822,8 +1965,6 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, << "simplify all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; - } else { - GVI = GV; } ++NumSubstitute; return true; @@ -1831,8 +1972,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, GVI, - DL, TLI)) + if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -1850,8 +1990,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return false; } -/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified -/// function, changing them to FastCC. +/// Walk all of the direct calls of the specified function, changing them to +/// FastCC. static void ChangeCalleesToFastCall(Function *F) { for (User *U : F->users()) { if (isa<BlockAddress>(U)) @@ -1898,38 +2038,38 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { bool Changed = false; // Optimize functions. for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { - Function *F = FI++; + Function *F = &*FI++; // Functions without names cannot be referenced outside this module. if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) F->setLinkage(GlobalValue::InternalLinkage); - const Comdat *C = F->getComdat(); - bool inComdat = C && NotDiscardableComdats.count(C); - F->removeDeadConstantUsers(); - if ((!inComdat || F->hasLocalLinkage()) && F->isDefTriviallyDead()) { - F->eraseFromParent(); + if (deleteIfDead(*F)) { Changed = true; - ++NumFnDeleted; - } else if (F->hasLocalLinkage()) { - if (isProfitableToMakeFastCC(F) && !F->isVarArg() && - !F->hasAddressTaken()) { - // If this function has a calling convention worth changing, is not a - // varargs function, and is only called directly, promote it to use the - // Fast calling convention. - F->setCallingConv(CallingConv::Fast); - ChangeCalleesToFastCall(F); - ++NumFastCallFns; - Changed = true; - } + continue; + } - if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && - !F->hasAddressTaken()) { - // The function is not used by a trampoline intrinsic, so it is safe - // to remove the 'nest' attribute. - RemoveNestAttribute(F); - ++NumNestRemoved; - Changed = true; - } + Changed |= processGlobal(*F); + + if (!F->hasLocalLinkage()) + continue; + if (isProfitableToMakeFastCC(F) && !F->isVarArg() && + !F->hasAddressTaken()) { + // If this function has a calling convention worth changing, is not a + // varargs function, and is only called directly, promote it to use the + // Fast calling convention. + F->setCallingConv(CallingConv::Fast); + ChangeCalleesToFastCall(F); + ++NumFastCallFns; + Changed = true; + } + + if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && + !F->hasAddressTaken()) { + // The function is not used by a trampoline intrinsic, so it is safe + // to remove the 'nest' attribute. 
+ RemoveNestAttribute(F); + ++NumNestRemoved; + Changed = true; } } return Changed; @@ -1940,7 +2080,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { - GlobalVariable *GV = GVI++; + GlobalVariable *GV = &*GVI++; // Global variables without names cannot be referenced outside this module. if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) GV->setLinkage(GlobalValue::InternalLinkage); @@ -1953,12 +2093,12 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { GV->setInitializer(New); } - if (GV->isDiscardableIfUnused()) { - if (const Comdat *C = GV->getComdat()) - if (NotDiscardableComdats.count(C) && !GV->hasLocalLinkage()) - continue; - Changed |= ProcessGlobal(GV, GVI); + if (deleteIfDead(*GV)) { + Changed = true; + continue; } + + Changed |= processGlobal(*GV); } return Changed; } @@ -1968,8 +2108,8 @@ isSimpleEnoughValueToCommit(Constant *C, SmallPtrSetImpl<Constant *> &SimpleConstants, const DataLayout &DL); -/// isSimpleEnoughValueToCommit - Return true if the specified constant can be -/// handled by the code generator. We don't want to generate something like: +/// Return true if the specified constant can be handled by the code generator. +/// We don't want to generate something like: /// void *X = &X/42; /// because the code generator doesn't have a relocation that can handle that. /// @@ -2044,11 +2184,11 @@ isSimpleEnoughValueToCommit(Constant *C, } -/// isSimpleEnoughPointerToCommit - Return true if this constant is simple -/// enough for us to understand. In particular, if it is a cast to anything -/// other than from one pointer type to another pointer type, we punt. -/// We basically just support direct accesses to globals and GEP's of -/// globals. This should be kept up to date with CommitValueTo. +/// Return true if this constant is simple enough for us to understand. In +/// particular, if it is a cast to anything other than from one pointer type to +/// another pointer type, we punt. We basically just support direct accesses to +/// globals and GEP's of globals. This should be kept up to date with +/// CommitValueTo. static bool isSimpleEnoughPointerToCommit(Constant *C) { // Conservatively, avoid aggregate types. This is because we don't // want to worry about them partially overlapping other stores. @@ -2095,9 +2235,9 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; } -/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global -/// initializer. This returns 'Init' modified to reflect 'Val' stored into it. -/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into. +/// Evaluate a piece of a constantexpr store into a global initializer. This +/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the +/// GEP operands of Addr [0, OpNo) have been stepped into. static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, ConstantExpr *Addr, unsigned OpNo) { // Base case of the recursion. @@ -2144,7 +2284,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, return ConstantVector::get(Elts); } -/// CommitValueTo - We have decided that Addr (which satisfies the predicate +/// We have decided that Addr (which satisfies the predicate /// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen. 
static void CommitValueTo(Constant *Val, Constant *Addr) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { @@ -2160,10 +2300,10 @@ static void CommitValueTo(Constant *Val, Constant *Addr) { namespace { -/// Evaluator - This class evaluates LLVM IR, producing the Constant -/// representing each SSA instruction. Changes to global variables are stored -/// in a mapping that can be iterated over after the evaluation is complete. -/// Once an evaluation call fails, the evaluation object should not be reused. +/// This class evaluates LLVM IR, producing the Constant representing each SSA +/// instruction. Changes to global variables are stored in a mapping that can +/// be iterated over after the evaluation is complete. Once an evaluation call +/// fails, the evaluation object should not be reused. class Evaluator { public: Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI) @@ -2180,15 +2320,15 @@ public: Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); } - /// EvaluateFunction - Evaluate a call to function F, returning true if - /// successful, false if we can't evaluate it. ActualArgs contains the formal - /// arguments for the function. + /// Evaluate a call to function F, returning true if successful, false if we + /// can't evaluate it. ActualArgs contains the formal arguments for the + /// function. bool EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl<Constant*> &ActualArgs); - /// EvaluateBlock - Evaluate all instructions in block BB, returning true if - /// successful, false if we can't evaluate it. NewBB returns the next BB that - /// control flows into, or null upon return. + /// Evaluate all instructions in block BB, returning true if successful, false + /// if we can't evaluate it. NewBB returns the next BB that control flows + /// into, or null upon return. bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB); Constant *getVal(Value *V) { @@ -2213,32 +2353,31 @@ public: private: Constant *ComputeLoadResult(Constant *P); - /// ValueStack - As we compute SSA register values, we store their contents - /// here. The back of the deque contains the current function and the stack - /// contains the values in the calling frames. + /// As we compute SSA register values, we store their contents here. The back + /// of the deque contains the current function and the stack contains the + /// values in the calling frames. std::deque<DenseMap<Value*, Constant*>> ValueStack; - /// CallStack - This is used to detect recursion. In pathological situations - /// we could hit exponential behavior, but at least there is nothing - /// unbounded. + /// This is used to detect recursion. In pathological situations we could hit + /// exponential behavior, but at least there is nothing unbounded. SmallVector<Function*, 4> CallStack; - /// MutatedMemory - For each store we execute, we update this map. Loads - /// check this to get the most up-to-date value. If evaluation is successful, - /// this state is committed to the process. + /// For each store we execute, we update this map. Loads check this to get + /// the most up-to-date value. If evaluation is successful, this state is + /// committed to the process. DenseMap<Constant*, Constant*> MutatedMemory; - /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable - /// to represent its body. This vector is needed so we can delete the - /// temporary globals when we are done. 
+ /// To 'execute' an alloca, we create a temporary global variable to represent + /// its body. This vector is needed so we can delete the temporary globals + /// when we are done. SmallVector<std::unique_ptr<GlobalVariable>, 32> AllocaTmps; - /// Invariants - These global variables have been marked invariant by the - /// static constructor. + /// These global variables have been marked invariant by the static + /// constructor. SmallPtrSet<GlobalVariable*, 8> Invariants; - /// SimpleConstants - These are constants we have checked and know to be - /// simple enough to live in a static initializer of a global. + /// These are constants we have checked and know to be simple enough to live + /// in a static initializer of a global. SmallPtrSet<Constant*, 8> SimpleConstants; const DataLayout &DL; @@ -2247,9 +2386,8 @@ private: } // anonymous namespace -/// ComputeLoadResult - Return the value that would be computed by a load from -/// P after the stores reflected by 'memory' have been performed. If we can't -/// decide, return null. +/// Return the value that would be computed by a load from P after the stores +/// reflected by 'memory' have been performed. If we can't decide, return null. Constant *Evaluator::ComputeLoadResult(Constant *P) { // If this memory location has been recently stored, use the stored value: it // is the most up-to-date. @@ -2275,9 +2413,9 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) { return nullptr; // don't know how to evaluate. } -/// EvaluateBlock - Evaluate all instructions in block BB, returning true if -/// successful, false if we can't evaluate it. NewBB returns the next BB that -/// control flows into, or null upon return. +/// Evaluate all instructions in block BB, returning true if successful, false +/// if we can't evaluate it. NewBB returns the next BB that control flows into, +/// or null upon return. bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB) { // This is the main evaluation loop. @@ -2438,7 +2576,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, InstResult = AllocaTmps.back().get(); DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { - CallSite CS(CurInst); + CallSite CS(&*CurInst); // Debug info can safely be ignored here. if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { @@ -2504,6 +2642,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // Continue even if we do nothing. ++CurInst; continue; + } else if (II->getIntrinsicID() == Intrinsic::assume) { + DEBUG(dbgs() << "Skipping assume intrinsic.\n"); + ++CurInst; + continue; } DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n"); @@ -2600,7 +2742,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult)) InstResult = ConstantFoldConstantExpression(CE, DL, TLI); - setVal(CurInst, InstResult); + setVal(&*CurInst, InstResult); } // If we just processed an invoke, we finished evaluating the block. @@ -2615,9 +2757,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, } } -/// EvaluateFunction - Evaluate a call to function F, returning true if -/// successful, false if we can't evaluate it. ActualArgs contains the formal -/// arguments for the function. +/// Evaluate a call to function F, returning true if successful, false if we +/// can't evaluate it. ActualArgs contains the formal arguments for the +/// function. 
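For a feel of what this evaluator can fold, a hypothetical static initializer it could evaluate completely at compile time (assuming straight-line code, no unknown calls, and stores only to known globals; loops or recursion would make it bail), shown before the EvaluateFunction body below:

  static int A;
  static int B;

  static bool init() {  // becomes an entry in llvm.global_ctors
    A = 2;
    B = A * 21;         // the evaluator tracks A's mutated value: B = 42
    return true;
  }
  static bool Inited = init();  // after evaluation, A and B get constant
                                // initializers and the ctor call is dropped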
bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl<Constant*> &ActualArgs) { // Check to see if this function is already executing (recursion). If so, @@ -2631,7 +2773,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, unsigned ArgNo = 0; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI, ++ArgNo) - setVal(AI, ActualArgs[ArgNo]); + setVal(&*AI, ActualArgs[ArgNo]); // ExecutedBlocks - We only handle non-looping, non-recursive code. As such, // we can only evaluate any one basic block at most once. This set keeps @@ -2639,7 +2781,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, SmallPtrSet<BasicBlock*, 32> ExecutedBlocks; // CurBB - The current basic block we're evaluating. - BasicBlock *CurBB = F->begin(); + BasicBlock *CurBB = &F->front(); BasicBlock::iterator CurInst = CurBB->begin(); @@ -2679,8 +2821,8 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, } } -/// EvaluateStaticConstructor - Evaluate static constructors in the function, if -/// we can. Return true if we can, false otherwise. +/// Evaluate static constructors in the function, if we can. Return true if we +/// can, false otherwise. static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, const TargetLibraryInfo *TLI) { // Call the function. @@ -2708,7 +2850,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, } static int compareNames(Constant *const *A, Constant *const *B) { - return (*A)->getName().compare((*B)->getName()); + return (*A)->stripPointerCasts()->getName().compare( + (*B)->stripPointerCasts()->getName()); } static void setUsedInitializer(GlobalVariable &V, @@ -2742,7 +2885,7 @@ static void setUsedInitializer(GlobalVariable &V, } namespace { -/// \brief An easy to access representation of llvm.used and llvm.compiler.used. +/// An easy to access representation of llvm.used and llvm.compiler.used. class LLVMUsed { SmallPtrSet<GlobalValue *, 8> Used; SmallPtrSet<GlobalValue *, 8> CompilerUsed; @@ -2861,10 +3004,17 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;) { - Module::alias_iterator J = I++; + GlobalAlias *J = &*I++; + // Aliases without names cannot be referenced outside this module. if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) J->setLinkage(GlobalValue::InternalLinkage); + + if (deleteIfDead(*J)) { + Changed = true; + continue; + } + // If the aliasee may change at link time, nothing can be done - bail out. if (J->mayBeOverridden()) continue; @@ -2889,15 +3039,15 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. - Target->takeName(J); + Target->takeName(&*J); Target->setLinkage(J->getLinkage()); Target->setVisibility(J->getVisibility()); Target->setDLLStorageClass(J->getDLLStorageClass()); - if (Used.usedErase(J)) + if (Used.usedErase(&*J)) Used.usedInsert(Target); - if (Used.compilerUsedErase(J)) + if (Used.compilerUsedErase(&*J)) Used.compilerUsedInsert(Target); } else if (mayHaveOtherReferences(*J, Used)) continue; @@ -2936,8 +3086,8 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { return Fn; } -/// cxxDtorIsEmpty - Returns whether the given function is an empty C++ -/// destructor and can therefore be eliminated. +/// Returns whether the given function is an empty C++ destructor and can +/// therefore be eliminated. 
/// Note that we assume that other optimization passes have already simplified /// the code so we only look for a function with a single basic block, where /// the only allowed instructions are 'ret', 'call' to an empty C++ dtor and @@ -3081,3 +3231,4 @@ bool GlobalOpt::runOnModule(Module &M) { return Changed; } + diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index 50f56b0f2afe..7ea6c08b2e66 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements the common infrastructure (including C bindings) for -// libLLVMIPO.a, which implements several transformations over the LLVM +// This file implements the common infrastructure (including C bindings) for +// libLLVMIPO.a, which implements several transformations over the LLVM // intermediate representation. // //===----------------------------------------------------------------------===// @@ -24,14 +24,17 @@ using namespace llvm; void llvm::initializeIPO(PassRegistry &Registry) { initializeArgPromotionPass(Registry); initializeConstantMergePass(Registry); + initializeCrossDSOCFIPass(Registry); initializeDAEPass(Registry); initializeDAHPass(Registry); + initializeForceFunctionAttrsLegacyPassPass(Registry); initializeFunctionAttrsPass(Registry); initializeGlobalDCEPass(Registry); initializeGlobalOptPass(Registry); initializeIPCPPass(Registry); initializeAlwaysInlinerPass(Registry); initializeSimpleInlinerPass(Registry); + initializeInferFunctionAttrsLegacyPassPass(Registry); initializeInternalizePassPass(Registry); initializeLoopExtractorPass(Registry); initializeBlockExtractorPassPass(Registry); @@ -40,13 +43,15 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeMergeFunctionsPass(Registry); initializePartialInlinerPass(Registry); initializePruneEHPass(Registry); - initializeStripDeadPrototypesPassPass(Registry); + initializeStripDeadPrototypesLegacyPassPass(Registry); initializeStripSymbolsPass(Registry); initializeStripDebugDeclarePass(Registry); initializeStripDeadDebugInfoPass(Registry); initializeStripNonDebugSymbolsPass(Registry); initializeBarrierNoopPass(Registry); initializeEliminateAvailableExternallyPass(Registry); + initializeSampleProfileLoaderPass(Registry); + initializeFunctionImportPassPass(Registry); } void LLVMInitializeIPO(LLVMPassRegistryRef R) { diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp new file mode 100644 index 000000000000..d02c861a2948 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -0,0 +1,937 @@ +//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "inferattrs" + +STATISTIC(NumReadNone, "Number of functions inferred as readnone"); +STATISTIC(NumReadOnly, "Number of functions inferred as readonly"); +STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind"); +STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture"); +STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly"); +STATISTIC(NumNoAlias, "Number of function returns inferred as noalias"); + +static bool setDoesNotAccessMemory(Function &F) { + if (F.doesNotAccessMemory()) + return false; + F.setDoesNotAccessMemory(); + ++NumReadNone; + return true; +} + +static bool setOnlyReadsMemory(Function &F) { + if (F.onlyReadsMemory()) + return false; + F.setOnlyReadsMemory(); + ++NumReadOnly; + return true; +} + +static bool setDoesNotThrow(Function &F) { + if (F.doesNotThrow()) + return false; + F.setDoesNotThrow(); + ++NumNoUnwind; + return true; +} + +static bool setDoesNotCapture(Function &F, unsigned n) { + if (F.doesNotCapture(n)) + return false; + F.setDoesNotCapture(n); + ++NumNoCapture; + return true; +} + +static bool setOnlyReadsMemory(Function &F, unsigned n) { + if (F.onlyReadsMemory(n)) + return false; + F.setOnlyReadsMemory(n); + ++NumReadOnlyArg; + return true; +} + +static bool setDoesNotAlias(Function &F, unsigned n) { + if (F.doesNotAlias(n)) + return false; + F.setDoesNotAlias(n); + ++NumNoAlias; + return true; +} + +/// Analyze the name and prototype of the given function and set any applicable +/// attributes. +/// +/// Returns true if any attributes were set and false otherwise. 
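// [Editor's note — not part of the patch] The unsigned index taken by the
// argument-oriented helpers above follows the legacy attribute-list convention
// of this LLVM snapshot: index 0 names the return value, and parameters are
// numbered from 1. For example, the strdup case below combines
// setDoesNotThrow(F), setDoesNotAlias(F, 0), setDoesNotCapture(F, 1) and
// setOnlyReadsMemory(F, 1), which corresponds to a declaration roughly like
// this (illustrative textual IR, typed-pointer era):
//
//   declare noalias i8* @strdup(i8* nocapture readonly) nounwind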
+static bool inferPrototypeAttributes(Function &F, + const TargetLibraryInfo &TLI) { + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + + FunctionType *FTy = F.getFunctionType(); + LibFunc::Func TheLibFunc; + if (!(TLI.getLibFunc(F.getName(), TheLibFunc) && TLI.has(TheLibFunc))) + return false; + + bool Changed = false; + + switch (TheLibFunc) { + case LibFunc::strlen: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::strchr: + case LibFunc::strrchr: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isIntegerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::strtol: + case LibFunc::strtod: + case LibFunc::strtof: + case LibFunc::strtoul: + case LibFunc::strtoll: + case LibFunc::strtold: + case LibFunc::strtoull: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::strcpy: + case LibFunc::stpcpy: + case LibFunc::strcat: + case LibFunc::strncat: + case LibFunc::strncpy: + case LibFunc::stpncpy: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::strxfrm: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::strcmp: // 0,1 + case LibFunc::strspn: // 0,1 + case LibFunc::strncmp: // 0,1 + case LibFunc::strcspn: // 0,1 + case LibFunc::strcoll: // 0,1 + case LibFunc::strcasecmp: // 0,1 + case LibFunc::strncasecmp: // + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::strstr: + case LibFunc::strpbrk: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::strtok: + case LibFunc::strtok_r: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::scanf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::setbuf: + case LibFunc::setvbuf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::strdup: + case LibFunc::strndup: + if (FTy->getNumParams() < 1 || 
!FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::stat: + case LibFunc::statvfs: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::sscanf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::sprintf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::snprintf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 3); + return Changed; + case LibFunc::setitimer: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::system: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "system" is a valid pthread cancellation point. 
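// [Editor's illustration — not part of the patch] Why cancellation points such
// as system, read, write, open, pread and pwrite are deliberately left without
// nounwind: on common implementations (e.g. glibc), pthread_cancel() unwinds
// the cancelled thread through the blocking call, so cleanups must survive.
// A hypothetical caller showing what nounwind would wrongly license:
//
//   struct Guard { ~Guard(); };   // hypothetical RAII cleanup
//   void worker(const char *Cmd) {
//     Guard G;
//     system(Cmd);  // thread may be cancelled inside this call; if it were
//   }               // nounwind, ~Guard() could be optimized away here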
+ Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::malloc: + if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::memcmp: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::memchr: + case LibFunc::memrchr: + if (FTy->getNumParams() != 3) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::modf: + case LibFunc::modff: + case LibFunc::modfl: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::memcpy: + case LibFunc::memccpy: + case LibFunc::memmove: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::memalign: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::mkdir: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::mktime: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::realloc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::read: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "read" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::rewind: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::rmdir: + case LibFunc::remove: + case LibFunc::realpath: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::rename: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::readlink: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::write: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "write" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::bcopy: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::bcmp: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::bzero: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::calloc: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::chmod: + case LibFunc::chown: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::ctermid: + case LibFunc::clearerr: + case LibFunc::closedir: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::atoi: + case LibFunc::atol: + case LibFunc::atof: + case LibFunc::atoll: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::access: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= 
setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::fopen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fdopen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::feof: + case LibFunc::free: + case LibFunc::fseek: + case LibFunc::ftell: + case LibFunc::fgetc: + case LibFunc::fseeko: + case LibFunc::ftello: + case LibFunc::fileno: + case LibFunc::fflush: + case LibFunc::fclose: + case LibFunc::fsetpos: + case LibFunc::flockfile: + case LibFunc::funlockfile: + case LibFunc::ftrylockfile: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::ferror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F); + return Changed; + case LibFunc::fputc: + case LibFunc::fstat: + case LibFunc::frexp: + case LibFunc::frexpf: + case LibFunc::frexpl: + case LibFunc::fstatvfs: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::fgets: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 3); + return Changed; + case LibFunc::fread: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 4); + return Changed; + case LibFunc::fwrite: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 4); + return Changed; + case LibFunc::fputs: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::fscanf: + case LibFunc::fprintf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fgetpos: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= 
setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::getc: + case LibFunc::getlogin_r: + case LibFunc::getc_unlocked: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::getenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::gets: + case LibFunc::getchar: + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::getitimer: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::getpwnam: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::ungetc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::uname: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::unlink: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::unsetenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::utime: + case LibFunc::utimes: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::putc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::puts: + case LibFunc::printf: + case LibFunc::perror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::pread: + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "pread" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::pwrite: + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "pwrite" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::putchar: + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::popen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::pclose: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::vscanf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::vsscanf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::vfscanf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::valloc: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::vprintf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::vfprintf: + case LibFunc::vsprintf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::vsnprintf: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 3); + return Changed; + case LibFunc::open: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::opendir: + if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::tmpfile: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::times: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::htonl: + case LibFunc::htons: + case LibFunc::ntohl: + case LibFunc::ntohs: + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAccessMemory(F); + return Changed; + case LibFunc::lstat: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::lchown: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::qsort: + if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) + return false; + // May throw; places call through function pointer. + Changed |= setDoesNotCapture(F, 4); + return Changed; + case LibFunc::dunder_strdup: + case LibFunc::dunder_strndup: + if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::dunder_strtok_r: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::under_IO_getc: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::under_IO_putc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::dunder_isoc99_scanf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::stat64: + case LibFunc::lstat64: + case LibFunc::statvfs64: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::dunder_isoc99_sscanf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return 
false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fopen64: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fseeko64: + case LibFunc::ftello64: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::tmpfile64: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::fstat64: + case LibFunc::fstatvfs64: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::open64: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::gettimeofday: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + // Currently some platforms have the restrict keyword on the arguments to + // gettimeofday. To be conservative, do not add noalias to gettimeofday's + // arguments. + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + + default: + // FIXME: It'd be really nice to cover all the library functions we're + // aware of here. + return false; + } +} + +static bool inferAllPrototypeAttributes(Module &M, + const TargetLibraryInfo &TLI) { + bool Changed = false; + + for (Function &F : M.functions()) + // We only infer things using the prototype if the definition isn't around + // to analyze directly. + if (F.isDeclaration()) + Changed |= inferPrototypeAttributes(F, TLI); + + return Changed; +} + +PreservedAnalyses InferFunctionAttrsPass::run(Module &M, + AnalysisManager<Module> *AM) { + auto &TLI = AM->getResult<TargetLibraryAnalysis>(M); + + if (!inferAllPrototypeAttributes(M, TLI)) + // If we didn't infer anything, preserve all analyses. + return PreservedAnalyses::all(); + + // Otherwise, we may have changed fundamental function attributes, so clear + // out all the passes. 
+ return PreservedAnalyses::none(); +} + +namespace { +struct InferFunctionAttrsLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + InferFunctionAttrsLegacyPass() : ModulePass(ID) { + initializeInferFunctionAttrsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + + bool runOnModule(Module &M) override { + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + return inferAllPrototypeAttributes(M, TLI); + } +}; +} + +char InferFunctionAttrsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs", + "Infer set function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs", + "Infer set function attributes", false, false) + +Pass *llvm::createInferFunctionAttrsLegacyPass() { + return new InferFunctionAttrsLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp index dc56a02e7b7d..1704bfea0b86 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp @@ -14,10 +14,10 @@ #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -35,17 +35,15 @@ namespace { /// \brief Inliner pass which only handles "always inline" functions. class AlwaysInliner : public Inliner { - InlineCostAnalysis *ICA; public: // Use extremely low threshold. - AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), - ICA(nullptr) { + AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } AlwaysInliner(bool InsertLifetime) - : Inliner(ID, -2000000000, InsertLifetime), ICA(nullptr) { + : Inliner(ID, -2000000000, InsertLifetime) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } @@ -53,9 +51,6 @@ public: InlineCost getInlineCost(CallSite CS) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnSCC(CallGraphSCC &SCC) override; - using llvm::Pass::doFinalization; bool doFinalization(CallGraph &CG) override { return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true); @@ -67,10 +62,9 @@ public: char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) @@ -99,19 +93,8 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) { // that are viable for inlining. FIXME: We shouldn't even get here for // declarations. 
if (Callee && !Callee->isDeclaration() && - CS.hasFnAttr(Attribute::AlwaysInline) && - ICA->isInlineViable(*Callee)) + CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee)) return InlineCost::getAlways(); return InlineCost::getNever(); } - -bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) { - ICA = &getAnalysis<InlineCostAnalysis>(); - return Inliner::runOnSCC(SCC); -} - -void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<InlineCostAnalysis>(); - Inliner::getAnalysisUsage(AU); -} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp index 9b01d81b3c7c..45609f891ed8 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/InlinerPass.h" using namespace llvm; @@ -37,26 +38,30 @@ namespace { /// inliner pass and the always inliner pass. The two passes use different cost /// analyses to determine when to inline. class SimpleInliner : public Inliner { - InlineCostAnalysis *ICA; public: - SimpleInliner() : Inliner(ID), ICA(nullptr) { + SimpleInliner() : Inliner(ID) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } SimpleInliner(int Threshold) - : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) { + : Inliner(ID, Threshold, /*InsertLifetime*/ true) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) override { - return ICA->getInlineCost(CS, getInlineThreshold(CS)); + Function *Callee = CS.getCalledFunction(); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + return llvm::getInlineCost(CS, getInlineThreshold(CS), TTI, ACT); } bool runOnSCC(CallGraphSCC &SCC) override; void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + TargetTransformInfoWrapperPass *TTIWP; }; static int computeThresholdFromOptLevels(unsigned OptLevel, @@ -75,10 +80,10 @@ static int computeThresholdFromOptLevels(unsigned OptLevel, char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining", false, false) @@ -95,11 +100,11 @@ Pass *llvm::createFunctionInliningPass(unsigned OptLevel, } bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { - ICA = &getAnalysis<InlineCostAnalysis>(); + TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); return Inliner::runOnSCC(SCC); } void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) 
const { - AU.addRequired<InlineCostAnalysis>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); Inliner::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index 5273c3dc3ca2..bbe5f8761d5f 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -64,20 +65,22 @@ ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(225), // Threshold to use when optsize is specified (and there is no -inline-limit). const int OptSizeThreshold = 75; -Inliner::Inliner(char &ID) - : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) {} +Inliner::Inliner(char &ID) + : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) { +} Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime) - : CallGraphSCCPass(ID), InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? - InlineLimit : Threshold), - InsertLifetime(InsertLifetime) {} + : CallGraphSCCPass(ID), + InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? InlineLimit + : Threshold), + InsertLifetime(InsertLifetime) {} /// For this class, we declare that we require and preserve the call graph. /// If the derived class implements this method, it should /// always explicitly call the implementation here. void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -85,39 +88,6 @@ void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { typedef DenseMap<ArrayType*, std::vector<AllocaInst*> > InlinedArrayAllocasTy; -/// \brief If the inlined function had a higher stack protection level than the -/// calling function, then bump up the caller's stack protection level. -static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { - // If upgrading the SSP attribute, clear out the old SSP Attributes first. - // Having multiple SSP attributes doesn't actually hurt, but it adds useless - // clutter to the IR. 
- AttrBuilder B; - B.addAttribute(Attribute::StackProtect) - .addAttribute(Attribute::StackProtectStrong) - .addAttribute(Attribute::StackProtectReq); - AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(), - AttributeSet::FunctionIndex, - B); - - if (Callee->hasFnAttribute(Attribute::SafeStack)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::SafeStack); - } else if (Callee->hasFnAttribute(Attribute::StackProtectReq) && - !Caller->hasFnAttribute(Attribute::SafeStack)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::StackProtectReq); - } else if (Callee->hasFnAttribute(Attribute::StackProtectStrong) && - !Caller->hasFnAttribute(Attribute::SafeStack) && - !Caller->hasFnAttribute(Attribute::StackProtectReq)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::StackProtectStrong); - } else if (Callee->hasFnAttribute(Attribute::StackProtect) && - !Caller->hasFnAttribute(Attribute::SafeStack) && - !Caller->hasFnAttribute(Attribute::StackProtectReq) && - !Caller->hasFnAttribute(Attribute::StackProtectStrong)) - Caller->addFnAttr(Attribute::StackProtect); -} - /// If it is possible to inline the specified call site, /// do so and update the CallGraph for this operation. /// @@ -126,18 +96,26 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { /// available from other functions inlined into the caller. If we are able to /// inline this call site we attempt to reuse already available allocas or add /// any new allocas to the set if not possible. -static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, +static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI, InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory, bool InsertLifetime) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); + // We need to manually construct BasicAA directly in order to disable + // its use of other function analyses. + BasicAAResult BAR(createLegacyPMBasicAAResult(P, *Callee)); + + // Construct our own AA results for this function. We do this manually to + // work around the limitations of the legacy pass manager. + AAResults AAR(createLegacyPMAAResults(P, *Callee, BAR)); + // Try to inline the function. Get the list of static allocas that were // inlined. - if (!InlineFunction(CS, IFI, InsertLifetime)) return false; - AdjustCallerSSPLevel(Caller, Callee); + AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee); // Look at all of the allocas that we inlined through this call site. If we // have already inlined other allocas through other calls into this function, @@ -219,6 +197,14 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI << "\n\t\tINTO: " << *AvailableAlloca << '\n'); + // Move affected dbg.declare calls immediately after the new alloca to + // avoid the situation when a dbg.declare precedes its alloca.
+ if (auto *L = LocalAsMetadata::getIfExists(AI)) + if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L)) + for (User *U : MDV->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) + DDI->moveBefore(AvailableAlloca->getNextNode()); + AI->replaceAllUsesWith(AvailableAlloca); if (Align1 != Align2) { @@ -258,39 +244,64 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, } unsigned Inliner::getInlineThreshold(CallSite CS) const { - int thres = InlineThreshold; // -inline-threshold or else selected by - // overall opt level + int Threshold = InlineThreshold; // -inline-threshold or else selected by + // overall opt level // If -inline-threshold is not given, listen to the optsize attribute when it // would decrease the threshold. Function *Caller = CS.getCaller(); bool OptSize = Caller && !Caller->isDeclaration() && + // FIXME: Use Function::optForSize(). Caller->hasFnAttribute(Attribute::OptimizeForSize); if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && - OptSizeThreshold < thres) - thres = OptSizeThreshold; + OptSizeThreshold < Threshold) + Threshold = OptSizeThreshold; - // Listen to the inlinehint attribute when it would increase the threshold - // and the caller does not need to minimize its size. Function *Callee = CS.getCalledFunction(); - bool InlineHint = Callee && !Callee->isDeclaration() && - Callee->hasFnAttribute(Attribute::InlineHint); - if (InlineHint && HintThreshold > thres && - !Caller->hasFnAttribute(Attribute::MinSize)) - thres = HintThreshold; + if (!Callee || Callee->isDeclaration()) + return Threshold; + + // If profile information is available, use that to adjust threshold of hot + // and cold functions. + // FIXME: The heuristic used below for determining hotness and coldness are + // based on preliminary SPEC tuning and may not be optimal. Replace this with + // a well-tuned heuristic based on *callsite* hotness and not callee hotness. + uint64_t FunctionCount = 0, MaxFunctionCount = 0; + bool HasPGOCounts = false; + if (Callee->getEntryCount() && + Callee->getParent()->getMaximumFunctionCount()) { + HasPGOCounts = true; + FunctionCount = Callee->getEntryCount().getValue(); + MaxFunctionCount = + Callee->getParent()->getMaximumFunctionCount().getValue(); + } - // Listen to the cold attribute when it would decrease the threshold. - bool ColdCallee = Callee && !Callee->isDeclaration() && - Callee->hasFnAttribute(Attribute::Cold); + // Listen to the inlinehint attribute or profile based hotness information + // when it would increase the threshold and the caller does not need to + // minimize its size. + bool InlineHint = + Callee->hasFnAttribute(Attribute::InlineHint) || + (HasPGOCounts && + FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount)); + if (InlineHint && HintThreshold > Threshold && + !Caller->hasFnAttribute(Attribute::MinSize)) + Threshold = HintThreshold; + + // Listen to the cold attribute or profile based coldness information + // when it would decrease the threshold. + bool ColdCallee = + Callee->hasFnAttribute(Attribute::Cold) || + (HasPGOCounts && + FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount)); // Command line argument for InlineLimit will override the default // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, // do not use the default cold threshold even if it is smaller. 
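// [Editor's worked example — not part of the patch] Plugging numbers into the
// PGO hotness cutoffs above, assume the module's MaxFunctionCount is 100000:
//
//   hot:  FunctionCount >= 0.3  * 100000 = 30000 -> threshold may rise to
//         HintThreshold (treated like an inlinehint callee)
//   cold: FunctionCount <= 0.01 * 100000 = 1000  -> threshold may drop to
//         ColdThreshold (treated like a cold callee)
//
// So a callee with entry count 50000 is considered hot and one with 500 cold;
// anything in between keeps the base threshold, and an explicit
// -inline-threshold on the command line overrides the cold adjustment.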
if ((InlineLimit.getNumOccurrences() == 0 || ColdThreshold.getNumOccurrences() > 0) && ColdCallee && - ColdThreshold < thres) - thres = ColdThreshold; + ColdThreshold < Threshold) + Threshold = ColdThreshold; - return thres; + return Threshold; } static void emitAnalysis(CallSite CS, const Twine &Msg) { @@ -430,10 +441,8 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); - AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); + ACT = &getAnalysis<AssumptionCacheTracker>(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SmallPtrSet<Function*, 8> SCCFunctions; DEBUG(dbgs() << "Inliner visiting SCC:"); @@ -469,8 +478,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // If this is a direct call to an external function, we can never inline // it. If it is an indirect call, inlining may resolve it to be a // direct call, so we keep it. - if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration()) - continue; + if (Function *Callee = CS.getCalledFunction()) + if (Callee->isDeclaration()) + continue; CallSites.push_back(std::make_pair(CS, -1)); } @@ -492,7 +502,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, AA, ACT); + InlineFunctionInfo InlineInfo(&CG, ACT); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -513,7 +523,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // just delete the call instead of trying to inline it, regardless of // size. This happens because IPSCCP propagates the result out of the // call and then we're left with the dead call. - if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) { + if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) { DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction() << "\n"); // Update the call graph by deleting the edge from Callee to Caller. @@ -550,7 +560,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { } // Attempt to inline the function. - if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, + if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas, InlineHistoryID, InsertLifetime)) { emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, Twine(Callee->getName() + @@ -647,8 +657,8 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // Scan for all of the functions, looking for ones that should now be removed // from the program. Insert the dead ones in the FunctionsToRemove set. 
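// [Editor's note — not part of the patch] The loop below changes shape because
// the call graph in this snapshot owns its nodes via std::unique_ptr: copying
// the map entry by value would be ill-formed, so the iteration takes a const
// reference and borrows the node pointer:
//
//   for (const auto &I : CG) {              // I.second: std::unique_ptr<CallGraphNode>
//     CallGraphNode *CGN = I.second.get();  // non-owning view; CG keeps ownership
//     ...
//   }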
- for (auto I : CG) { - CallGraphNode *CGN = I.second; + for (const auto &I : CG) { + CallGraphNode *CGN = I.second.get(); Function *F = CGN->getFunction(); if (!F || F->isDeclaration()) continue; diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp index 7950163f757d..21bb5d000bc7 100644 --- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp @@ -60,6 +60,10 @@ namespace { explicit InternalizePass(); explicit InternalizePass(ArrayRef<const char *> ExportList); void LoadFile(const char *Filename); + bool maybeInternalize(GlobalValue &GV, + const std::set<const Comdat *> &ExternalComdats); + void checkComdatVisibility(GlobalValue &GV, + std::set<const Comdat *> &ExternalComdats); bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -105,40 +109,85 @@ void InternalizePass::LoadFile(const char *Filename) { } } -static bool shouldInternalize(const GlobalValue &GV, - const std::set<std::string> &ExternalNames) { +static bool isExternallyVisible(const GlobalValue &GV, + const std::set<std::string> &ExternalNames) { // Function must be defined here if (GV.isDeclaration()) - return false; + return true; // Available externally is really just a "declaration with a body". if (GV.hasAvailableExternallyLinkage()) - return false; + return true; // Assume that dllexported symbols are referenced elsewhere if (GV.hasDLLExportStorageClass()) - return false; - - // Already has internal linkage - if (GV.hasLocalLinkage()) - return false; + return true; // Marked to keep external? - if (ExternalNames.count(GV.getName())) - return false; + if (!GV.hasLocalLinkage() && ExternalNames.count(GV.getName())) + return true; + + return false; +} +// Internalize GV if it is possible to do so, i.e. it is not externally visible +// and is not a member of an externally visible comdat. bool InternalizePass::maybeInternalize( GlobalValue &GV, const std::set<const Comdat *> &ExternalComdats) { if (Comdat *C = GV.getComdat()) { if (ExternalComdats.count(C)) return false; + + // If a comdat is not externally visible we can drop it. + if (auto GO = dyn_cast<GlobalObject>(&GV)) + GO->setComdat(nullptr); + + if (GV.hasLocalLinkage()) + return false; + } else { + if (GV.hasLocalLinkage()) + return false; + + if (isExternallyVisible(GV, ExternalNames)) + return false; + } + + GV.setVisibility(GlobalValue::DefaultVisibility); + GV.setLinkage(GlobalValue::InternalLinkage); return true; } +// If GV is part of a comdat and is externally visible, keep track of its +// comdat so that we don't internalize any of its members. +void InternalizePass::checkComdatVisibility( + GlobalValue &GV, std::set<const Comdat *> &ExternalComdats) { + Comdat *C = GV.getComdat(); + if (!C) + return; + + if (isExternallyVisible(GV, ExternalNames)) + ExternalComdats.insert(C); +} + bool InternalizePass::runOnModule(Module &M) { CallGraphWrapperPass *CGPass = getAnalysisIfAvailable<CallGraphWrapperPass>(); CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr; CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr; - bool Changed = false; SmallPtrSet<GlobalValue *, 8> Used; collectUsedGlobalVariables(M, Used, false); + // Collect comdat visibility information for the module.
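// [Editor's sketch — not part of the patch] The rewritten pass works in two
// phases: phase 1 (checkComdatVisibility) records every comdat that still has
// an externally visible member; phase 2 (maybeInternalize) then demotes a
// value only if its comdat, if any, was never recorded. Roughly:
//
//   std::set<const Comdat *> Ext;
//   for (GlobalValue &GV : allGlobalValues(M))  // hypothetical combined range
//     checkComdatVisibility(GV, Ext);           // phase 1: mark escaping comdats
//   for (GlobalValue &GV : allGlobalValues(M))
//     maybeInternalize(GV, Ext);                // phase 2: internalize the rest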
+ std::set<const Comdat *> ExternalComdats; + if (!M.getComdatSymbolTable().empty()) { + for (Function &F : M) + checkComdatVisibility(F, ExternalComdats); + for (GlobalVariable &GV : M.globals()) + checkComdatVisibility(GV, ExternalComdats); + for (GlobalAlias &GA : M.aliases()) + checkComdatVisibility(GA, ExternalComdats); + } + // We must assume that globals in llvm.used have a reference that not even // the linker can see, so we don't internalize them. // For llvm.compiler.used the situation is a bit fuzzy. The assembler and @@ -153,20 +202,16 @@ bool InternalizePass::runOnModule(Module &M) { } // Mark all functions not in the api as internal. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + for (Function &I : M) { + if (!maybeInternalize(I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - if (ExternalNode) // Remove a callgraph edge from the external node to this function. - ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); + ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); - Changed = true; ++NumFunctions; - DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); + DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n"); } // Never internalize the llvm.used symbol. It is used to implement @@ -191,12 +236,9 @@ bool InternalizePass::runOnModule(Module &M) { // internal as well. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + if (!maybeInternalize(*I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; ++NumGlobals; DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n"); } @@ -204,17 +246,20 @@ bool InternalizePass::runOnModule(Module &M) { // Mark all aliases that are not in the api as internal as well. for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + if (!maybeInternalize(*I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; ++NumAliases; DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n"); } - return Changed; + // We do not keep track of whether this pass changed the module because + // it adds unnecessary complexity: + // 1) This pass will generally be near the start of the pass pipeline, so + // there will be no analyses to invalidate. + // 2) This pass will most likely end up changing the module and it isn't worth + // worrying about optimizing the case where the module is unchanged. 
+ return true; } ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 41334ca5b429..8e4ad642ddd5 100644 --- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -43,12 +43,13 @@ namespace { initializeLoopExtractorPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(BreakCriticalEdgesID); AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); } }; } @@ -79,7 +80,7 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single", // Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); } -bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; @@ -92,6 +93,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); bool Changed = false; // If there is more than one top-level loop in this function, extract all of @@ -120,14 +122,14 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { } if (ShouldExtractLoop) { - // We must omit landing pads. Landing pads must accompany the invoke + // We must omit EH pads. EH pads must accompany the invoke // instruction. But this would result in a loop in the extracted // function. An infinite cycle occurs when it tries to extract that loop as // well. SmallVector<BasicBlock*, 8> ExitBlocks; L->getExitBlocks(ExitBlocks); for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (ExitBlocks[i]->isLandingPad()) { + if (ExitBlocks[i]->isEHPad()) { ShouldExtractLoop = false; break; } @@ -141,7 +143,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. - LPM.deleteLoopFromQueue(L); + LI.updateUnloop(L); } ++NumExtracted; } @@ -259,7 +261,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { // Figure out which index the basic block is in its function. 
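The two statements that follow map a basic block to the block at the same position in another function with std::distance and std::advance, because LLVM's block list is a linked list rather than a random-access container. A minimal standalone illustration of the same idiom with std::list:

    #include <cassert>
    #include <iterator>
    #include <list>

    int main() {
      std::list<int> Original = {10, 20, 30, 40};
      std::list<int> Clone    = {11, 21, 31, 41};

      auto Pos = std::next(Original.begin(), 2);  // some block we located
      auto CloneIt = Clone.begin();
      // Walk to the same index in the clone: O(n) on both lists.
      std::advance(CloneIt, std::distance(Original.begin(), Pos));
      assert(*CloneIt == 31);
      return 0;
    }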
Function::iterator BBI = MF->begin(); std::advance(BBI, std::distance(F->begin(), Function::iterator(BB))); - TranslatedBlocksToNotExtract.insert(BBI); + TranslatedBlocksToNotExtract.insert(&*BBI); } while (!BlocksToNotExtractByName.empty()) { @@ -278,7 +280,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { BasicBlock &BB = *BI; if (BB.getName() != BlockName) continue; - TranslatedBlocksToNotExtract.insert(BI); + TranslatedBlocksToNotExtract.insert(&*BI); } } @@ -291,8 +293,8 @@ bool BlockExtractorPass::runOnModule(Module &M) { for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { SplitLandingPadPreds(&*F); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - if (!TranslatedBlocksToNotExtract.count(BB)) - BlocksToExtract.push_back(BB); + if (!TranslatedBlocksToNotExtract.count(&*BB)) + BlocksToExtract.push_back(&*BB); } for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) { diff --git a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp index c6795c623eff..7b515745c312 100644 --- a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -26,6 +28,8 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -59,9 +63,9 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { bool BitSetInfo::containsValue( const DataLayout &DL, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout, Value *V, uint64_t COffset) const { - if (auto GV = dyn_cast<GlobalVariable>(V)) { + if (auto GV = dyn_cast<GlobalObject>(V)) { auto I = GlobalLayout.find(GV); if (I == GlobalLayout.end()) return false; @@ -90,6 +94,21 @@ bool BitSetInfo::containsValue( return false; } +void BitSetInfo::print(raw_ostream &OS) const { + OS << "offset " << ByteOffset << " size " << BitSize << " align " + << (1 << AlignLog2); + + if (isAllOnes()) { + OS << " all-ones\n"; + return; + } + + OS << " { "; + for (uint64_t B : Bits) + OS << B << ' '; + OS << "}\n"; +} + BitSetInfo BitSetBuilder::build() { if (Min > Max) Min = 0; @@ -193,34 +212,48 @@ struct LowerBitSets : public ModulePass { Module *M; bool LinkerSubsectionsViaSymbols; + Triple::ArchType Arch; + Triple::ObjectFormatType ObjectFormat; IntegerType *Int1Ty; IntegerType *Int8Ty; IntegerType *Int32Ty; Type *Int32PtrTy; IntegerType *Int64Ty; - Type *IntPtrTy; + IntegerType *IntPtrTy; // The llvm.bitsets named metadata. NamedMDNode *BitSetNM; - // Mapping from bitset mdstrings to the call sites that test them. - DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites; + // Mapping from bitset identifiers to the call sites that test them. 
+ DenseMap<Metadata *, std::vector<CallInst *>> BitSetTestCallSites; std::vector<ByteArrayInfo> ByteArrayInfos; BitSetInfo - buildBitSet(MDString *BitSet, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); + buildBitSet(Metadata *BitSet, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); ByteArrayInfo *createByteArray(BitSetInfo &BSI); void allocateByteArrays(); Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI, Value *BitOffset); + void lowerBitSetCalls(ArrayRef<Metadata *> BitSets, + Constant *CombinedGlobalAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); Value * lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - GlobalVariable *CombinedGlobal, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); - void buildBitSetsFromGlobals(const std::vector<MDString *> &BitSets, - const std::vector<GlobalVariable *> &Globals); + Constant *CombinedGlobal, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); + void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> BitSets, + ArrayRef<GlobalVariable *> Globals); + unsigned getJumpTableEntrySize(); + Type *getJumpTableEntryType(); + Constant *createJumpTableEntry(GlobalObject *Src, Function *Dest, + unsigned Distance); + void verifyBitSetMDNode(MDNode *Op); + void buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, + ArrayRef<Function *> Functions); + void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> BitSets, + ArrayRef<GlobalObject *> Globals); bool buildBitSets(); bool eraseBitSetMetadata(); @@ -228,7 +261,7 @@ struct LowerBitSets : public ModulePass { bool runOnModule(Module &M) override; }; -} // namespace +} // anonymous namespace INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets", "Lower bitset metadata", false, false) @@ -244,6 +277,8 @@ bool LowerBitSets::doInitialization(Module &Mod) { Triple TargetTriple(M->getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); + Arch = TargetTriple.getArch(); + ObjectFormat = TargetTriple.getObjectFormat(); Int1Ty = Type::getInt1Ty(M->getContext()); Int8Ty = Type::getInt8Ty(M->getContext()); @@ -262,8 +297,8 @@ bool LowerBitSets::doInitialization(Module &Mod) { /// Build a bit set for BitSet using the object layouts in /// GlobalLayout. BitSetInfo LowerBitSets::buildBitSet( - MDString *BitSet, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Metadata *BitSet, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { BitSetBuilder BSB; // Compute the byte offset of each element of this bitset. 
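Between these two hunks, a note on what buildBitSet() ultimately produces. The BitSetInfo fields printed by print() earlier (ByteOffset, BitSize, AlignLog2, Bits) describe a set of evenly spaced offsets from a base address. The body of containsGlobalOffset() is not part of this diff, so the following is only a sketch of the arithmetic the representation implies, under assumed illustrative names:

    #include <cassert>
    #include <cstdint>
    #include <set>

    struct BitSetSketch {
      uint64_t ByteOffset;     // offset of the first member from the base
      uint64_t BitSize;        // number of bit positions covered
      unsigned AlignLog2;      // member positions are 2^AlignLog2 bytes apart
      std::set<uint64_t> Bits; // which positions are actually members

      bool containsOffset(uint64_t Offset) const {
        if (Offset < ByteOffset)
          return false;
        uint64_t Delta = Offset - ByteOffset;
        if (Delta & ((uint64_t(1) << AlignLog2) - 1))
          return false;        // misaligned addresses can never be members
        uint64_t BitIndex = Delta >> AlignLog2;
        return BitIndex < BitSize && Bits.count(BitIndex);
      }
    };

    int main() {
      BitSetSketch BS{/*ByteOffset=*/16, /*BitSize=*/4, /*AlignLog2=*/3, {0, 2}};
      assert(BS.containsOffset(16));   // position 0
      assert(BS.containsOffset(32));   // position 2
      assert(!BS.containsOffset(24));  // position 1: aligned, not a member
      assert(!BS.containsOffset(20));  // misaligned
      return 0;
    }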
@@ -271,8 +306,11 @@ BitSetInfo LowerBitSets::buildBitSet( for (MDNode *Op : BitSetNM->operands()) { if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) continue; - auto OpGlobal = dyn_cast<GlobalVariable>( - cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + Constant *OpConst = + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue(); + if (auto GA = dyn_cast<GlobalAlias>(OpConst)) + OpConst = GA->getAliasee(); + auto OpGlobal = dyn_cast<GlobalObject>(OpConst); if (!OpGlobal) continue; uint64_t Offset = @@ -360,9 +398,8 @@ void LowerBitSets::allocateByteArrays() { if (LinkerSubsectionsViaSymbols) { BAI->ByteArray->replaceAllUsesWith(GEP); } else { - GlobalAlias *Alias = - GlobalAlias::create(PointerType::getUnqual(Int8Ty), - GlobalValue::PrivateLinkage, "bits", GEP, M); + GlobalAlias *Alias = GlobalAlias::create( + Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, M); BAI->ByteArray->replaceAllUsesWith(Alias); } BAI->ByteArray->eraseFromParent(); @@ -404,7 +441,7 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, // Each use of the byte array uses a different alias. This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. - ByteArray = GlobalAlias::create(BAI->ByteArray->getType(), + ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0, GlobalValue::PrivateLinkage, "bits_use", ByteArray, M); } @@ -421,17 +458,16 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, /// replace the call with. Value *LowerBitSets::lowerBitSetCall( CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - GlobalVariable *CombinedGlobal, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Constant *CombinedGlobalIntAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { Value *Ptr = CI->getArgOperand(0); const DataLayout &DL = M->getDataLayout(); if (BSI.containsValue(DL, GlobalLayout, Ptr)) - return ConstantInt::getTrue(CombinedGlobal->getParent()->getContext()); + return ConstantInt::getTrue(M->getContext()); - Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy); Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( - GlobalAsInt, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); + CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); BasicBlock *InitialBB = CI->getParent(); @@ -490,18 +526,19 @@ Value *LowerBitSets::lowerBitSetCall( /// Given a disjoint set of bitsets and globals, layout the globals, build the /// bit sets and lower the llvm.bitset.test calls. -void LowerBitSets::buildBitSetsFromGlobals( - const std::vector<MDString *> &BitSets, - const std::vector<GlobalVariable *> &Globals) { +void LowerBitSets::buildBitSetsFromGlobalVariables( + ArrayRef<Metadata *> BitSets, ArrayRef<GlobalVariable *> Globals) { // Build a new global with the combined contents of the referenced globals. + // This global is a struct whose even-indexed elements contain the original + // contents of the referenced globals and whose odd-indexed elements contain + // any padding required to align the next element to the next power of 2. 
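As the layout comment above explains, the combined global interleaves the original initializers with padding, and the loop below computes that padding as NextPowerOf2(InitSize - 1) - InitSize. llvm::NextPowerOf2(N) returns the smallest power of two strictly greater than N, so this expression is exactly the pad needed to round InitSize up to a power of two, which keeps member offsets highly aligned and the resulting bit sets small. A standalone worked example (nextPowerOf2 and paddingFor are local re-implementations, and the clamp assumes the cap of 128 mentioned in the comment is a simple saturation):

    #include <cassert>
    #include <cstdint>

    uint64_t nextPowerOf2(uint64_t A) {  // smallest power of two > A
      A |= (A >> 1); A |= (A >> 2);  A |= (A >> 4);
      A |= (A >> 8); A |= (A >> 16); A |= (A >> 32);
      return A + 1;
    }

    uint64_t paddingFor(uint64_t InitSize) {
      uint64_t Padding = nextPowerOf2(InitSize - 1) - InitSize;
      return Padding > 128 ? 128 : Padding;  // assumed cap, per the comment
    }

    int main() {
      assert(paddingFor(24) == 8);     // pad a 24-byte global out to 32
      assert(paddingFor(32) == 0);     // already a power of two
      assert(paddingFor(600) == 128);  // 1024 - 600 = 424, capped at 128
      return 0;
    }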
std::vector<Constant *> GlobalInits; const DataLayout &DL = M->getDataLayout(); for (GlobalVariable *G : Globals) { GlobalInits.push_back(G->getInitializer()); - uint64_t InitSize = DL.getTypeAllocSize(G->getInitializer()->getType()); + uint64_t InitSize = DL.getTypeAllocSize(G->getValueType()); - // Compute the amount of padding required to align the next element to the - // next power of 2. + // Compute the amount of padding required. uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; // Cap at 128 was found experimentally to have a good data/instruction @@ -515,34 +552,20 @@ void LowerBitSets::buildBitSetsFromGlobals( if (!GlobalInits.empty()) GlobalInits.pop_back(); Constant *NewInit = ConstantStruct::getAnon(M->getContext(), GlobalInits); - auto CombinedGlobal = + auto *CombinedGlobal = new GlobalVariable(*M, NewInit->getType(), /*isConstant=*/true, GlobalValue::PrivateLinkage, NewInit); - const StructLayout *CombinedGlobalLayout = - DL.getStructLayout(cast<StructType>(NewInit->getType())); + StructType *NewTy = cast<StructType>(NewInit->getType()); + const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy); // Compute the offsets of the original globals within the new global. - DenseMap<GlobalVariable *, uint64_t> GlobalLayout; + DenseMap<GlobalObject *, uint64_t> GlobalLayout; for (unsigned I = 0; I != Globals.size(); ++I) // Multiply by 2 to account for padding elements. GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); - // For each bitset in this disjoint set... - for (MDString *BS : BitSets) { - // Build the bitset. - BitSetInfo BSI = buildBitSet(BS, GlobalLayout); - - ByteArrayInfo *BAI = 0; - - // Lower each call to llvm.bitset.test for this bitset. - for (CallInst *CI : BitSetTestCallSites[BS]) { - ++NumBitSetCallsLowered; - Value *Lowered = lowerBitSetCall(CI, BSI, BAI, CombinedGlobal, GlobalLayout); - CI->replaceAllUsesWith(Lowered); - CI->eraseFromParent(); - } - } + lowerBitSetCalls(BitSets, CombinedGlobal, GlobalLayout); // Build aliases pointing to offsets into the combined global for each // global from which we built the combined global, and replace references @@ -556,9 +579,11 @@ void LowerBitSets::buildBitSetsFromGlobals( if (LinkerSubsectionsViaSymbols) { Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr); } else { - GlobalAlias *GAlias = - GlobalAlias::create(Globals[I]->getType(), Globals[I]->getLinkage(), - "", CombinedGlobalElemPtr, M); + assert(Globals[I]->getType()->getAddressSpace() == 0); + GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0, + Globals[I]->getLinkage(), "", + CombinedGlobalElemPtr, M); + GAlias->setVisibility(Globals[I]->getVisibility()); GAlias->takeName(Globals[I]); Globals[I]->replaceAllUsesWith(GAlias); } @@ -566,6 +591,331 @@ void LowerBitSets::buildBitSetsFromGlobals( } } +void LowerBitSets::lowerBitSetCalls( + ArrayRef<Metadata *> BitSets, Constant *CombinedGlobalAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { + Constant *CombinedGlobalIntAddr = + ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy); + + // For each bitset in this disjoint set... + for (Metadata *BS : BitSets) { + // Build the bitset. + BitSetInfo BSI = buildBitSet(BS, GlobalLayout); + DEBUG({ + if (auto BSS = dyn_cast<MDString>(BS)) + dbgs() << BSS->getString() << ": "; + else + dbgs() << "<unnamed>: "; + BSI.print(dbgs()); + }); + + ByteArrayInfo *BAI = nullptr; + + // Lower each call to llvm.bitset.test for this bitset. 
+ for (CallInst *CI : BitSetTestCallSites[BS]) { + ++NumBitSetCallsLowered; + Value *Lowered = + lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalLayout); + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); + } + } +} + +void LowerBitSets::verifyBitSetMDNode(MDNode *Op) { + if (Op->getNumOperands() != 3) + report_fatal_error( + "All operands of llvm.bitsets metadata must have 3 elements"); + if (!Op->getOperand(1)) + return; + + auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); + if (!OpConstMD) + report_fatal_error("Bit set element must be a constant"); + auto OpGlobal = dyn_cast<GlobalObject>(OpConstMD->getValue()); + if (!OpGlobal) + return; + + if (OpGlobal->isThreadLocal()) + report_fatal_error("Bit set element may not be thread-local"); + if (OpGlobal->hasSection()) + report_fatal_error("Bit set element may not have an explicit section"); + + if (isa<GlobalVariable>(OpGlobal) && OpGlobal->isDeclarationForLinker()) + report_fatal_error("Bit set global var element must be a definition"); + + auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); + if (!OffsetConstMD) + report_fatal_error("Bit set element offset must be a constant"); + auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); + if (!OffsetInt) + report_fatal_error("Bit set element offset must be an integer constant"); +} + +static const unsigned kX86JumpTableEntrySize = 8; + +unsigned LowerBitSets::getJumpTableEntrySize() { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + return kX86JumpTableEntrySize; +} + +// Create a constant representing a jump table entry for the target. This +// consists of an instruction sequence containing a relative branch to Dest. The +// constant will be laid out at address Src+(Len*Distance) where Len is the +// target-specific jump table entry size. +Constant *LowerBitSets::createJumpTableEntry(GlobalObject *Src, Function *Dest, + unsigned Distance) { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + const unsigned kJmpPCRel32Code = 0xe9; + const unsigned kInt3Code = 0xcc; + + ConstantInt *Jmp = ConstantInt::get(Int8Ty, kJmpPCRel32Code); + + // Build a constant representing the displacement between the constant's + // address and Dest. This will resolve to a PC32 relocation referring to Dest. + Constant *DestInt = ConstantExpr::getPtrToInt(Dest, IntPtrTy); + Constant *SrcInt = ConstantExpr::getPtrToInt(Src, IntPtrTy); + Constant *Disp = ConstantExpr::getSub(DestInt, SrcInt); + ConstantInt *DispOffset = + ConstantInt::get(IntPtrTy, Distance * kX86JumpTableEntrySize + 5); + Constant *OffsetedDisp = ConstantExpr::getSub(Disp, DispOffset); + OffsetedDisp = ConstantExpr::getTruncOrBitCast(OffsetedDisp, Int32Ty); + + ConstantInt *Int3 = ConstantInt::get(Int8Ty, kInt3Code); + + Constant *Fields[] = { + Jmp, OffsetedDisp, Int3, Int3, Int3, + }; + return ConstantStruct::getAnon(Fields, /*Packed=*/true); +} + +Type *LowerBitSets::getJumpTableEntryType() { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + return StructType::get(M->getContext(), + {Int8Ty, Int32Ty, Int8Ty, Int8Ty, Int8Ty}, + /*Packed=*/true); +} + +/// Given a disjoint set of bitsets and functions, build a jump table for the +/// functions, build the bit sets and lower the llvm.bitset.test calls. 
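Before the function-bitset builder below, a worked example of the displacement arithmetic in createJumpTableEntry() above. An x86 jmp rel32 encodes its target relative to the end of the jump instruction, which is 5 bytes (1 opcode byte + 4 displacement bytes) past the start, and entry I begins I * 8 bytes into the table; hence the subtraction of Distance * kX86JumpTableEntrySize + 5. A standalone sketch:

    #include <cassert>
    #include <cstdint>

    int32_t jumpTableDisp(uint64_t Src, uint64_t Dest, unsigned Distance) {
      const unsigned EntrySize = 8;        // kX86JumpTableEntrySize
      uint64_t EntryStart = Src + Distance * EntrySize;
      uint64_t NextInsn = EntryStart + 5;  // address after the jmp rel32
      // Equivalent to (Dest - Src) - (Distance * 8 + 5) in the pass.
      return static_cast<int32_t>(Dest - NextInsn);
    }

    int main() {
      // Table at 0x1000; entry 2 branches to a function at 0x2000. The jmp
      // occupies [0x1010, 0x1015), so the displacement is 0x2000 - 0x1015.
      assert(jumpTableDisp(0x1000, 0x2000, 2) == 0xFEB);
      return 0;
    }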
+void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, + ArrayRef<Function *> Functions) { + // Unlike the global bitset builder, the function bitset builder cannot + // re-arrange functions in a particular order and base its calculations on the + // layout of the functions' entry points, as we have no idea how large a + // particular function will end up being (the size could even depend on what + // this pass does!) Instead, we build a jump table, which is a block of code + // consisting of one branch instruction for each of the functions in the bit + // set that branches to the target function, and redirect any taken function + // addresses to the corresponding jump table entry. In the object file's + // symbol table, the symbols for the target functions also refer to the jump + // table entries, so that addresses taken outside the module will pass any + // verification done inside the module. + // + // In more concrete terms, suppose we have three functions f, g, h which are + // members of a single bitset, and a function foo that returns their + // addresses: + // + // f: + // mov 0, %eax + // ret + // + // g: + // mov 1, %eax + // ret + // + // h: + // mov 2, %eax + // ret + // + // foo: + // mov f, %eax + // mov g, %edx + // mov h, %ecx + // ret + // + // To create a jump table for these functions, we instruct the LLVM code + // generator to output a jump table in the .text section. This is done by + // representing the instructions in the jump table as an LLVM constant and + // placing them in a global variable in the .text section. The end result will + // (conceptually) look like this: + // + // f: + // jmp .Ltmp0 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // g: + // jmp .Ltmp1 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // h: + // jmp .Ltmp2 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // .Ltmp0: + // mov 0, %eax + // ret + // + // .Ltmp1: + // mov 1, %eax + // ret + // + // .Ltmp2: + // mov 2, %eax + // ret + // + // foo: + // mov f, %eax + // mov g, %edx + // mov h, %ecx + // ret + // + // Because the addresses of f, g, h are evenly spaced at a power of 2, in the + // normal case the check can be carried out using the same kind of simple + // arithmetic that we normally use for globals. + + assert(!Functions.empty()); + + // Build a simple layout based on the regular layout of jump tables. + DenseMap<GlobalObject *, uint64_t> GlobalLayout; + unsigned EntrySize = getJumpTableEntrySize(); + for (unsigned I = 0; I != Functions.size(); ++I) + GlobalLayout[Functions[I]] = I * EntrySize; + + // Create a constant to hold the jump table. + ArrayType *JumpTableType = + ArrayType::get(getJumpTableEntryType(), Functions.size()); + auto JumpTable = new GlobalVariable(*M, JumpTableType, + /*isConstant=*/true, + GlobalValue::PrivateLinkage, nullptr); + JumpTable->setSection(ObjectFormat == Triple::MachO + ? "__TEXT,__text,regular,pure_instructions" + : ".text"); + lowerBitSetCalls(BitSets, JumpTable, GlobalLayout); + + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. 
+ for (unsigned I = 0; I != Functions.size(); ++I) { + Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( + ConstantExpr::getGetElementPtr( + JumpTableType, JumpTable, + ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}), + Functions[I]->getType()); + if (LinkerSubsectionsViaSymbols || Functions[I]->isDeclarationForLinker()) { + Functions[I]->replaceAllUsesWith(CombinedGlobalElemPtr); + } else { + assert(Functions[I]->getType()->getAddressSpace() == 0); + GlobalAlias *GAlias = GlobalAlias::create(Functions[I]->getValueType(), 0, + Functions[I]->getLinkage(), "", + CombinedGlobalElemPtr, M); + GAlias->setVisibility(Functions[I]->getVisibility()); + GAlias->takeName(Functions[I]); + Functions[I]->replaceAllUsesWith(GAlias); + } + if (!Functions[I]->isDeclarationForLinker()) + Functions[I]->setLinkage(GlobalValue::PrivateLinkage); + } + + // Build and set the jump table's initializer. + std::vector<Constant *> JumpTableEntries; + for (unsigned I = 0; I != Functions.size(); ++I) + JumpTableEntries.push_back( + createJumpTableEntry(JumpTable, Functions[I], I)); + JumpTable->setInitializer( + ConstantArray::get(JumpTableType, JumpTableEntries)); +} + +void LowerBitSets::buildBitSetsFromDisjointSet( + ArrayRef<Metadata *> BitSets, ArrayRef<GlobalObject *> Globals) { + llvm::DenseMap<Metadata *, uint64_t> BitSetIndices; + llvm::DenseMap<GlobalObject *, uint64_t> GlobalIndices; + for (unsigned I = 0; I != BitSets.size(); ++I) + BitSetIndices[BitSets[I]] = I; + for (unsigned I = 0; I != Globals.size(); ++I) + GlobalIndices[Globals[I]] = I; + + // For each bitset, build a set of indices that refer to globals referenced by + // the bitset. + std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); + if (BitSetNM) { + for (MDNode *Op : BitSetNM->operands()) { + // Op = { bitset name, global, offset } + if (!Op->getOperand(1)) + continue; + auto I = BitSetIndices.find(Op->getOperand(0)); + if (I == BitSetIndices.end()) + continue; + + auto OpGlobal = dyn_cast<GlobalObject>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + if (!OpGlobal) + continue; + BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); + } + } + + // Order the sets of indices by size. The GlobalLayoutBuilder works best + // when given small index sets first. + std::stable_sort( + BitSetMembers.begin(), BitSetMembers.end(), + [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { + return O1.size() < O2.size(); + }); + + // Create a GlobalLayoutBuilder and provide it with index sets as layout + // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as + // close together as possible. + GlobalLayoutBuilder GLB(Globals.size()); + for (auto &&MemSet : BitSetMembers) + GLB.addFragment(MemSet); + + // Build the bitsets from this disjoint set. + if (Globals.empty() || isa<GlobalVariable>(Globals[0])) { + // Build a vector of global variables with the computed layout. + std::vector<GlobalVariable *> OrderedGVs(Globals.size()); + auto OGI = OrderedGVs.begin(); + for (auto &&F : GLB.Fragments) { + for (auto &&Offset : F) { + auto GV = dyn_cast<GlobalVariable>(Globals[Offset]); + if (!GV) + report_fatal_error( + "Bit set may not contain both global variables and functions"); + *OGI++ = GV; + } + } + + buildBitSetsFromGlobalVariables(BitSets, OrderedGVs); + } else { + // Build a vector of functions with the computed layout. 
+    std::vector<Function *> OrderedFns(Globals.size());
+    auto OFI = OrderedFns.begin();
+    for (auto &&F : GLB.Fragments) {
+      for (auto &&Offset : F) {
+        auto Fn = dyn_cast<Function>(Globals[Offset]);
+        if (!Fn)
+          report_fatal_error(
+              "Bit set may not contain both global variables and functions");
+        *OFI++ = Fn;
+      }
+    }
+
+    buildBitSetsFromFunctions(BitSets, OrderedFns);
+  }
+}
+
 /// Lower all bit sets in this module.
 bool LowerBitSets::buildBitSets() {
   Function *BitSetTestFunc =
@@ -576,24 +926,36 @@ bool LowerBitSets::buildBitSets() {
   // Equivalence class set containing bitsets and the globals they reference.
   // This is used to partition the set of bitsets in the module into disjoint
   // sets.
-  typedef EquivalenceClasses<PointerUnion<GlobalVariable *, MDString *>>
+  typedef EquivalenceClasses<PointerUnion<GlobalObject *, Metadata *>>
       GlobalClassesTy;
   GlobalClassesTy GlobalClasses;

+  // Verify the bitset metadata and build a mapping from bitset identifiers to
+  // their last observed index in BitSetNM. This will be used later to
+  // deterministically order the list of bitset identifiers.
+  llvm::DenseMap<Metadata *, unsigned> BitSetIdIndices;
+  if (BitSetNM) {
+    for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) {
+      MDNode *Op = BitSetNM->getOperand(I);
+      verifyBitSetMDNode(Op);
+      BitSetIdIndices[Op->getOperand(0)] = I;
+    }
+  }
+
   for (const Use &U : BitSetTestFunc->uses()) {
     auto CI = cast<CallInst>(U.getUser());

     auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
-    if (!BitSetMDVal || !isa<MDString>(BitSetMDVal->getMetadata()))
+    if (!BitSetMDVal)
       report_fatal_error(
-          "Second argument of llvm.bitset.test must be metadata string");
-    auto BitSet = cast<MDString>(BitSetMDVal->getMetadata());
+          "Second argument of llvm.bitset.test must be metadata");
+    auto BitSet = BitSetMDVal->getMetadata();

     // Add the call site to the list of call sites for this bit set. We also use
     // BitSetTestCallSites to keep track of whether we have seen this bit set
     // before. If we have, we don't need to re-add the referenced globals to the
     // equivalence class.
     std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator,
               bool> Ins =
         BitSetTestCallSites.insert(
             std::make_pair(BitSet, std::vector<CallInst *>()));
@@ -608,31 +970,16 @@ bool LowerBitSets::buildBitSets() {
     if (!BitSetNM)
       continue;

-    // Verify the bitset metadata and add the referenced globals to the bitset's
-    // equivalence class.
+    // Add the referenced globals to the bitset's equivalence class.
for (MDNode *Op : BitSetNM->operands()) { - if (Op->getNumOperands() != 3) - report_fatal_error( - "All operands of llvm.bitsets metadata must have 3 elements"); - if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) continue; - auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); - if (!OpConstMD) - report_fatal_error("Bit set element must be a constant"); - auto OpGlobal = dyn_cast<GlobalVariable>(OpConstMD->getValue()); + auto OpGlobal = dyn_cast<GlobalObject>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); if (!OpGlobal) continue; - auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); - if (!OffsetConstMD) - report_fatal_error("Bit set element offset must be a constant"); - auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); - if (!OffsetInt) - report_fatal_error( - "Bit set element offset must be an integer constant"); - CurSet = GlobalClasses.unionSets( CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal))); } @@ -641,79 +988,51 @@ bool LowerBitSets::buildBitSets() { if (GlobalClasses.empty()) return false; - // For each disjoint set we found... + // Build a list of disjoint sets ordered by their maximum BitSetNM index + // for determinism. + std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets; for (GlobalClassesTy::iterator I = GlobalClasses.begin(), E = GlobalClasses.end(); I != E; ++I) { if (!I->isLeader()) continue; - ++NumBitSetDisjointSets; - // Build the list of bitsets and referenced globals in this disjoint set. - std::vector<MDString *> BitSets; - std::vector<GlobalVariable *> Globals; - llvm::DenseMap<MDString *, uint64_t> BitSetIndices; - llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices; + unsigned MaxIndex = 0; for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I); MI != GlobalClasses.member_end(); ++MI) { - if ((*MI).is<MDString *>()) { - BitSetIndices[MI->get<MDString *>()] = BitSets.size(); - BitSets.push_back(MI->get<MDString *>()); - } else { - GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size(); - Globals.push_back(MI->get<GlobalVariable *>()); - } + if ((*MI).is<Metadata *>()) + MaxIndex = std::max(MaxIndex, BitSetIdIndices[MI->get<Metadata *>()]); } + Sets.emplace_back(I, MaxIndex); + } + std::sort(Sets.begin(), Sets.end(), + [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1, + const std::pair<GlobalClassesTy::iterator, unsigned> &S2) { + return S1.second < S2.second; + }); - // For each bitset, build a set of indices that refer to globals referenced - // by the bitset. - std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); - if (BitSetNM) { - for (MDNode *Op : BitSetNM->operands()) { - // Op = { bitset name, global, offset } - if (!Op->getOperand(1)) - continue; - auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0))); - if (I == BitSetIndices.end()) - continue; - - auto OpGlobal = dyn_cast<GlobalVariable>( - cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); - if (!OpGlobal) - continue; - BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); - } + // For each disjoint set we found... + for (const auto &S : Sets) { + // Build the list of bitsets in this disjoint set. 
+ std::vector<Metadata *> BitSets; + std::vector<GlobalObject *> Globals; + for (GlobalClassesTy::member_iterator MI = + GlobalClasses.member_begin(S.first); + MI != GlobalClasses.member_end(); ++MI) { + if ((*MI).is<Metadata *>()) + BitSets.push_back(MI->get<Metadata *>()); + else + Globals.push_back(MI->get<GlobalObject *>()); } - // Order the sets of indices by size. The GlobalLayoutBuilder works best - // when given small index sets first. - std::stable_sort( - BitSetMembers.begin(), BitSetMembers.end(), - [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { - return O1.size() < O2.size(); - }); - - // Create a GlobalLayoutBuilder and provide it with index sets as layout - // fragments. The GlobalLayoutBuilder tries to lay out members of fragments - // as close together as possible. - GlobalLayoutBuilder GLB(Globals.size()); - for (auto &&MemSet : BitSetMembers) - GLB.addFragment(MemSet); - - // Build a vector of globals with the computed layout. - std::vector<GlobalVariable *> OrderedGlobals(Globals.size()); - auto OGI = OrderedGlobals.begin(); - for (auto &&F : GLB.Fragments) - for (auto &&Offset : F) - *OGI++ = Globals[Offset]; - - // Order bitsets by name for determinism. - std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) { - return S1->getString() < S2->getString(); + // Order bitsets by BitSetNM index for determinism. This ordering is stable + // as there is a one-to-one mapping between metadata and indices. + std::sort(BitSets.begin(), BitSets.end(), [&](Metadata *M1, Metadata *M2) { + return BitSetIdIndices[M1] < BitSetIdIndices[M2]; }); - // Build the bitsets from this disjoint set. - buildBitSetsFromGlobals(BitSets, OrderedGlobals); + // Lower the bitsets in this disjoint set. + buildBitSetsFromDisjointSet(BitSets, Globals); } allocateByteArrays(); diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 2e3519eac6a5..8a209a18c540 100644 --- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -27,6 +27,14 @@ // -- We define Function* container class with custom "operator<" (FunctionPtr). // -- "FunctionPtr" instances are stored in std::set collection, so every // std::set::insert operation will give you result in log(N) time. +// +// As an optimization, a hash of the function structure is calculated first, and +// two functions are only compared if they have the same hash. This hash is +// cheap to compute, and has the property that if function F == G according to +// the comparison function, then hash(F) == hash(G). This consistency property +// is critical to ensuring all possible merging opportunities are exploited. +// Collisions in the hash affect the speed of the pass but not the correctness +// or determinism of the resulting transformation. // // When a match is found the functions are folded. 
If both functions are // overridable, we move the functionality into a new internal function and @@ -87,6 +95,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Hashing.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -97,12 +106,14 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <vector> + using namespace llvm; #define DEBUG_TYPE "mergefunc" @@ -121,21 +132,64 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck( namespace { +/// GlobalNumberState assigns an integer to each global value in the program, +/// which is used by the comparison routine to order references to globals. This +/// state must be preserved throughout the pass, because Functions and other +/// globals need to maintain their relative order. Globals are assigned a number +/// when they are first visited. This order is deterministic, and so the +/// assigned numbers are as well. When two functions are merged, neither number +/// is updated. If the symbols are weak, this would be incorrect. If they are +/// strong, then one will be replaced at all references to the other, and so +/// direct callsites will now see one or the other symbol, and no update is +/// necessary. Note that if we were guaranteed unique names, we could just +/// compare those, but this would not work for stripped bitcodes or for those +/// few symbols without a name. +class GlobalNumberState { + struct Config : ValueMapConfig<GlobalValue*> { + enum { FollowRAUW = false }; + }; + // Each GlobalValue is mapped to an identifier. The Config ensures when RAUW + // occurs, the mapping does not change. Tracking changes is unnecessary, and + // also problematic for weak symbols (which may be overwritten). + typedef ValueMap<GlobalValue *, uint64_t, Config> ValueNumberMap; + ValueNumberMap GlobalNumbers; + // The next unused serial number to assign to a global. + uint64_t NextNumber; + public: + GlobalNumberState() : GlobalNumbers(), NextNumber(0) {} + uint64_t getNumber(GlobalValue* Global) { + ValueNumberMap::iterator MapIter; + bool Inserted; + std::tie(MapIter, Inserted) = GlobalNumbers.insert({Global, NextNumber}); + if (Inserted) + NextNumber++; + return MapIter->second; + } + void clear() { + GlobalNumbers.clear(); + } +}; + /// FunctionComparator - Compares two functions to determine whether or not /// they will generate machine code with the same behaviour. DataLayout is /// used if available. The comparator always fails conservatively (erring on the /// side of claiming that two functions are different). class FunctionComparator { public: - FunctionComparator(const Function *F1, const Function *F2) - : FnL(F1), FnR(F2) {} + FunctionComparator(const Function *F1, const Function *F2, + GlobalNumberState* GN) + : FnL(F1), FnR(F2), GlobalNumbers(GN) {} /// Test whether the two functions have equivalent behaviour. int compare(); + /// Hash a function. Equivalent functions will have the same hash, and unequal + /// functions will have different hashes with high probability. + typedef uint64_t FunctionHash; + static FunctionHash functionHash(Function &); private: /// Test whether two basic blocks have equivalent behaviour. 
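GlobalNumberState above boils down to "assign a fresh number the first time a global is seen". A standalone sketch of that idea in plain C++, with std::unordered_map standing in for llvm::ValueMap (the real map additionally sets FollowRAUW = false, so a global that is replaced during merging keeps its original number):

    #include <cstdint>
    #include <unordered_map>

    class GlobalNumbererSketch {
      std::unordered_map<const void *, uint64_t> Numbers;
      uint64_t NextNumber = 0;

    public:
      // Deterministic as long as globals are first visited in a deterministic
      // order: a fresh number is handed out only on first insertion.
      uint64_t getNumber(const void *Global) {
        auto [It, Inserted] = Numbers.try_emplace(Global, NextNumber);
        if (Inserted)
          ++NextNumber;
        return It->second;
      }
    };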
-  int compare(const BasicBlock *BBL, const BasicBlock *BBR);
+  int cmpBasicBlocks(const BasicBlock *BBL, const BasicBlock *BBR);

   /// Constants comparison.
   /// Its analog to lexicographical comparison between hypothetical numbers
@@ -241,6 +295,10 @@ private:
   /// If these properties are equal - compare their contents.
   int cmpConstants(const Constant *L, const Constant *R);

+  /// Compares two global values by number. Uses the GlobalNumberState to
+  /// identify the same globals across function calls.
+  int cmpGlobalValues(GlobalValue *L, GlobalValue *R);
+
   /// Assign or look up previously assigned numbers for the two values, and
   /// return whether the numbers are equal. Numbers are assigned in the order
   /// visited.
@@ -320,8 +378,9 @@ private:
   ///
   /// 1. If types are of different kind (different type IDs).
   ///    Return result of type IDs comparison, treating them as numbers.
-  /// 2. If types are vectors or integers, compare Type* values as numbers.
-  /// 3. Types has same ID, so check whether they belongs to the next group:
+  /// 2. If types are integers, check that they have the same width. If they
+  /// are vectors, check that they have the same count and subtype.
+  /// 3. Types have the same ID, so check whether they are one of:
   /// * Void
   /// * Float
   /// * Double
   /// * X86_FP80
   /// * Fp128
   /// * PPC_FP128
   /// * Label
   /// * Metadata
-  /// If so - return 0, yes - we can treat these types as equal only because
-  /// their IDs are same.
+  /// We can treat these types as equal whenever their IDs are the same.
   /// 4. If Left and Right are pointers, return result of address space
   /// comparison (numbers comparison). We can treat pointer types of same
   /// address space as equal.
@@ -343,11 +401,13 @@ private:
   int cmpTypes(Type *TyL, Type *TyR) const;

   int cmpNumbers(uint64_t L, uint64_t R) const;
-
   int cmpAPInts(const APInt &L, const APInt &R) const;
   int cmpAPFloats(const APFloat &L, const APFloat &R) const;
-  int cmpStrings(StringRef L, StringRef R) const;
+  int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const;
+  int cmpMem(StringRef L, StringRef R) const;
   int cmpAttrs(const AttributeSet L, const AttributeSet R) const;
+  int cmpRangeMetadata(const MDNode* L, const MDNode* R) const;
+  int cmpOperandBundlesSchema(const Instruction *L, const Instruction *R) const;

   // The two functions undergoing comparison.
   const Function *FnL, *FnR;
@@ -386,30 +446,30 @@ private:
   /// could be operands from further BBs we didn't scan yet.
   /// So it's impossible to use dominance properties in general.
   DenseMap<const Value*, int> sn_mapL, sn_mapR;
+
+  // The global state we will use
+  GlobalNumberState* GlobalNumbers;
 };

 class FunctionNode {
   mutable AssertingVH<Function> F;
-
+  FunctionComparator::FunctionHash Hash;
 public:
-  FunctionNode(Function *F) : F(F) {}
+  // Note the hash is recalculated potentially multiple times, but it is cheap.
+  FunctionNode(Function *F)
+      : F(F), Hash(FunctionComparator::functionHash(*F)) {}

   Function *getFunc() const { return F; }
+  FunctionComparator::FunctionHash getHash() const { return Hash; }

   /// Replace the reference to the function F by the function G, assuming their
   /// implementations are equal.
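One pattern is worth naming before the declarations continue: every cmp* routine above builds a total order by chaining three-way comparisons and returning the first nonzero result, so 0 consistently means "equal so far, keep descending into finer-grained properties". A standalone sketch of the idiom (Sig and cmpSigs are illustrative, not types from this pass):

    #include <cassert>
    #include <cstdint>
    #include <string>

    int cmpNumbers(uint64_t L, uint64_t R) {
      if (L < R) return -1;
      if (L > R) return 1;
      return 0;
    }

    struct Sig {
      uint64_t Arity;
      bool VarArg;
      std::string Section;
    };

    // Any difference decides the order; equality falls through to the next,
    // usually more expensive, field.
    int cmpSigs(const Sig &L, const Sig &R) {
      if (int Res = cmpNumbers(L.Arity, R.Arity))
        return Res;
      if (int Res = cmpNumbers(L.VarArg, R.VarArg))
        return Res;
      return L.Section.compare(R.Section);
    }

    int main() {
      assert(cmpSigs({2, false, ".text"}, {2, false, ".text"}) == 0);
      assert(cmpSigs({1, false, ".text"}, {2, false, ".text"}) < 0);
      return 0;
    }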
void replaceBy(Function *G) const { - assert(!(*this < FunctionNode(G)) && !(FunctionNode(G) < *this) && - "The two functions must be equal"); - F = G; } - void release() { F = 0; } - bool operator<(const FunctionNode &RHS) const { - return (FunctionComparator(F, RHS.getFunc()).compare()) == -1; - } + void release() { F = nullptr; } }; -} +} // end anonymous namespace int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { if (L < R) return -1; @@ -426,13 +486,25 @@ int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const { } int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const { - if (int Res = cmpNumbers((uint64_t)&L.getSemantics(), - (uint64_t)&R.getSemantics())) + // Floats are ordered first by semantics (i.e. float, double, half, etc.), + // then by value interpreted as a bitstring (aka APInt). + const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics(); + if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL), + APFloat::semanticsPrecision(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL), + APFloat::semanticsMaxExponent(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL), + APFloat::semanticsMinExponent(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL), + APFloat::semanticsSizeInBits(SR))) return Res; return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt()); } -int FunctionComparator::cmpStrings(StringRef L, StringRef R) const { +int FunctionComparator::cmpMem(StringRef L, StringRef R) const { // Prevent heavy comparison, compare sizes first. if (int Res = cmpNumbers(L.size(), R.size())) return Res; @@ -466,6 +538,59 @@ int FunctionComparator::cmpAttrs(const AttributeSet L, return 0; } +int FunctionComparator::cmpRangeMetadata(const MDNode* L, + const MDNode* R) const { + if (L == R) + return 0; + if (!L) + return -1; + if (!R) + return 1; + // Range metadata is a sequence of numbers. Make sure they are the same + // sequence. + // TODO: Note that as this is metadata, it is possible to drop and/or merge + // this data when considering functions to merge. Thus this comparison would + // return 0 (i.e. equivalent), but merging would become more complicated + // because the ranges would need to be unioned. It is not likely that + // functions differ ONLY in this metadata if they are actually the same + // function semantically. + if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands())) + return Res; + for (size_t I = 0; I < L->getNumOperands(); ++I) { + ConstantInt* LLow = mdconst::extract<ConstantInt>(L->getOperand(I)); + ConstantInt* RLow = mdconst::extract<ConstantInt>(R->getOperand(I)); + if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue())) + return Res; + } + return 0; +} + +int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L, + const Instruction *R) const { + ImmutableCallSite LCS(L); + ImmutableCallSite RCS(R); + + assert(LCS && RCS && "Must be calls or invokes!"); + assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!"); + + if (int Res = + cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles())) + return Res; + + for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) { + auto OBL = LCS.getOperandBundleAt(i); + auto OBR = RCS.getOperandBundleAt(i); + + if (int Res = OBL.getTagName().compare(OBR.getTagName())) + return Res; + + if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size())) + return Res; + } + + return 0; +} + /// Constants comparison: /// 1. 
Check whether type of L constant could be losslessly bitcasted to R
/// type.
@@ -500,9 +625,9 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {
     unsigned TyLWidth = 0;
     unsigned TyRWidth = 0;

-    if (const VectorType *VecTyL = dyn_cast<VectorType>(TyL))
+    if (auto *VecTyL = dyn_cast<VectorType>(TyL))
       TyLWidth = VecTyL->getBitWidth();
-    if (const VectorType *VecTyR = dyn_cast<VectorType>(TyR))
+    if (auto *VecTyR = dyn_cast<VectorType>(TyR))
       TyRWidth = VecTyR->getBitWidth();

     if (TyLWidth != TyRWidth)
@@ -538,11 +663,29 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {
   if (!L->isNullValue() && R->isNullValue())
     return -1;

+  auto GlobalValueL = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(L));
+  auto GlobalValueR = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(R));
+  if (GlobalValueL && GlobalValueR) {
+    return cmpGlobalValues(GlobalValueL, GlobalValueR);
+  }
+
   if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
     return Res;

+  if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
+    const auto *SeqR = cast<ConstantDataSequential>(R);
+    // This handles ConstantDataArray and ConstantDataVector. Note that we
+    // compare the two raw data arrays, which might differ depending on the
+    // host endianness. This isn't a problem though, because the endianness of
+    // a module will affect the order of the constants, but this order is the
+    // same for a given input module and host platform.
+    return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
+  }
+
   switch (L->getValueID()) {
-  case Value::UndefValueVal: return TypesRes;
+  case Value::UndefValueVal:
+  case Value::ConstantTokenNoneVal:
+    return TypesRes;
   case Value::ConstantIntVal: {
     const APInt &LInt = cast<ConstantInt>(L)->getValue();
     const APInt &RInt = cast<ConstantInt>(R)->getValue();
@@ -609,19 +752,55 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {
     }
     return 0;
   }
-  case Value::FunctionVal:
-  case Value::GlobalVariableVal:
-  case Value::GlobalAliasVal:
-  default: // Unknown constant, cast L and R pointers to numbers and compare.
-    return cmpNumbers((uint64_t)L, (uint64_t)R);
+  case Value::BlockAddressVal: {
+    const BlockAddress *LBA = cast<BlockAddress>(L);
+    const BlockAddress *RBA = cast<BlockAddress>(R);
+    if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
+      return Res;
+    if (LBA->getFunction() == RBA->getFunction()) {
+      // They are BBs in the same function. Order by which comes first in the
+      // BB order of the function. This order is deterministic.
+      Function* F = LBA->getFunction();
+      BasicBlock *LBB = LBA->getBasicBlock();
+      BasicBlock *RBB = RBA->getBasicBlock();
+      if (LBB == RBB)
+        return 0;
+      for(BasicBlock &BB : F->getBasicBlockList()) {
+        if (&BB == LBB) {
+          assert(&BB != RBB);
+          return -1;
+        }
+        if (&BB == RBB)
+          return 1;
+      }
+      llvm_unreachable("Basic Block Address does not point to a basic block in "
+                       "its function.");
+      return -1;
+    } else {
+      // cmpValues said the functions are the same. So because they aren't
+      // literally the same pointer, they must respectively be the left and
+      // right functions.
+      assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
+      // cmpValues will tell us if these are equivalent BasicBlocks, in the
+      // context of their respective functions.
+      return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
+    }
+  }
+  default: // Unknown constant, abort.
+ DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n"); + llvm_unreachable("Constant ValueID not recognized."); + return -1; + } +} + +int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue* R) { + return cmpNumbers(GlobalNumbers->getNumber(L), GlobalNumbers->getNumber(R)); } /// cmpType - compares two types, /// defines total ordering among the types set. /// See method declaration comments for more details. int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { - PointerType *PTyL = dyn_cast<PointerType>(TyL); PointerType *PTyR = dyn_cast<PointerType>(TyR); @@ -642,10 +821,15 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { llvm_unreachable("Unknown type!"); // Fall through in Release mode. case Type::IntegerTyID: - case Type::VectorTyID: - // TyL == TyR would have returned true earlier. - return cmpNumbers((uint64_t)TyL, (uint64_t)TyR); - + return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(), + cast<IntegerType>(TyR)->getBitWidth()); + case Type::VectorTyID: { + VectorType *VTyL = cast<VectorType>(TyL), *VTyR = cast<VectorType>(TyR); + if (int Res = cmpNumbers(VTyL->getNumElements(), VTyR->getNumElements())) + return Res; + return cmpTypes(VTyL->getElementType(), VTyR->getElementType()); + } + // TyL == TyR would have returned true earlier, because types are uniqued. case Type::VoidTyID: case Type::FloatTyID: case Type::DoubleTyID: @@ -654,6 +838,7 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { case Type::PPC_FP128TyID: case Type::LabelTyID: case Type::MetadataTyID: + case Type::TokenTyID: return 0; case Type::PointerTyID: { @@ -759,8 +944,8 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope())) return Res; - return cmpNumbers((uint64_t)LI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); + return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range), + cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); } if (const StoreInst *SI = dyn_cast<StoreInst>(L)) { if (int Res = @@ -783,20 +968,24 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes())) return Res; - return cmpNumbers( - (uint64_t)CI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); + if (int Res = cmpOperandBundlesSchema(CI, R)) + return Res; + return cmpRangeMetadata( + CI->getMetadata(LLVMContext::MD_range), + cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); } - if (const InvokeInst *CI = dyn_cast<InvokeInst>(L)) { - if (int Res = cmpNumbers(CI->getCallingConv(), + if (const InvokeInst *II = dyn_cast<InvokeInst>(L)) { + if (int Res = cmpNumbers(II->getCallingConv(), cast<InvokeInst>(R)->getCallingConv())) return Res; if (int Res = - cmpAttrs(CI->getAttributes(), cast<InvokeInst>(R)->getAttributes())) + cmpAttrs(II->getAttributes(), cast<InvokeInst>(R)->getAttributes())) + return Res; + if (int Res = cmpOperandBundlesSchema(II, R)) return Res; - return cmpNumbers( - (uint64_t)CI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); + return cmpRangeMetadata( + II->getMetadata(LLVMContext::MD_range), + cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); } if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) { ArrayRef<unsigned> LIndices = IVI->getIndices(); @@ -876,9 +1065,8 @@ int 
FunctionComparator::cmpGEPs(const GEPOperator *GEPL, if (GEPL->accumulateConstantOffset(DL, OffsetL) && GEPR->accumulateConstantOffset(DL, OffsetR)) return cmpAPInts(OffsetL, OffsetR); - - if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(), - (uint64_t)GEPR->getPointerOperand()->getType())) + if (int Res = cmpTypes(GEPL->getSourceElementType(), + GEPR->getSourceElementType())) return Res; if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands())) @@ -892,6 +1080,28 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL, return 0; } +int FunctionComparator::cmpInlineAsm(const InlineAsm *L, + const InlineAsm *R) const { + // InlineAsm's are uniqued. If they are the same pointer, obviously they are + // the same, otherwise compare the fields. + if (L == R) + return 0; + if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType())) + return Res; + if (int Res = cmpMem(L->getAsmString(), R->getAsmString())) + return Res; + if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString())) + return Res; + if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects())) + return Res; + if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack())) + return Res; + if (int Res = cmpNumbers(L->getDialect(), R->getDialect())) + return Res; + llvm_unreachable("InlineAsm blocks were not uniqued."); + return 0; +} + /// Compare two values used by the two functions under pair-wise comparison. If /// this is the first time the values are seen, they're added to the mapping so /// that we will detect mismatches on next use. @@ -926,7 +1136,7 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) { const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R); if (InlineAsmL && InlineAsmR) - return cmpNumbers((uint64_t)L, (uint64_t)R); + return cmpInlineAsm(InlineAsmL, InlineAsmR); if (InlineAsmL) return 1; if (InlineAsmR) @@ -938,12 +1148,13 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) { return cmpNumbers(LeftSN.first->second, RightSN.first->second); } // Test whether two basic blocks have equivalent behaviour. -int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { +int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL, + const BasicBlock *BBR) { BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end(); BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end(); do { - if (int Res = cmpValues(InstL, InstR)) + if (int Res = cmpValues(&*InstL, &*InstR)) return Res; const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(InstL); @@ -961,7 +1172,7 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { if (int Res = cmpGEPs(GEPL, GEPR)) return Res; } else { - if (int Res = cmpOperations(InstL, InstR)) + if (int Res = cmpOperations(&*InstL, &*InstR)) return Res; assert(InstL->getNumOperands() == InstR->getNumOperands()); @@ -970,11 +1181,8 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { Value *OpR = InstR->getOperand(i); if (int Res = cmpValues(OpL, OpR)) return Res; - if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID())) - return Res; - // TODO: Already checked in cmpOperation - if (int Res = cmpTypes(OpL->getType(), OpR->getType())) - return Res; + // cmpValues should ensure this is true. 
+ assert(cmpTypes(OpL->getType(), OpR->getType()) == 0); } } @@ -990,7 +1198,6 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { // Test whether the two functions have equivalent behaviour. int FunctionComparator::compare() { - sn_mapL.clear(); sn_mapR.clear(); @@ -1001,7 +1208,7 @@ int FunctionComparator::compare() { return Res; if (FnL->hasGC()) { - if (int Res = cmpNumbers((uint64_t)FnL->getGC(), (uint64_t)FnR->getGC())) + if (int Res = cmpMem(FnL->getGC(), FnR->getGC())) return Res; } @@ -1009,7 +1216,7 @@ int FunctionComparator::compare() { return Res; if (FnL->hasSection()) { - if (int Res = cmpStrings(FnL->getSection(), FnR->getSection())) + if (int Res = cmpMem(FnL->getSection(), FnR->getSection())) return Res; } @@ -1033,7 +1240,7 @@ int FunctionComparator::compare() { ArgRI = FnR->arg_begin(), ArgLE = FnL->arg_end(); ArgLI != ArgLE; ++ArgLI, ++ArgRI) { - if (cmpValues(ArgLI, ArgRI) != 0) + if (cmpValues(&*ArgLI, &*ArgRI) != 0) llvm_unreachable("Arguments repeat!"); } @@ -1055,7 +1262,7 @@ int FunctionComparator::compare() { if (int Res = cmpValues(BBL, BBR)) return Res; - if (int Res = compare(BBL, BBR)) + if (int Res = cmpBasicBlocks(BBL, BBR)) return Res; const TerminatorInst *TermL = BBL->getTerminator(); @@ -1074,6 +1281,68 @@ int FunctionComparator::compare() { } namespace { +// Accumulate the hash of a sequence of 64-bit integers. This is similar to a +// hash of a sequence of 64bit ints, but the entire input does not need to be +// available at once. This interface is necessary for functionHash because it +// needs to accumulate the hash as the structure of the function is traversed +// without saving these values to an intermediate buffer. This form of hashing +// is not often needed, as usually the object to hash is just read from a +// buffer. +class HashAccumulator64 { + uint64_t Hash; +public: + // Initialize to random constant, so the state isn't zero. + HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; } + void add(uint64_t V) { + Hash = llvm::hashing::detail::hash_16_bytes(Hash, V); + } + // No finishing is required, because the entire hash value is used. + uint64_t getHash() { return Hash; } +}; +} // end anonymous namespace + +// A function hash is calculated by considering only the number of arguments and +// whether a function is varargs, the order of basic blocks (given by the +// successors of each basic block in depth first order), and the order of +// opcodes of each instruction within each of these basic blocks. This mirrors +// the strategy compare() uses to compare functions by walking the BBs in depth +// first order and comparing each instruction in sequence. Because this hash +// does not look at the operands, it is insensitive to things such as the +// target of calls and the constants used in the function, which makes it useful +// when possibly merging functions which are the same modulo constants and call +// targets. +FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) { + HashAccumulator64 H; + H.add(F.isVarArg()); + H.add(F.arg_size()); + + SmallVector<const BasicBlock *, 8> BBs; + SmallSet<const BasicBlock *, 16> VisitedBBs; + + // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(), + // accumulating the hash of the function "structure." 
(BB and opcode sequence) + BBs.push_back(&F.getEntryBlock()); + VisitedBBs.insert(BBs[0]); + while (!BBs.empty()) { + const BasicBlock *BB = BBs.pop_back_val(); + // This random value acts as a block header, as otherwise the partition of + // opcodes into BBs wouldn't affect the hash, only the order of the opcodes + H.add(45798); + for (auto &Inst : *BB) { + H.add(Inst.getOpcode()); + } + const TerminatorInst *Term = BB->getTerminator(); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(Term->getSuccessor(i)).second) + continue; + BBs.push_back(Term->getSuccessor(i)); + } + } + return H.getHash(); +} + + +namespace { /// MergeFunctions finds functions which will generate identical machine code, /// by considering all pointer types to be equivalent. Once identified, @@ -1084,14 +1353,31 @@ class MergeFunctions : public ModulePass { public: static char ID; MergeFunctions() - : ModulePass(ID), HasGlobalAliases(false) { + : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)), FNodesInTree(), + HasGlobalAliases(false) { initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override; private: - typedef std::set<FunctionNode> FnTreeType; + // The function comparison operator is provided here so that FunctionNodes do + // not need to become larger with another pointer. + class FunctionNodeCmp { + GlobalNumberState* GlobalNumbers; + public: + FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {} + bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const { + // Order first by hashes, then full function comparison. + if (LHS.getHash() != RHS.getHash()) + return LHS.getHash() < RHS.getHash(); + FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers); + return FCmp.compare() == -1; + } + }; + typedef std::set<FunctionNode, FunctionNodeCmp> FnTreeType; + + GlobalNumberState GlobalNumbers; /// A work queue of functions that may have been modified and should be /// analyzed again. @@ -1133,17 +1419,23 @@ private: void writeAlias(Function *F, Function *G); /// Replace function F with function G in the function tree. - void replaceFunctionInTree(FnTreeType::iterator &IterToF, Function *G); + void replaceFunctionInTree(const FunctionNode &FN, Function *G); /// The set of all distinct functions. Use the insert() and remove() methods - /// to modify it. + /// to modify it. The map allows efficient lookup and deferring of Functions. FnTreeType FnTree; + // Map functions to the iterators of the FunctionNode which contains them + // in the FnTree. This must be updated carefully whenever the FnTree is + // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid + // dangling iterators into FnTree. The invariant that preserves this is that + // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree. + ValueMap<Function*, FnTreeType::iterator> FNodesInTree; /// Whether or not the target supports global aliases. 
bool HasGlobalAliases; }; -} // end anonymous namespace +} // end anonymous namespace char MergeFunctions::ID = 0; INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) @@ -1166,8 +1458,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) { Function *F1 = cast<Function>(*I); Function *F2 = cast<Function>(*J); - int Res1 = FunctionComparator(F1, F2).compare(); - int Res2 = FunctionComparator(F2, F1).compare(); + int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare(); + int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare(); // If F1 <= F2, then F2 >= F1, otherwise report failure. if (Res1 != -Res2) { @@ -1188,8 +1480,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { continue; Function *F3 = cast<Function>(*K); - int Res3 = FunctionComparator(F1, F3).compare(); - int Res4 = FunctionComparator(F2, F3).compare(); + int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare(); + int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare(); bool Transitive = true; @@ -1227,11 +1519,33 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) - Deferred.push_back(WeakVH(I)); + // All functions in the module, ordered by hash. Functions with a unique + // hash value are easily eliminated. + std::vector<std::pair<FunctionComparator::FunctionHash, Function *>> + HashedFuncs; + for (Function &Func : M) { + if (!Func.isDeclaration() && !Func.hasAvailableExternallyLinkage()) { + HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func}); + } } + std::stable_sort( + HashedFuncs.begin(), HashedFuncs.end(), + [](const std::pair<FunctionComparator::FunctionHash, Function *> &a, + const std::pair<FunctionComparator::FunctionHash, Function *> &b) { + return a.first < b.first; + }); + + auto S = HashedFuncs.begin(); + for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) { + // If the hash value matches the previous value or the next one, we must + // consider merging it. Otherwise it is dropped and never considered again. + if ((I != S && std::prev(I)->first == I->first) || + (std::next(I) != IE && std::next(I)->first == I->first) ) { + Deferred.push_back(WeakVH(I->second)); + } + } + do { std::vector<WeakVH> Worklist; Deferred.swap(Worklist); @@ -1270,6 +1584,7 @@ bool MergeFunctions::runOnModule(Module &M) { } while (!Deferred.empty()); FnTree.clear(); + GlobalNumbers.clear(); return Changed; } @@ -1282,6 +1597,32 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { ++UI; CallSite CS(U->getUser()); if (CS && CS.isCallee(U)) { + // Transfer the called function's attributes to the call site. Due to the + // bitcast we will 'lose' ABI changing attributes because the 'called + // function' is no longer a Function* but the bitcast. Code that looks up + // the attributes from the called function will fail. + + // FIXME: This is not actually true, at least not anymore. The callsite + // will always have the same ABI affecting attributes as the callee, + // because otherwise the original input has UB. Note that Old and New + // always have matching ABI, so no attributes need to be changed. 
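+ // (In other words, the copying below is defensive: it re-applies New's
+ // return and parameter attributes to the call site so that nothing is
+ // lost behind the bitcast.)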
+ // Transferring other attributes may help other optimizations, but that + // should be done uniformly and not in this ad-hoc way. + auto &Context = New->getContext(); + auto NewFuncAttrs = New->getAttributes(); + auto CallSiteAttrs = CS.getAttributes(); + + CallSiteAttrs = CallSiteAttrs.addAttributes( + Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes()); + + for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) { + AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx); + if (Attrs.getNumSlots()) + CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs); + } + + CS.setAttributes(CallSiteAttrs); + remove(CS.getInstruction()->getParent()->getParent()); U->set(BitcastNew); } @@ -1352,15 +1693,15 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { SmallVector<Value *, 16> Args; unsigned i = 0; FunctionType *FFTy = F->getFunctionType(); - for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); - AI != AE; ++AI) { - Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i))); + for (Argument & AI : NewG->args()) { + Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i))); ++i; } CallInst *CI = Builder.CreateCall(F, Args); CI->setTailCall(); CI->setCallingConv(F->getCallingConv()); + CI->setAttributes(F->getAttributes()); if (NewG->getReturnType()->isVoidTy()) { Builder.CreateRetVoid(); } else { @@ -1379,8 +1720,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { // Replace G with an alias to F and delete G. void MergeFunctions::writeAlias(Function *F, Function *G) { - PointerType *PTy = G->getType(); - auto *GA = GlobalAlias::create(PTy, G->getLinkage(), "", F); + auto *GA = GlobalAlias::create(G->getLinkage(), "", F); F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); GA->takeName(G); GA->setVisibility(G->getVisibility()); @@ -1425,19 +1765,24 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { ++NumFunctionsMerged; } -/// Replace function F for function G in the map. -void MergeFunctions::replaceFunctionInTree(FnTreeType::iterator &IterToF, +/// Replace function F by function G. +void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN, Function *G) { - Function *F = IterToF->getFunc(); - - // A total order is already guaranteed otherwise because we process strong - // functions before weak functions. - assert(((F->mayBeOverridden() && G->mayBeOverridden()) || - (!F->mayBeOverridden() && !G->mayBeOverridden())) && - "Only change functions if both are strong or both are weak"); - (void)F; - - IterToF->replaceBy(G); + Function *F = FN.getFunc(); + assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 && + "The two functions must be equal"); + + auto I = FNodesInTree.find(F); + assert(I != FNodesInTree.end() && "F should be in FNodesInTree"); + assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G"); + + FnTreeType::iterator IterToFNInFnTree = I->second; + assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree."); + // Remove F -> FN and insert G -> FN + FNodesInTree.erase(I); + FNodesInTree.insert({G, IterToFNInFnTree}); + // Replace F with G in FN, which is stored inside the FnTree. 
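+ // After the erase/insert pair above, FNodesInTree again holds exactly one
+ // mapping per FunctionNode in FnTree (now G -> FN), preserving the
+ // invariant documented at the FNodesInTree declaration.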
+ FN.replaceBy(G);
}

// Insert a ComparableFunction into the FnTree, or merge it away if equal to one
@@ -1447,6 +1792,8 @@ bool MergeFunctions::insert(Function *NewFunction) {
 FnTree.insert(FunctionNode(NewFunction));
 if (Result.second) {
+ assert(FNodesInTree.count(NewFunction) == 0);
+ FNodesInTree.insert({NewFunction, Result.first});
 DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');
 return false;
 }
@@ -1476,7 +1823,7 @@ bool MergeFunctions::insert(Function *NewFunction) {
 if (OldF.getFunc()->getName() > NewFunction->getName()) {
 // Swap the two functions.
 Function *F = OldF.getFunc();
- replaceFunctionInTree(Result.first, NewFunction);
+ replaceFunctionInTree(*Result.first, NewFunction);
 NewFunction = F;
 assert(OldF.getFunc() != F && "Must have swapped the functions.");
 }
@@ -1495,18 +1842,13 @@ bool MergeFunctions::insert(Function *NewFunction) {
// Remove a function from FnTree. If it was already in FnTree, add
// it to Deferred so that we'll look at it in the next round.
void MergeFunctions::remove(Function *F) {
- // We need to make sure we remove F, not a function "equal" to F per the
- // function equality comparator.
- FnTreeType::iterator found = FnTree.find(FunctionNode(F));
- size_t Erased = 0;
- if (found != FnTree.end() && found->getFunc() == F) {
- Erased = 1;
- FnTree.erase(found);
- }
-
- if (Erased) {
- DEBUG(dbgs() << "Removed " << F->getName()
- << " from set and deferred it.\n");
+ auto I = FNodesInTree.find(F);
+ if (I != FNodesInTree.end()) {
+ DEBUG(dbgs() << "Deferred " << F->getName() << ".\n");
+ FnTree.erase(I->second);
+ // I->second has been invalidated, remove it from the FNodesInTree map to
+ // preserve the invariant.
+ FNodesInTree.erase(I);
 Deferred.emplace_back(F);
 }
}
@@ -1516,6 +1858,8 @@ void MergeFunctions::remove(Function *F) {
void MergeFunctions::removeUsers(Value *V) {
 std::vector<Value *> Worklist;
 Worklist.push_back(V);
+ SmallSet<Value*, 8> Visited;
+ Visited.insert(V);
 while (!Worklist.empty()) {
 Value *V = Worklist.back();
 Worklist.pop_back();
@@ -1526,8 +1870,10 @@ void MergeFunctions::removeUsers(Value *V) {
 } else if (isa<GlobalValue>(U)) {
 // do nothing
 } else if (Constant *C = dyn_cast<Constant>(U)) {
- for (User *UU : C->users())
- Worklist.push_back(UU);
+ for (User *UU : C->users()) {
+ // Only queue users we have not visited yet, so cyclic constant
+ // structures cannot loop this walk forever.
+ if (Visited.insert(UU).second)
+ Worklist.push_back(UU);
+ }
 }
 }
 }
}
diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 4a7cb7ba7d12..0c5c84bbccab 100644
--- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -50,7 +50,7 @@ ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); }
Function* PartialInliner::unswitchFunction(Function* F) {
 // First, verify that this function is an unswitching candidate...
- BasicBlock* entryBlock = F->begin();
+ BasicBlock *entryBlock = &F->front();
 BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator());
 if (!BR || BR->isUnconditional())
 return nullptr;
@@ -89,18 +89,18 @@ Function* PartialInliner::unswitchFunction(Function* F) {
 // of which will go outside. 
BasicBlock* preReturn = newReturnBlock; newReturnBlock = newReturnBlock->splitBasicBlock( - newReturnBlock->getFirstNonPHI()); + newReturnBlock->getFirstNonPHI()->getIterator()); BasicBlock::iterator I = preReturn->begin(); - BasicBlock::iterator Ins = newReturnBlock->begin(); + Instruction *Ins = &newReturnBlock->front(); while (I != preReturn->end()) { PHINode* OldPhi = dyn_cast<PHINode>(I); if (!OldPhi) break; - - PHINode* retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); + + PHINode *retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); OldPhi->replaceAllUsesWith(retPhi); Ins = newReturnBlock->getFirstNonPHI(); - - retPhi->addIncoming(I, preReturn); + + retPhi->addIncoming(&*I, preReturn); retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock), newEntryBlock); OldPhi->removeIncomingValue(newEntryBlock); @@ -116,8 +116,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { FE = duplicateFunction->end(); FI != FE; ++FI) if (&*FI != newEntryBlock && &*FI != newReturnBlock && &*FI != newNonReturnBlock) - toExtract.push_back(FI); - + toExtract.push_back(&*FI); + // The CodeExtractor needs a dominator tree. DominatorTree DT; DT.recalculate(*duplicateFunction); diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 909baae92548..9876efa7b235 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -12,19 +12,26 @@ // //===----------------------------------------------------------------------===// - #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm-c/Transforms/PassManagerBuilder.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Verifier.h" +#include "llvm/IR/FunctionInfo.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Vectorize.h" @@ -89,11 +96,21 @@ static cl::opt<bool> EnableLoopDistribute( "enable-loop-distribute", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopDistribution Pass")); +static cl::opt<bool> EnableNonLTOGlobalsModRef( + "enable-non-lto-gmr", cl::init(true), cl::Hidden, + cl::desc( + "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline.")); + +static cl::opt<bool> EnableLoopLoadElim( + "enable-loop-load-elim", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopLoadElimination Pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; LibraryInfo = nullptr; Inliner = nullptr; + FunctionIndex = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; @@ -143,10 +160,9 @@ void PassManagerBuilder::addInitialAliasAnalysisPasses( // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. 
if (UseCFLAA) - PM.add(createCFLAliasAnalysisPass()); - PM.add(createTypeBasedAliasAnalysisPass()); - PM.add(createScopedNoAliasAAPass()); - PM.add(createBasicAliasAnalysisPass()); + PM.add(createCFLAAWrapperPass()); + PM.add(createTypeBasedAAWrapperPass()); + PM.add(createScopedNoAliasAAWrapperPass()); } void PassManagerBuilder::populateFunctionPassManager( @@ -172,6 +188,9 @@ void PassManagerBuilder::populateFunctionPassManager( void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { + // Allow forcing function attributes as a debugging and tuning aid. + MPM.add(createForceFunctionAttrsLegacyPass()); + // If all optimizations are disabled, just run the always-inline pass and, // if enabled, the function merging pass. if (OptLevel == 0) { @@ -201,10 +220,15 @@ void PassManagerBuilder::populateModulePassManager( addInitialAliasAnalysisPasses(MPM); if (!DisableUnitAtATime) { + // Infer attributes about declarations if possible. + MPM.add(createInferFunctionAttrsLegacyPass()); + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createGlobalOptimizerPass()); // Optimize out global vars + // Promote any localized global vars + MPM.add(createPromoteMemoryToRegisterPass()); MPM.add(createDeadArgEliminationPass()); // Dead argument elimination @@ -213,6 +237,12 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } + if (EnableNonLTOGlobalsModRef) + // We add a module alias analysis pass here. In part due to bugs in the + // analysis infrastructure this "works" in that the analysis stays alive + // for the entire SCC pass run below. + MPM.add(createGlobalsAAWrapperPass()); + // Start of CallGraph SCC passes. if (!DisableUnitAtATime) MPM.add(createPruneEHPass()); // Remove dead EH info @@ -245,6 +275,7 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + MPM.add(createCFGSimplificationPass()); MPM.add(createInstructionCombiningPass()); MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. @@ -315,9 +346,42 @@ void PassManagerBuilder::populateModulePassManager( // we must insert a no-op module pass to reset the pass manager. MPM.add(createBarrierNoopPass()); + if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO) { + // Remove avail extern fns and globals definitions if we aren't + // compiling an object file for later LTO. For LTO we want to preserve + // these so they are eligible for inlining at link-time. Note if they + // are unreferenced they will be removed by GlobalDCE later, so + // this only impacts referenced available externally globals. + // Eventually they will be suppressed during codegen, but eliminating + // here enables more opportunity for GlobalDCE as it may make + // globals referenced by available external functions dead + // and saves running remaining passes on the eliminated functions. + MPM.add(createEliminateAvailableExternallyPass()); + } + + if (EnableNonLTOGlobalsModRef) + // We add a fresh GlobalsModRef run at this point. This is particularly + // useful as the above will have inlined, DCE'ed, and function-attr + // propagated everything. We should at this point have a reasonably minimal + // and richly annotated call graph. 
By computing aliasing and mod/ref
+ // information for all local globals here, the late loop passes and notably
+ // the vectorizer will be able to use them to help recognize vectorizable
+ // memory operations.
+ //
+ // Note that this relies on a bug in the pass manager which preserves
+ // a module analysis into a function pass pipeline (and throughout it) so
+ // long as the first function pass doesn't invalidate the module analysis.
+ // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
+ // this to work. Fortunately, it is trivial to preserve AliasAnalysis
+ // (doing nothing preserves it as it is required to be conservatively
+ // correct in the face of IR changes).
+ MPM.add(createGlobalsAAWrapperPass());
+
 if (RunFloat2Int)
 MPM.add(createFloat2IntPass());

+ addExtensionsToPM(EP_VectorizerStart, MPM);
+
 // Re-rotate loops in all our loop nests. These may have fallen out of
 // rotated form due to GVN or other transformations, and the vectorizer relies
 // on the rotated form. Disable header duplication at -Oz.
@@ -329,6 +393,12 @@ void PassManagerBuilder::populateModulePassManager(
 MPM.add(createLoopDistributePass());

 MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
+
+ // Eliminate loads by forwarding stores from the previous iteration to loads
+ // of the current iteration.
+ if (EnableLoopLoadElim)
+ MPM.add(createLoopLoadEliminationPass());
+
 // FIXME: Because of #pragma vectorize enable, the passes below are always
 // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
 // on -O1 and no #pragma is found). Would be good to have these two passes
@@ -402,17 +472,6 @@ void PassManagerBuilder::populateModulePassManager(
 // GlobalOpt already deletes dead functions and globals, at -O2 try a
 // late pass of GlobalDCE. It is capable of deleting dead cycles.
 if (OptLevel > 1) {
- if (!PrepareForLTO) {
- // Remove avail extern fns and globals definitions if we aren't
- // compiling an object file for later LTO. For LTO we want to preserve
- // these so they are eligible for inlining at link-time. Note if they
- // are unreferenced they will be removed by GlobalDCE below, so
- // this only impacts referenced available externally globals.
- // Eventually they will be suppressed during codegen, but eliminating
- // here enables more opportunity for GlobalDCE as it may make
- // globals referenced by available external functions dead.
- MPM.add(createEliminateAvailableExternallyPass());
- }
 MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
 MPM.add(createConstantMergePass()); // Merge dup global constants
 }
@@ -428,13 +487,25 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
 // Provide AliasAnalysis services for optimizations.
 addInitialAliasAnalysisPasses(PM);

+ if (FunctionIndex)
+ PM.add(createFunctionImportPass(FunctionIndex));
+
+ // Allow forcing function attributes as a debugging and tuning aid.
+ PM.add(createForceFunctionAttrsLegacyPass());
+
+ // Infer attributes about declarations if possible.
+ PM.add(createInferFunctionAttrsLegacyPass());
+
 // Propagate constants at call sites into the functions they call. This
 // opens opportunities for globalopt (and inlining) by substituting function
 // pointers passed as arguments to direct uses of functions.
 PM.add(createIPSCCPPass());

 // Now that we internalized some globals, see if we can hack on them!
+ PM.add(createFunctionAttrsPass()); // Add norecurse if possible. 
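+ // (GlobalOpt below can localize some globals into function-local allocas;
+ // the mem2reg run added after it then promotes those allocas into SSA
+ // values.)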
PM.add(createGlobalOptimizerPass()); + // Promote any localized global vars. + PM.add(createPromoteMemoryToRegisterPass()); // Linking modules together can lead to duplicated global constants, only // keep one copy of each constant. @@ -481,7 +552,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Run a few AA driven optimizations here and now, to cleanup the code. PM.add(createFunctionAttrsPass()); // Add nocapture. - PM.add(createGlobalsModRefPass()); // IP alias analysis. + PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. PM.add(createLICMPass()); // Hoist loop invariants. if (EnableMLSM) @@ -500,6 +571,15 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createLoopVectorizePass(true, LoopVectorize)); + // Now that we've optimized loops (in particular loop induction variables), + // we may have exposed more scalar opportunities. Run parts of the scalar + // optimizer again at this point. + PM.add(createInstructionCombiningPass()); // Initial cleanup + PM.add(createCFGSimplificationPass()); // if-convert + PM.add(createSCCPPass()); // Propagate exposed constants + PM.add(createInstructionCombiningPass()); // Clean up again + PM.add(createBitTrackingDCEPass()); + // More scalar chains could be vectorized due to more alias information if (RunSLPAfterLoopVectorization) if (SLPVectorize) @@ -524,6 +604,9 @@ void PassManagerBuilder::addLateLTOOptimizationPasses( // Delete basic blocks, which optimization passes may have killed. PM.add(createCFGSimplificationPass()); + // Drop bodies of available externally objects to improve GlobalDCE. + PM.add(createEliminateAvailableExternallyPass()); + // Now that we have optimized the program, discard unreachable functions. PM.add(createGlobalDCEPass()); @@ -543,6 +626,10 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (OptLevel > 1) addLTOOptimizationPasses(PM); + // Create a function that performs CFI checks for cross-DSO calls with targets + // in the current module. + PM.add(createCrossDSOCFIPass()); + // Lower bit sets to globals. This pass supports Clang's control flow // integrity mechanisms (-fsanitize=cfi*) and needs to run at link time if CFI // is enabled. The pass does nothing if CFI is disabled. diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index b2f1010c9a07..3af4afb903fe 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -21,7 +21,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -153,21 +153,16 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { // If the SCC doesn't unwind or doesn't throw, note this fact. 
if (!SCCMightUnwind || !SCCMightReturn) for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - AttrBuilder NewAttributes; - - if (!SCCMightUnwind) - NewAttributes.addAttribute(Attribute::NoUnwind); - if (!SCCMightReturn) - NewAttributes.addAttribute(Attribute::NoReturn); - Function *F = (*I)->getFunction(); - const AttributeSet &PAL = F->getAttributes().getFnAttributes(); - const AttributeSet &NPAL = AttributeSet::get( - F->getContext(), AttributeSet::FunctionIndex, NewAttributes); - if (PAL != NPAL) { + if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) { + F->addFnAttr(Attribute::NoUnwind); + MadeChange = true; + } + + if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) { + F->addFnAttr(Attribute::NoReturn); MadeChange = true; - F->addAttributes(AttributeSet::FunctionIndex, NPAL); } } @@ -191,9 +186,13 @@ bool PruneEH::SimplifyFunction(Function *F) { for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + // Insert a call instruction before the invoke. - CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); + CallInst *Call = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); Call->takeName(II); Call->setCallingConv(II->getCallingConv()); Call->setAttributes(II->getAttributes()); @@ -233,7 +232,7 @@ bool PruneEH::SimplifyFunction(Function *F) { // Remove the uncond branch and add an unreachable. BB->getInstList().pop_back(); - new UnreachableInst(BB->getContext(), BB); + new UnreachableInst(BB->getContext(), &*BB); DeleteBasicBlock(New); // Delete the new BB. 
MadeChange = true; diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp index c8dfa54a4aa0..928d92ef9d12 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -22,7 +22,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -44,7 +43,11 @@ #include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Cloning.h" #include <cctype> using namespace llvm; @@ -61,27 +64,51 @@ static cl::opt<unsigned> SampleProfileMaxPropagateIterations( "sample-profile-max-propagate-iterations", cl::init(100), cl::desc("Maximum number of iterations to go through when propagating " "sample block/edge weights through the CFG.")); +static cl::opt<unsigned> SampleProfileRecordCoverage( + "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), + cl::desc("Emit a warning if less than N% of records in the input profile " + "are matched to the IR.")); +static cl::opt<unsigned> SampleProfileSampleCoverage( + "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), + cl::desc("Emit a warning if less than N% of samples in the input profile " + "are matched to the IR.")); +static cl::opt<double> SampleProfileHotThreshold( + "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"), + cl::desc("Inlined functions that account for more than N% of all samples " + "collected in the parent function, will be inlined again.")); +static cl::opt<double> SampleProfileGlobalHotThreshold( + "sample-profile-global-hot-threshold", cl::init(30), cl::value_desc("N"), + cl::desc("Top-level functions that account for more than N% of all samples " + "collected in the profile, will be marked as hot for the inliner " + "to consider.")); +static cl::opt<double> SampleProfileGlobalColdThreshold( + "sample-profile-global-cold-threshold", cl::init(0.5), cl::value_desc("N"), + cl::desc("Top-level functions that account for less than N% of all samples " + "collected in the profile, will be marked as cold for the inliner " + "to consider.")); namespace { -typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; -typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; -typedef std::pair<BasicBlock *, BasicBlock *> Edge; -typedef DenseMap<Edge, unsigned> EdgeWeightMap; -typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; +typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap; +typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap; +typedef std::pair<const BasicBlock *, const BasicBlock *> Edge; +typedef DenseMap<Edge, uint64_t> EdgeWeightMap; +typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>> + BlockEdgeMap; /// \brief Sample profile pass. /// /// This pass reads profile data from the file specified by /// -sample-profile-file and annotates every affected function with the /// profile information found in that file. 
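///
/// (With this change the loader runs as a module pass: runOnModule() drives
/// the per-function work and can also inline hot callees first via
/// inlineHotFunctions() below.)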
-class SampleProfileLoader : public FunctionPass { +class SampleProfileLoader : public ModulePass { public: // Class identification, replacement for typeinfo static char ID; SampleProfileLoader(StringRef Name = SampleProfileFile) - : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr), - Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) { + : ModulePass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Reader(), + Samples(nullptr), Filename(Name), ProfileIsValid(false), + TotalCollectedSamples(0) { initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); } @@ -91,36 +118,37 @@ public: const char *getPassName() const override { return "Sample profile pass"; } - bool runOnFunction(Function &F) override; + bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTree>(); } protected: + bool runOnFunction(Function &F); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); - unsigned getInstWeight(Instruction &I); - unsigned getBlockWeight(BasicBlock *BB); + ErrorOr<uint64_t> getInstWeight(const Instruction &I) const; + ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB) const; + const FunctionSamples *findCalleeFunctionSamples(const CallInst &I) const; + const FunctionSamples *findFunctionSamples(const Instruction &I) const; + bool inlineHotFunctions(Function &F); + bool emitInlineHints(Function &F); void printEdgeWeight(raw_ostream &OS, Edge E); - void printBlockWeight(raw_ostream &OS, BasicBlock *BB); - void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); + void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; + void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); bool computeBlockWeights(Function &F); void findEquivalenceClasses(Function &F); void findEquivalencesFor(BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, DominatorTreeBase<BasicBlock> *DomTree); void propagateWeights(Function &F); - unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); + uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); bool propagateThroughEdges(Function &F); - - /// \brief Line number for the function header. Used to compute absolute - /// line numbers from the relative line numbers found in the profile. - unsigned HeaderLineno; + void computeDominanceAndLoopInfo(Function &F); + unsigned getOffset(unsigned L, unsigned H) const; + void clearFunctionData(); /// \brief Map basic blocks to their computed weights. /// @@ -135,7 +163,7 @@ protected: EdgeWeightMap EdgeWeights; /// \brief Set of visited blocks during propagation. - SmallPtrSet<BasicBlock *, 128> VisitedBlocks; + SmallPtrSet<const BasicBlock *, 128> VisitedBlocks; /// \brief Set of visited edges during propagation. SmallSet<Edge, 128> VisitedEdges; @@ -149,9 +177,9 @@ protected: EquivalenceClassMap EquivalenceClass; /// \brief Dominance, post-dominance and loop information. - DominatorTree *DT; - PostDominatorTree *PDT; - LoopInfo *LI; + std::unique_ptr<DominatorTree> DT; + std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT; + std::unique_ptr<LoopInfo> LI; /// \brief Predecessors for each basic block in the CFG. BlockEdgeMap Predecessors; @@ -159,9 +187,6 @@ protected: /// \brief Successors for each basic block in the CFG. 
BlockEdgeMap Successors;

- /// \brief LLVM context holding the debug data we need.
- LLVMContext *Ctx;
-
 /// \brief Profile reader object.
 std::unique_ptr<SampleProfileReader> Reader;

@@ -173,7 +198,207 @@ protected:

 /// \brief Flag indicating whether the profile input loaded successfully.
 bool ProfileIsValid;
+
+ /// \brief Total number of samples collected in this profile.
+ ///
+ /// This is the sum of all the samples collected in all the functions executed
+ /// at runtime.
+ uint64_t TotalCollectedSamples;
};
+
+class SampleCoverageTracker {
+public:
+ SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {}
+
+ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
+ uint32_t Discriminator, uint64_t Samples);
+ unsigned computeCoverage(unsigned Used, unsigned Total) const;
+ unsigned countUsedRecords(const FunctionSamples *FS) const;
+ unsigned countBodyRecords(const FunctionSamples *FS) const;
+ uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
+ uint64_t countBodySamples(const FunctionSamples *FS) const;
+ void clear() {
+ SampleCoverage.clear();
+ TotalUsedSamples = 0;
+ }
+
+private:
+ typedef std::map<LineLocation, unsigned> BodySampleCoverageMap;
+ typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap>
+ FunctionSamplesCoverageMap;
+
+ /// Coverage map for sampling records.
+ ///
+ /// This map keeps a record of sampling records that have been matched to
+ /// an IR instruction. This is used to detect some form of staleness in
+ /// profiles (see flag -sample-profile-check-coverage).
+ ///
+ /// Each entry in the map corresponds to a FunctionSamples instance. This is
+ /// another map that counts how many times the sample record at the
+ /// given location has been used.
+ FunctionSamplesCoverageMap SampleCoverage;
+
+ /// Number of samples used from the profile.
+ ///
+ /// When a sampling record is used for the first time, the samples from
+ /// that record are added to this accumulator. Coverage is later computed
+ /// based on the total number of samples available in this function and
+ /// its callsites.
+ ///
+ /// Note that this accumulator tracks samples used from a single function
+ /// and all the inlined callsites. Strictly, we should have a map of counters
+ /// keyed by FunctionSamples pointers, but these stats are cleared after
+ /// every function, so we just need to keep a single counter.
+ uint64_t TotalUsedSamples;
+};
+
+SampleCoverageTracker CoverageTracker;
+
+/// Return true if the given callsite is hot with respect to its caller.
+///
+/// Functions that were inlined in the original binary will be represented
+/// in the inline stack in the sample profile. If the profile shows that
+/// the original inline decision was "good" (i.e., the callsite is executed
+/// frequently), then we will recreate the inline decision and apply the
+/// profile from the inlined callsite.
+///
+/// To decide whether an inlined callsite is hot, we compute the fraction
+/// of samples used by the callsite with respect to the total number of samples
+/// collected in the caller.
+///
+/// If that fraction is larger than the default given by
+/// SampleProfileHotThreshold, the callsite will be inlined again.
+bool callsiteIsHot(const FunctionSamples *CallerFS,
+ const FunctionSamples *CallsiteFS) {
+ if (!CallsiteFS)
+ return false; // The callsite was not inlined in the original binary.
+
+ uint64_t ParentTotalSamples = CallerFS->getTotalSamples();
+ if (ParentTotalSamples == 0)
+ return false; // Avoid division by zero. 
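+
+ // Worked example (made-up numbers): 3,000 of 20,000 caller samples at this
+ // callsite gives PercentSamples = 15.0, well above the default threshold
+ // of 0.1, so the callsite is considered hot.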
+ + uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); + if (CallsiteTotalSamples == 0) + return false; // Callsite is trivially cold. + + double PercentSamples = + (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0; + return PercentSamples >= SampleProfileHotThreshold; +} + +} + +/// Mark as used the sample record for the given function samples at +/// (LineOffset, Discriminator). +/// +/// \returns true if this is the first time we mark the given record. +bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS, + uint32_t LineOffset, + uint32_t Discriminator, + uint64_t Samples) { + LineLocation Loc(LineOffset, Discriminator); + unsigned &Count = SampleCoverage[FS][Loc]; + bool FirstTime = (++Count == 1); + if (FirstTime) + TotalUsedSamples += Samples; + return FirstTime; +} + +/// Return the number of sample records that were applied from this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { + auto I = SampleCoverage.find(FS); + + // The size of the coverage map for FS represents the number of records + // that were marked used at least once. + unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0; + + // If there are inlined callsites in this function, count the samples found + // in the respective bodies. However, do not bother counting callees with 0 + // total samples, these are callees that were never invoked at runtime. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countUsedRecords(CalleeSamples); + } + + return Count; +} + +/// Return the number of sample records in the body of this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { + unsigned Count = FS->getBodySamples().size(); + + // Only count records in hot callsites. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countBodyRecords(CalleeSamples); + } + + return Count; +} + +/// Return the number of samples collected in the body of this profile. +/// +/// This count does not include samples from cold inlined callsites. +uint64_t +SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { + uint64_t Total = 0; + for (const auto &I : FS->getBodySamples()) + Total += I.second.getSamples(); + + // Only count samples in hot callsites. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Total += countBodySamples(CalleeSamples); + } + + return Total; +} + +/// Return the fraction of sample records used in this profile. +/// +/// The returned value is an unsigned integer in the range 0-100 indicating +/// the percentage of sample records that were used while applying this +/// profile to the associated function. +unsigned SampleCoverageTracker::computeCoverage(unsigned Used, + unsigned Total) const { + assert(Used <= Total && + "number of used records cannot exceed the total number of records"); + return Total > 0 ? Used * 100 / Total : 100; +} + +/// Clear all the per-function data used to load samples and propagate weights. 
+void SampleProfileLoader::clearFunctionData() { + BlockWeights.clear(); + EdgeWeights.clear(); + VisitedBlocks.clear(); + VisitedEdges.clear(); + EquivalenceClass.clear(); + DT = nullptr; + PDT = nullptr; + LI = nullptr; + Predecessors.clear(); + Successors.clear(); + CoverageTracker.clear(); +} + +/// \brief Returns the offset of lineno \p L to head_lineno \p H +/// +/// \param L Lineno +/// \param H Header lineno of the function +/// +/// \returns offset to the header lineno. 16 bits are used to represent offset. +/// We assume that a single function will not exceed 65535 LOC. +unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const { + return (L - H) & 0xffff; } /// \brief Print the weight of edge \p E on stream \p OS. @@ -190,8 +415,8 @@ void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { /// \param OS Stream to emit the output to. /// \param BB Block to print. void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, - BasicBlock *BB) { - BasicBlock *Equiv = EquivalenceClass[BB]; + const BasicBlock *BB) { + const BasicBlock *Equiv = EquivalenceClass[BB]; OS << "equivalence[" << BB->getName() << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; } @@ -200,8 +425,11 @@ void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, /// /// \param OS Stream to emit the output to. /// \param BB Block to print. -void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { - OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; +void SampleProfileLoader::printBlockWeight(raw_ostream &OS, + const BasicBlock *BB) const { + const auto &I = BlockWeights.find(BB); + uint64_t W = (I == BlockWeights.end() ? 0 : I->second); + OS << "weight[" << BB->getName() << "]: " << W << "\n"; } /// \brief Get the weight for an instruction. @@ -214,51 +442,67 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { /// /// \param Inst Instruction to query. /// -/// \returns The profiled weight of I. -unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { +/// \returns the weight of \p Inst. +ErrorOr<uint64_t> +SampleProfileLoader::getInstWeight(const Instruction &Inst) const { DebugLoc DLoc = Inst.getDebugLoc(); if (!DLoc) - return 0; + return std::error_code(); - unsigned Lineno = DLoc.getLine(); - if (Lineno < HeaderLineno) - return 0; + const FunctionSamples *FS = findFunctionSamples(Inst); + if (!FS) + return std::error_code(); const DILocation *DIL = DLoc; - int LOffset = Lineno - HeaderLineno; - unsigned Discriminator = DIL->getDiscriminator(); - unsigned Weight = Samples->samplesAt(LOffset, Discriminator); - DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst - << " (line offset: " << LOffset << "." << Discriminator - << " - weight: " << Weight << ")\n"); - return Weight; + unsigned Lineno = DLoc.getLine(); + unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine(); + + uint32_t LineOffset = getOffset(Lineno, HeaderLineno); + uint32_t Discriminator = DIL->getDiscriminator(); + ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); + if (R) { + bool FirstMark = + CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); + if (FirstMark) { + const Function *F = Inst.getParent()->getParent(); + LLVMContext &Ctx = F->getContext(); + emitOptimizationRemark( + Ctx, DEBUG_TYPE, *F, DLoc, + Twine("Applied ") + Twine(*R) + " samples from profile (offset: " + + Twine(LineOffset) + + ((Discriminator) ? 
Twine(".") + Twine(Discriminator) : "") + ")"); + } + DEBUG(dbgs() << " " << Lineno << "." << DIL->getDiscriminator() << ":" + << Inst << " (line offset: " << Lineno - HeaderLineno << "." + << DIL->getDiscriminator() << " - weight: " << R.get() + << ")\n"); + } + return R; } /// \brief Compute the weight of a basic block. /// /// The weight of basic block \p BB is the maximum weight of all the -/// instructions in BB. The weight of \p BB is computed and cached in -/// the BlockWeights map. +/// instructions in BB. /// /// \param BB The basic block to query. /// -/// \returns The computed weight of BB. -unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { - // If we've computed BB's weight before, return it. - std::pair<BlockWeightMap::iterator, bool> Entry = - BlockWeights.insert(std::make_pair(BB, 0)); - if (!Entry.second) - return Entry.first->second; - - // Otherwise, compute and cache BB's weight. - unsigned Weight = 0; +/// \returns the weight for \p BB. +ErrorOr<uint64_t> +SampleProfileLoader::getBlockWeight(const BasicBlock *BB) const { + bool Found = false; + uint64_t Weight = 0; for (auto &I : BB->getInstList()) { - unsigned InstWeight = getInstWeight(I); - if (InstWeight > Weight) - Weight = InstWeight; + const ErrorOr<uint64_t> &R = getInstWeight(I); + if (R && R.get() >= Weight) { + Weight = R.get(); + Found = true; + } } - Entry.first->second = Weight; - return Weight; + if (Found) + return Weight; + else + return std::error_code(); } /// \brief Compute and store the weights of every basic block. @@ -270,15 +514,199 @@ unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { bool SampleProfileLoader::computeBlockWeights(Function &F) { bool Changed = false; DEBUG(dbgs() << "Block weights\n"); - for (auto &BB : F) { - unsigned Weight = getBlockWeight(&BB); - Changed |= (Weight > 0); + for (const auto &BB : F) { + ErrorOr<uint64_t> Weight = getBlockWeight(&BB); + if (Weight) { + BlockWeights[&BB] = Weight.get(); + VisitedBlocks.insert(&BB); + Changed = true; + } DEBUG(printBlockWeight(dbgs(), &BB)); } return Changed; } +/// \brief Get the FunctionSamples for a call instruction. +/// +/// The FunctionSamples of a call instruction \p Inst is the inlined +/// instance in which that call instruction is calling to. It contains +/// all samples that resides in the inlined instance. We first find the +/// inlined instance in which the call instruction is from, then we +/// traverse its children to find the callsite with the matching +/// location and callee function name. +/// +/// \param Inst Call instruction to query. +/// +/// \returns The FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findCalleeFunctionSamples(const CallInst &Inst) const { + const DILocation *DIL = Inst.getDebugLoc(); + if (!DIL) { + return nullptr; + } + DISubprogram *SP = DIL->getScope()->getSubprogram(); + if (!SP) + return nullptr; + + Function *CalleeFunc = Inst.getCalledFunction(); + if (!CalleeFunc) { + return nullptr; + } + + StringRef CalleeName = CalleeFunc->getName(); + const FunctionSamples *FS = findFunctionSamples(Inst); + if (FS == nullptr) + return nullptr; + + return FS->findFunctionSamplesAt( + CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), + DIL->getDiscriminator(), CalleeName)); +} + +/// \brief Get the FunctionSamples for an instruction. +/// +/// The FunctionSamples of an instruction \p Inst is the inlined instance +/// in which that instruction is coming from. 
We traverse the inline stack +/// of that instruction, and match it with the tree nodes in the profile. +/// +/// \param Inst Instruction to query. +/// +/// \returns the FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { + SmallVector<CallsiteLocation, 10> S; + const DILocation *DIL = Inst.getDebugLoc(); + if (!DIL) { + return Samples; + } + StringRef CalleeName; + for (const DILocation *DIL = Inst.getDebugLoc(); DIL; + DIL = DIL->getInlinedAt()) { + DISubprogram *SP = DIL->getScope()->getSubprogram(); + if (!SP) + return nullptr; + if (!CalleeName.empty()) { + S.push_back(CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), + DIL->getDiscriminator(), CalleeName)); + } + CalleeName = SP->getLinkageName(); + } + if (S.size() == 0) + return Samples; + const FunctionSamples *FS = Samples; + for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { + FS = FS->findFunctionSamplesAt(S[i]); + } + return FS; +} + +/// \brief Emit an inline hint if \p F is globally hot or cold. +/// +/// If \p F consumes a significant fraction of samples (indicated by +/// SampleProfileGlobalHotThreshold), apply the InlineHint attribute for the +/// inliner to consider the function hot. +/// +/// If \p F consumes a small fraction of samples (indicated by +/// SampleProfileGlobalColdThreshold), apply the Cold attribute for the inliner +/// to consider the function cold. +/// +/// FIXME - This setting of inline hints is sub-optimal. Instead of marking a +/// function globally hot or cold, we should be annotating individual callsites. +/// This is not currently possible, but work on the inliner will eventually +/// provide this ability. See http://reviews.llvm.org/D15003 for details and +/// discussion. +/// +/// \returns True if either attribute was applied to \p F. +bool SampleProfileLoader::emitInlineHints(Function &F) { + if (TotalCollectedSamples == 0) + return false; + + uint64_t FunctionSamples = Samples->getTotalSamples(); + double SamplesPercent = + (double)FunctionSamples / (double)TotalCollectedSamples * 100.0; + + // If the function collected more samples than the hot threshold, mark + // it globally hot. + if (SamplesPercent >= SampleProfileGlobalHotThreshold) { + F.addFnAttr(llvm::Attribute::InlineHint); + std::string Msg; + raw_string_ostream S(Msg); + S << "Applied inline hint to globally hot function '" << F.getName() + << "' with " << format("%.2f", SamplesPercent) + << "% of samples (threshold: " + << format("%.2f", SampleProfileGlobalHotThreshold.getValue()) << "%)"; + S.flush(); + emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); + return true; + } + + // If the function collected fewer samples than the cold threshold, mark + // it globally cold. + if (SamplesPercent <= SampleProfileGlobalColdThreshold) { + F.addFnAttr(llvm::Attribute::Cold); + std::string Msg; + raw_string_ostream S(Msg); + S << "Applied cold hint to globally cold function '" << F.getName() + << "' with " << format("%.2f", SamplesPercent) + << "% of samples (threshold: " + << format("%.2f", SampleProfileGlobalColdThreshold.getValue()) << "%)"; + S.flush(); + emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); + return true; + } + + return false; +} + +/// \brief Iteratively inline hot callsites of a function. +/// +/// Iteratively traverse all callsites of the function \p F, and find if +/// the corresponding inlined instance exists and is hot in profile. 
If
+/// it is hot enough, inline the callsites and add new callsites of the
+/// callee into the caller.
+///
+/// TODO: investigate the possibility of not invoking InlineFunction directly.
+///
+/// \param F function to perform iterative inlining.
+///
+/// \returns True if any inlining happened.
+bool SampleProfileLoader::inlineHotFunctions(Function &F) {
+ bool Changed = false;
+ LLVMContext &Ctx = F.getContext();
+ while (true) {
+ bool LocalChanged = false;
+ SmallVector<CallInst *, 10> CIS;
+ for (auto &BB : F) {
+ for (auto &I : BB.getInstList()) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (CI && callsiteIsHot(Samples, findCalleeFunctionSamples(*CI)))
+ CIS.push_back(CI);
+ }
+ }
+ for (auto CI : CIS) {
+ InlineFunctionInfo IFI;
+ Function *CalledFunction = CI->getCalledFunction();
+ DebugLoc DLoc = CI->getDebugLoc();
+ uint64_t NumSamples = findCalleeFunctionSamples(*CI)->getTotalSamples();
+ if (InlineFunction(CI, IFI)) {
+ LocalChanged = true;
+ emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc,
+ Twine("inlined hot callee '") +
+ CalledFunction->getName() + "' with " +
+ Twine(NumSamples) + " samples into '" +
+ F.getName() + "'");
+ }
+ }
+ if (LocalChanged) {
+ Changed = true;
+ } else {
+ break;
+ }
+ }
+ return Changed;
+}
+
/// \brief Find equivalence classes for the given block.
///
/// This finds all the blocks that are guaranteed to execute the same
@@ -305,12 +733,13 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) {
void SampleProfileLoader::findEquivalencesFor(
 BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants,
 DominatorTreeBase<BasicBlock> *DomTree) {
- for (auto *BB2 : Descendants) {
+ const BasicBlock *EC = EquivalenceClass[BB1];
+ uint64_t Weight = BlockWeights[EC];
+ for (const auto *BB2 : Descendants) {
 bool IsDomParent = DomTree->dominates(BB2, BB1);
 bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
- if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent &&
- IsInSameLoop) {
- EquivalenceClass[BB2] = BB1;
+ if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
+ EquivalenceClass[BB2] = EC;

 // If BB2 is heavier than BB1, make BB2 have the same weight
 // as BB1.
@@ -320,11 +749,10 @@ void SampleProfileLoader::findEquivalencesFor(
 // during the propagation phase. Right now, we just want to
 // make sure that BB1 has the largest weight of all the
 // members of its equivalence set.
- unsigned &BB1Weight = BlockWeights[BB1];
- unsigned &BB2Weight = BlockWeights[BB2];
- BB1Weight = std::max(BB1Weight, BB2Weight);
+ Weight = std::max(Weight, BlockWeights[BB2]);
 }
 }
+ BlockWeights[EC] = Weight;
}

/// \brief Find equivalence classes.
@@ -364,19 +792,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {
 // class by making BB2's equivalence class be BB1.
 DominatedBBs.clear();
 DT->getDescendants(BB1, DominatedBBs);
- findEquivalencesFor(BB1, DominatedBBs, PDT->DT);
-
- // Repeat the same logic for all the blocks post-dominated by BB1.
- // We are looking for every basic block BB2 such that:
- //
- // 1- BB1 post-dominates BB2.
- // 2- BB2 dominates BB1.
- // 3- BB1 and BB2 are in the same loop nest.
- //
- // If all those conditions hold, BB2's equivalence class is BB1.
- DominatedBBs.clear();
- PDT->getDescendants(BB1, DominatedBBs);
- findEquivalencesFor(BB1, DominatedBBs, DT);
+ findEquivalencesFor(BB1, DominatedBBs, PDT.get());

 DEBUG(printBlockEquivalence(dbgs(), BB1));
 }
@@ -389,8 +805,8 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {
 // to all the blocks in that equivalence class. 
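// For example, if BB2 and BB3 both picked BB1 as their class representative
// and BlockWeights[BB1] settled at 100, the loop below records a weight of
// 100 for BB2 and BB3 as well.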
DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); for (auto &BI : F) { - BasicBlock *BB = &BI; - BasicBlock *EquivBB = EquivalenceClass[BB]; + const BasicBlock *BB = &BI; + const BasicBlock *EquivBB = EquivalenceClass[BB]; if (BB != EquivBB) BlockWeights[BB] = BlockWeights[EquivBB]; DEBUG(printBlockWeight(dbgs(), BB)); @@ -407,7 +823,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) { /// \param UnknownEdge Set if E has not been visited before. /// /// \returns E's weight, if known. Otherwise, return 0. -unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, +uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge) { if (!VisitedEdges.count(E)) { (*NumUnknownEdges)++; @@ -432,8 +848,9 @@ unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, bool SampleProfileLoader::propagateThroughEdges(Function &F) { bool Changed = false; DEBUG(dbgs() << "\nPropagation through edges\n"); - for (auto &BI : F) { - BasicBlock *BB = &BI; + for (const auto &BI : F) { + const BasicBlock *BB = &BI; + const BasicBlock *EC = EquivalenceClass[BB]; // Visit all the predecessor and successor edges to determine // which ones have a weight assigned already. Note that it doesn't @@ -441,7 +858,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { // only case we are interested in handling is when only a single // edge is unknown (see setEdgeOrBlockWeight). for (unsigned i = 0; i < 2; i++) { - unsigned TotalWeight = 0; + uint64_t TotalWeight = 0; unsigned NumUnknownEdges = 0; Edge UnknownEdge, SelfReferentialEdge; @@ -485,7 +902,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { // all edges will get a weight, or iteration will stop when // it reaches SampleProfileMaxPropagateIterations. if (NumUnknownEdges <= 1) { - unsigned &BBWeight = BlockWeights[BB]; + uint64_t &BBWeight = BlockWeights[EC]; if (NumUnknownEdges == 0) { // If we already know the weight of all edges, the weight of the // basic block can be computed. It should be no larger than the sum @@ -497,9 +914,9 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { << " known. Set weight for block: "; printBlockWeight(dbgs(), BB);); } - if (VisitedBlocks.insert(BB).second) + if (VisitedBlocks.insert(EC).second) Changed = true; - } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { + } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) { // If there is a single unknown edge and the block has been // visited, then we can compute E's weight. if (BBWeight >= TotalWeight) @@ -511,8 +928,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { DEBUG(dbgs() << "Set weight for edge: "; printEdgeWeight(dbgs(), UnknownEdge)); } - } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) { - unsigned &BBWeight = BlockWeights[BB]; + } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) { + uint64_t &BBWeight = BlockWeights[BB]; // We have a self-referential edge and the weight of BB is known. if (BBWeight >= TotalWeight) EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; @@ -578,7 +995,7 @@ void SampleProfileLoader::buildEdges(Function &F) { /// known). void SampleProfileLoader::propagateWeights(Function &F) { bool Changed = true; - unsigned i = 0; + unsigned I = 0; // Add an entry count to the function using the samples gathered // at the function entry. 
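// Editorial note: the single-unknown-edge rule used by propagateThroughEdges
// above is plain flow conservation: a block's weight equals the sum of its
// incident edge weights, so one missing edge is just the remainder. A small
// self-check (solveUnknownEdge is a hypothetical name, not the pass's API):
#include <cassert>
#include <cstdint>

uint64_t solveUnknownEdge(uint64_t BBWeight, uint64_t SumOfKnownEdges) {
  // Clamp at zero, as the pass does when rounded sample counts make the
  // known edges add up to more than the block weight.
  return BBWeight >= SumOfKnownEdges ? BBWeight - SumOfKnownEdges : 0;
}

int main() {
  // A block sampled 100 times with known outgoing edges of 60 and 25 leaves
  // 15 samples for the one remaining edge.
  assert(solveUnknownEdge(100, 60 + 25) == 15);
  return 0;
}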
@@ -592,14 +1009,15 @@ void SampleProfileLoader::propagateWeights(Function &F) { buildEdges(F); // Propagate until we converge or we go past the iteration limit. - while (Changed && i++ < SampleProfileMaxPropagateIterations) { + while (Changed && I++ < SampleProfileMaxPropagateIterations) { Changed = propagateThroughEdges(F); } // Generate MD_prof metadata for every branch instruction using the // edge weights computed during propagation. DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); - MDBuilder MDB(F.getContext()); + LLVMContext &Ctx = F.getContext(); + MDBuilder MDB(Ctx); for (auto &BI : F) { BasicBlock *BB = &BI; TerminatorInst *TI = BB->getTerminator(); @@ -610,24 +1028,44 @@ void SampleProfileLoader::propagateWeights(Function &F) { DEBUG(dbgs() << "\nGetting weights for branch at line " << TI->getDebugLoc().getLine() << ".\n"); - SmallVector<unsigned, 4> Weights; - bool AllWeightsZero = true; + SmallVector<uint32_t, 4> Weights; + uint32_t MaxWeight = 0; + DebugLoc MaxDestLoc; for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); Edge E = std::make_pair(BB, Succ); - unsigned Weight = EdgeWeights[E]; + uint64_t Weight = EdgeWeights[E]; DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); - Weights.push_back(Weight); - if (Weight != 0) - AllWeightsZero = false; + // Use uint32_t saturated arithmetic to adjust the incoming weights, + // if needed. Sample counts in profiles are 64-bit unsigned values, + // but internally branch weights are expressed as 32-bit values. + if (Weight > std::numeric_limits<uint32_t>::max()) { + DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); + Weight = std::numeric_limits<uint32_t>::max(); + } + Weights.push_back(static_cast<uint32_t>(Weight)); + if (Weight != 0) { + if (Weight > MaxWeight) { + MaxWeight = Weight; + MaxDestLoc = Succ->getFirstNonPHIOrDbgOrLifetime()->getDebugLoc(); + } + } } // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. - if (!AllWeightsZero) { + if (MaxWeight > 0) { DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + DebugLoc BranchLoc = TI->getDebugLoc(); + emitOptimizationRemark( + Ctx, DEBUG_TYPE, F, MaxDestLoc, + Twine("most popular destination for conditional branches at ") + + ((BranchLoc) ? Twine(BranchLoc->getFilename() + ":" + + Twine(BranchLoc.getLine()) + ":" + + Twine(BranchLoc.getCol())) + : Twine("<UNKNOWN LOCATION>"))); } else { DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); } @@ -649,7 +1087,7 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { if (DISubprogram *S = getDISubprogram(&F)) return S->getLine(); - // If could not find the start of \p F, emit a diagnostic to inform the user + // If the start of \p F is missing, emit a diagnostic to inform the user // about the missed opportunity. F.getContext().diagnose(DiagnosticInfoSampleProfile( "No debug information found in function " + F.getName() + @@ -658,6 +1096,17 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { return 0; } +void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { + DT.reset(new DominatorTree); + DT->recalculate(F); + + PDT.reset(new DominatorTreeBase<BasicBlock>(true)); + PDT->recalculate(F); + + LI.reset(new LoopInfo); + LI->analyze(*DT); +} + /// \brief Generate branch weight metadata for all branches in \p F. 
/// /// Branch weights are computed out of instruction samples using a @@ -710,18 +1159,23 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { bool SampleProfileLoader::emitAnnotations(Function &F) { bool Changed = false; - // Initialize invariants used during computation and propagation. - HeaderLineno = getFunctionLoc(F); - if (HeaderLineno == 0) + if (getFunctionLoc(F) == 0) return false; DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() - << ": " << HeaderLineno << "\n"); + << ": " << getFunctionLoc(F) << "\n"); + + Changed |= emitInlineHints(F); + + Changed |= inlineHotFunctions(F); // Compute basic block weights. Changed |= computeBlockWeights(F); if (Changed) { + // Compute dominance and loop info needed for propagation. + computeDominanceAndLoopInfo(F); + // Find equivalence classes. findEquivalenceClasses(F); @@ -729,24 +1183,48 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { propagateWeights(F); } + // If coverage checking was requested, compute it now. + if (SampleProfileRecordCoverage) { + unsigned Used = CoverageTracker.countUsedRecords(Samples); + unsigned Total = CoverageTracker.countBodyRecords(Samples); + unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); + if (Coverage < SampleProfileRecordCoverage) { + F.getContext().diagnose(DiagnosticInfoSampleProfile( + getDISubprogram(&F)->getFilename(), getFunctionLoc(F), + Twine(Used) + " of " + Twine(Total) + " available profile records (" + + Twine(Coverage) + "%) were applied", + DS_Warning)); + } + } + + if (SampleProfileSampleCoverage) { + uint64_t Used = CoverageTracker.getTotalUsedSamples(); + uint64_t Total = CoverageTracker.countBodySamples(Samples); + unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); + if (Coverage < SampleProfileSampleCoverage) { + F.getContext().diagnose(DiagnosticInfoSampleProfile( + getDISubprogram(&F)->getFilename(), getFunctionLoc(F), + Twine(Used) + " of " + Twine(Total) + " available profile samples (" + + Twine(Coverage) + "%) were applied", + DS_Warning)); + } + } return Changed; } char SampleProfileLoader::ID = 0; INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) bool SampleProfileLoader::doInitialization(Module &M) { - auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext()); + auto &Ctx = M.getContext(); + auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); if (std::error_code EC = ReaderOrErr.getError()) { std::string Msg = "Could not open profile: " + EC.message(); - M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); + Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); return false; } Reader = std::move(ReaderOrErr.get()); @@ -754,22 +1232,32 @@ bool SampleProfileLoader::doInitialization(Module &M) { return true; } -FunctionPass *llvm::createSampleProfileLoaderPass() { +ModulePass *llvm::createSampleProfileLoaderPass() { return new SampleProfileLoader(SampleProfileFile); } -FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { +ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { return new SampleProfileLoader(Name); } -bool SampleProfileLoader::runOnFunction(Function &F) { +bool 
SampleProfileLoader::runOnModule(Module &M) {
   if (!ProfileIsValid)
     return false;
 
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  PDT = &getAnalysis<PostDominatorTree>();
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  Ctx = &F.getParent()->getContext();
+  // Compute the total number of samples collected in this profile.
+  for (const auto &I : Reader->getProfiles())
+    TotalCollectedSamples += I.second.getTotalSamples();
+
+  bool retval = false;
+  for (auto &F : M)
+    if (!F.isDeclaration()) {
+      clearFunctionData();
+      retval |= runOnFunction(F);
+    }
+  return retval;
+}
+
+bool SampleProfileLoader::runOnFunction(Function &F) {
   Samples = Reader->getSamplesFor(F);
   if (!Samples->empty())
     return emitAnnotations(F);
diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
index 956991ad1f95..c94cc7c74a89 100644
--- a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -7,47 +7,31 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass loops over all of the functions in the input module, looking for 
+// This pass loops over all of the functions in the input module, looking for
 // dead declarations and removing them. Dead declarations are declarations of
 // functions for which no implementation is available (i.e., declarations for
 // unused library functions).
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "strip-dead-prototypes"
 
 STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
 
-namespace {
-
-/// @brief Pass to remove unused function declarations.
-class StripDeadPrototypesPass : public ModulePass {
-public:
-  static char ID; // Pass identification, replacement for typeid
-  StripDeadPrototypesPass() : ModulePass(ID) {
-    initializeStripDeadPrototypesPassPass(*PassRegistry::getPassRegistry());
-  }
-  bool runOnModule(Module &M) override;
-};
-
-} // end anonymous namespace
-
-char StripDeadPrototypesPass::ID = 0;
-INITIALIZE_PASS(StripDeadPrototypesPass, "strip-dead-prototypes",
-                "Strip Unused Function Prototypes", false, false)
-
-bool StripDeadPrototypesPass::runOnModule(Module &M) {
+static bool stripDeadPrototypes(Module &M) {
   bool MadeChange = false;
-  
+
   // Erase dead function prototypes.
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
-    Function *F = I++;
+    Function *F = &*I++;
     // Function must be a prototype and unused.
     if (F->isDeclaration() && F->use_empty()) {
       F->eraseFromParent();
@@ -59,16 +43,42 @@ bool StripDeadPrototypesPass::runOnModule(Module &M) {
   // Erase dead global var prototypes.
   for (Module::global_iterator I = M.global_begin(), E = M.global_end();
        I != E; ) {
-    GlobalVariable *GV = I++;
+    GlobalVariable *GV = &*I++;
     // Global must be a prototype and unused.
     if (GV->isDeclaration() && GV->use_empty())
       GV->eraseFromParent();
   }
-  
+
   // Return an indication of whether we changed anything or not.
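// Editorial sketch (not part of this patch): the `&*I++` pattern above exists
// because erasing the current symbol invalidates its iterator, so the iterator
// is advanced before the erase. The same idiom over a std::list; Proto and
// stripDead are illustrative stand-ins for the module's symbol lists.
#include <list>

struct Proto {
  bool IsDeclaration;
  bool Unused;
};

bool stripDead(std::list<Proto> &Symbols) {
  bool MadeChange = false;
  for (auto I = Symbols.begin(), E = Symbols.end(); I != E;) {
    auto Cur = I++; // advance first; erasing Cur must not touch I
    if (Cur->IsDeclaration && Cur->Unused) {
      Symbols.erase(Cur);
      MadeChange = true;
    }
  }
  return MadeChange;
}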
return MadeChange; } +PreservedAnalyses StripDeadPrototypesPass::run(Module &M) { + if (stripDeadPrototypes(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { + +class StripDeadPrototypesLegacyPass : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + StripDeadPrototypesLegacyPass() : ModulePass(ID) { + initializeStripDeadPrototypesLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + return stripDeadPrototypes(M); + } +}; + +} // end anonymous namespace + +char StripDeadPrototypesLegacyPass::ID = 0; +INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes", + "Strip Unused Function Prototypes", false, false) + ModulePass *llvm::createStripDeadPrototypesPass() { - return new StripDeadPrototypesPass(); + return new StripDeadPrototypesLegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index a4f30c58f936..46f352f7f9f1 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -211,13 +211,13 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) + if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage } for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) + if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo); @@ -305,6 +305,12 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { SmallVector<Metadata *, 64> LiveSubprograms; DenseSet<const MDNode *> VisitedSet; + std::set<DISubprogram *> LiveSPs; + for (Function &F : M) { + if (DISubprogram *SP = F.getSubprogram()) + LiveSPs.insert(SP); + } + for (DICompileUnit *DIC : F.compile_units()) { // Create our live subprogram list. bool SubprogramChange = false; @@ -314,7 +320,7 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { continue; // If the function referenced by DISP is not null, the function is live. 
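// Editorial sketch (not part of this patch) of the precompute-then-query
// shape the LiveSPs set above introduces and the hunk below applies;
// Subprogram, collectLive, and isLive are illustrative stand-ins, not LLVM
// API.
#include <set>
#include <vector>

struct Subprogram { bool HasBody; };

// One linear pass records the subprograms still attached to a definition
// (in the pass: F.getSubprogram() != nullptr for some Function F).
std::set<const Subprogram *>
collectLive(const std::vector<Subprogram> &Module) {
  std::set<const Subprogram *> Live;
  for (const Subprogram &SP : Module)
    if (SP.HasBody)
      Live.insert(&SP);
  return Live;
}

// Membership in the precomputed set then answers each liveness query,
// replacing a per-query walk of the IR.
bool isLive(const std::set<const Subprogram *> &Live, const Subprogram *SP) {
  return Live.count(SP) != 0;
}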
-    if (DISP->getFunction())
+    if (LiveSPs.count(DISP))
       LiveSubprograms.push_back(DISP);
     else
       SubprogramChange = true;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 2d2c109f3243..6f49399f57bf 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1,4 +1,4 @@
-//===- InstCombineAddSub.cpp ----------------------------------------------===//
+//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,6 +17,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/PatternMatch.h"
+
 using namespace llvm;
 using namespace PatternMatch;
 
@@ -67,17 +68,17 @@ namespace {
   private:
     bool insaneIntVal(int V) { return V > 4 || V < -4; }
-    APFloat *getFpValPtr(void)
+    APFloat *getFpValPtr()
       { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); }
-    const APFloat *getFpValPtr(void) const
+    const APFloat *getFpValPtr() const
       { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); }
 
-    const APFloat &getFpVal(void) const {
+    const APFloat &getFpVal() const {
       assert(IsFp && BufHasFpVal && "Incorrect state");
       return *getFpValPtr();
     }
 
-    APFloat &getFpVal(void) {
+    APFloat &getFpVal() {
       assert(IsFp && BufHasFpVal && "Incorrect state");
       return *getFpValPtr();
     }
@@ -92,8 +93,8 @@ namespace {
     // TODO: We should get rid of this function when APFloat can be constructed
     // from a *SIGNED* integer.
     APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
-  private:
+
+  private:
     bool IsFp;
 
     // True iff FpValBuf contains an instance of APFloat.
@@ -114,10 +115,10 @@ namespace {
   ///
   class FAddend {
   public:
-    FAddend() { Val = nullptr; }
+    FAddend() : Val(nullptr) {}
 
-    Value *getSymVal (void) const { return Val; }
-    const FAddendCoef &getCoef(void) const { return Coeff; }
+    Value *getSymVal() const { return Val; }
+    const FAddendCoef &getCoef() const { return Coeff; }
 
     bool isConstant() const { return Val == nullptr; }
     bool isZero() const { return Coeff.isZero(); }
@@ -182,7 +183,6 @@ namespace {
     InstCombiner::BuilderTy *Builder;
     Instruction *Instr;
 
-  private:
     // Debugging stuff is clustered here.
     #ifndef NDEBUG
       unsigned CreateInstrNum;
@@ -193,7 +193,8 @@ namespace {
       void incCreateInstNum() {}
     #endif
   };
-}
+
+} // anonymous namespace
 
 //===----------------------------------------------------------------------===//
 //
@@ -602,7 +603,6 @@ Value *FAddCombine::simplify(Instruction *I) {
 }
 
 Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
-
   unsigned AddendNum = Addends.size();
   assert(AddendNum <= 4 && "Too many addends");
 
@@ -886,7 +886,7 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero,
   return Op0ZeroPosition >= Op1OnePosition;
 }
 
-/// WillNotOverflowSignedAdd - Return true if we can prove that:
+/// Return true if we can prove that:
 ///   (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
 /// This basically requires proving that the add in the original type would not
 /// overflow to change the sign bit or have a carry out.
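// Editorial self-check (not part of this patch): the identity above holds
// exactly when the narrow add does not overflow in the signed sense. i8/i16
// stand in for the arbitrary source and extended types; narrowing through
// int8_t models the wrap of the narrow add on common implementations.
#include <cassert>
#include <cstdint>

int main() {
  int8_t A = 100, B = 27; // 127: no signed overflow, identity holds
  assert(int16_t(int8_t(A + B)) == int16_t(A) + int16_t(B));

  int8_t C = 100, D = 28; // 128 wraps to -128: identity breaks
  assert(int16_t(int8_t(C + D)) != int16_t(C) + int16_t(D));
  return 0;
}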
@@ -1118,8 +1118,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C, // transform them into (X + (signbit ^ C)) if (XorRHS->getValue().isSignBit()) - return BinaryOperator::CreateAdd(XorLHS, - ConstantExpr::getXor(XorRHS, CI)); + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); } } @@ -1421,7 +1421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { return Changed ? &I : nullptr; } - /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -1589,7 +1588,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } } - { Value *Y; // X-(X+Y) == -Y X-(Y+X) == -Y @@ -1611,32 +1609,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(A, B); } - // (sub (select (a, c, b)), (select (a, d, b))) -> (select (a, (sub c, d), 0)) - // (sub (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (sub c, d))) - if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { - if (auto *SI1 = dyn_cast<SelectInst>(Op1)) { - if (SI0->getCondition() == SI1->getCondition()) { - if (Value *V = SimplifySubInst( - SI0->getFalseValue(), SI1->getFalseValue(), I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), - Builder->CreateSub(SI0->getTrueValue(), SI1->getTrueValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap()), - V); - if (Value *V = SimplifySubInst(SI0->getTrueValue(), SI1->getTrueValue(), - I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), V, - Builder->CreateSub(SI0->getFalseValue(), SI1->getFalseValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap())); - } - } - } - if (Op0->hasOneUse()) { Value *Y = nullptr; // ((X | Y) - X) --> (~X & Y) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 15e0889b51b7..95c50d32c820 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -37,9 +37,9 @@ static inline Value *dyn_castNotVal(Value *V) { return nullptr; } -/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp -/// predicate into a three bit mask. It also returns whether it is an ordered -/// predicate by reference. +/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into +/// a three bit mask. It also returns whether it is an ordered predicate by +/// reference. static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { isOrdered = false; switch (CC) { @@ -64,10 +64,10 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { } } -/// getNewICmpValue - This is the complement of getICmpCode, which turns an -/// opcode and two operands into either a constant true or false, or a brand -/// new ICmp instruction. The sign is passed in to determine which kind -/// of predicate to use in the new icmp instruction. +/// This is the complement of getICmpCode, which turns an opcode and two +/// operands into either a constant true or false, or a brand new ICmp +/// instruction. 
The sign is passed in to determine which kind of predicate to
+/// use in the new icmp instruction.
 static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
                               InstCombiner::BuilderTy *Builder) {
   ICmpInst::Predicate NewPred;
@@ -76,9 +76,9 @@ static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
   return Builder->CreateICmp(NewPred, LHS, RHS);
 }
 
-/// getFCmpValue - This is the complement of getFCmpCode, which turns an
-/// opcode and two operands into either a FCmp instruction. isordered is passed
-/// in to determine which kind of predicate to use in the new fcmp instruction.
+/// This is the complement of getFCmpCode, which turns an opcode and two
+/// operands into a FCmp instruction. isordered is passed in to determine
+/// which kind of predicate to use in the new fcmp instruction.
 static Value *getFCmpValue(bool isordered, unsigned code,
                            Value *LHS, Value *RHS,
                            InstCombiner::BuilderTy *Builder) {
@@ -150,14 +150,13 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {
   else //if (Op == Instruction::Xor)
     BinOp = Builder->CreateXor(NewLHS, NewRHS);
 
-  Module *M = I.getParent()->getParent()->getParent();
-  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy);
+  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy);
   return Builder->CreateCall(F, BinOp);
 }
 
-// OptAndOp - This handles expressions of the form ((val OP C1) & C2). Where
-// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is
-// guaranteed to be a binary operator.
+/// This handles expressions of the form ((val OP C1) & C2), where the Op
+/// parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is
+/// guaranteed to be a binary operator.
 Instruction *InstCombiner::OptAndOp(Instruction *Op,
                                     ConstantInt *OpRHS,
                                     ConstantInt *AndRHS,
@@ -341,10 +340,10 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
   return Builder->CreateICmpUGT(Add, LowerBound);
 }
 
-// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with
-// any number of 0s on either side. The 1s are allowed to wrap from LSB to
-// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is
-// not, since all 1s are not contiguous.
+/// Returns true iff Val consists of one contiguous run of 1s with any number
+/// of 0s on either side. The 1s are allowed to wrap from LSB to MSB,
+/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is
+/// not, since all 1s are not contiguous.
 static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
   const APInt& V = Val->getValue();
   uint32_t BitWidth = Val->getType()->getBitWidth();
@@ -357,9 +356,8 @@ static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
   return true;
 }
 
-/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask,
-/// where isSub determines whether the operator is a sub. If we can fold one of
-/// the following xforms:
+/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines
+/// whether the operator is a sub. If we can fold one of the following xforms:
 ///
 /// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
 /// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
@@ -449,8 +447,8 @@ enum MaskedICmpType {
   FoldMskICmp_BMask_NotMixed = 512
 };
 
-/// return the set of pattern classes (from MaskedICmpType)
-/// that (icmp SCC (A & B), C) satisfies
+/// Return the set of pattern classes (from MaskedICmpType)
+/// that (icmp SCC (A & B), C) satisfies.
static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ICmpInst::Predicate SCC) { @@ -538,8 +536,8 @@ static unsigned conjugateICmpMask(unsigned Mask) { return NewMask; } -/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) -/// if possible. The returned predicate is either == or !=. Returns false if +/// Decompose an icmp into the form ((X & Y) pred Z) if possible. +/// The returned predicate is either == or !=. Returns false if /// decomposition fails. static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, Value *&X, Value *&Y, Value *&Z) { @@ -585,10 +583,9 @@ static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, return true; } -/// foldLogOpOfMaskedICmpsHelper: -/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// return the set of pattern classes (from MaskedICmpType) -/// that both LHS and RHS satisfy +/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// Return the set of pattern classes (from MaskedICmpType) +/// that both LHS and RHS satisfy. static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value*& B, Value*& C, Value*& D, Value*& E, @@ -700,9 +697,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC); return left_type & right_type; } -/// foldLogOpOfMaskedICmps: -/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// into a single (icmp(A & X) ==/!= Y) + +/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy *Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; @@ -879,7 +876,7 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, return Builder->CreateICmp(NewPred, Input, RangeEnd); } -/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. +/// Fold (icmp)&(icmp) if possible. Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1123,9 +1120,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { return nullptr; } -/// FoldAndOfFCmps - Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function. Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { @@ -1203,6 +1199,54 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return nullptr; } +/// Match De Morgan's Laws: +/// (~A & ~B) == (~(A | B)) +/// (~A | ~B) == (~(A & B)) +static Instruction *matchDeMorgansLaws(BinaryOperator &I, + InstCombiner::BuilderTy *Builder) { + auto Opcode = I.getOpcode(); + assert((Opcode == Instruction::And || Opcode == Instruction::Or) && + "Trying to match De Morgan's Laws with something other than and/or"); + // Flip the logic operation. + if (Opcode == Instruction::And) + Opcode = Instruction::Or; + else + Opcode = Instruction::And; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + // TODO: Use pattern matchers instead of dyn_cast. 
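// Editorial truth table for the two De Morgan forms matched here (1 = true):
//   A B | ~A&~B  ~(A|B) | ~A|~B  ~(A&B)
//   0 0 |   1      1    |   1      1
//   0 1 |   0      0    |   1      1
//   1 0 |   0      0    |   1      1
//   1 1 |   0      0    |   0      0
// For the zext disguise handled below, note that for an i1 A, (zext A) ^ 1 is
// exactly zext(~A), so the wide 'and'/'or' of the two xors equals the zext of
// the narrow ~(A | B) / ~(A & B) that the code rebuilds.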
+ if (Value *Op0NotVal = dyn_castNotVal(Op0)) + if (Value *Op1NotVal = dyn_castNotVal(Op1)) + if (Op0->hasOneUse() && Op1->hasOneUse()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, + I.getName() + ".demorgan"); + return BinaryOperator::CreateNot(LogicOp); + } + + // De Morgan's Law in disguise: + // (zext(bool A) ^ 1) & (zext(bool B) ^ 1) -> zext(~(A | B)) + // (zext(bool A) ^ 1) | (zext(bool B) ^ 1) -> zext(~(A & B)) + Value *A = nullptr; + Value *B = nullptr; + ConstantInt *C1 = nullptr; + if (match(Op0, m_OneUse(m_Xor(m_ZExt(m_Value(A)), m_ConstantInt(C1)))) && + match(Op1, m_OneUse(m_Xor(m_ZExt(m_Value(B)), m_Specific(C1))))) { + // TODO: This check could be loosened to handle different type sizes. + // Alternatively, we could fix the definition of m_Not to recognize a not + // operation hidden by a zext? + if (A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1) && + C1->isOne()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, A, B, + I.getName() + ".demorgan"); + Value *Not = Builder->CreateNot(LogicOp); + return CastInst::CreateZExtOrBitCast(Not, I.getType()); + } + } + + return nullptr; +} + Instruction *InstCombiner::visitAnd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -1273,6 +1317,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I)) return BinaryOperator::CreateAnd(V, AndRHS); + // -x & 1 -> x & 1 + if (AndRHSMask == 1 && match(Op0LHS, m_Zero())) + return BinaryOperator::CreateAnd(Op0RHS, AndRHS); + // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS // has 1's for all bits that the subtraction with A might affect. if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) { @@ -1329,15 +1377,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return NV; } - - // (~A & ~B) == (~(A | B)) - De Morgan's Law - if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal, - I.getName()+".demorgan"); - return BinaryOperator::CreateNot(Or); - } + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; @@ -1446,14 +1487,15 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return ReplaceInstUsesWith(I, Res); - // fold (and (cast A), (cast B)) -> (cast (and A, B)) - if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { + Value *Op0COp = Op0C->getOperand(0); + Type *SrcTy = Op0COp->getType(); + // fold (and (cast A), (cast B)) -> (cast (and A, B)) if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { - Type *SrcTy = Op0C->getOperand(0)->getType(); if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ? SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { - Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); + Value *Op1COp = Op1C->getOperand(0); // Only do this if the casts both really cause code to be generated. if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && @@ -1478,6 +1520,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } + // If we are masking off the sign bit of a floating-point value, convert + // this to the canonical fabs intrinsic call and cast back to integer. + // The backend should know how to optimize fabs(). 
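// Editorial worked example: for a float, CI->isMaxValue(true) matches the
// INT32_MAX mask 0x7FFFFFFF, and clearing bit 31 of the bit pattern is
// exactly fabs: bitcast(-3.5f) == 0xC0600000, and
// 0xC0600000 & 0x7FFFFFFF == 0x40600000, which bitcasts back to +3.5f.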
+  // TODO: This transform should also apply to vectors.
+  ConstantInt *CI;
+  if (isa<BitCastInst>(Op0C) && SrcTy->isFloatingPointTy() &&
+      match(Op1, m_ConstantInt(CI)) && CI->isMaxValue(true)) {
+    Module *M = I.getModule();
+    Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, SrcTy);
+    Value *Call = Builder->CreateCall(Fabs, Op0COp, "fabs");
+    return CastInst::CreateBitOrPointerCast(Call, I.getType());
+  }
+  }
+
   {
     Value *X = nullptr;
     bool OpsSwapped = false;
@@ -1509,163 +1565,195 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
   return Changed ? &I : nullptr;
 }
 
-/// CollectBSwapParts - Analyze the specified subexpression and see if it is
-/// capable of providing pieces of a bswap. The subexpression provides pieces
-/// of a bswap if it is proven that each of the non-zero bytes in the output of
-/// the expression came from the corresponding "byte swapped" byte in some other
-/// value. For example, if the current subexpression is "(shl i32 %X, 24)" then
-/// we know that the expression deposits the low byte of %X into the high byte
-/// of the bswap result and that all other bytes are zero. This expression is
-/// accepted, the high byte of ByteValues is set to X to indicate a correct
-/// match.
+
+/// Analyze the specified subexpression and see if it is capable of providing
+/// pieces of a bswap or bitreverse. The subexpression provides a potential
+/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
+/// the output of the expression came from a corresponding bit in some other
+/// value. This function is recursive, and the end result is a mapping of
+/// (value, bitnumber) to bitnumber. It is the caller's responsibility to
+/// validate that all `value`s are identical and that the bitnumber to bitnumber
+/// mapping is correct for a bswap or bitreverse.
+///
+/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
+/// that the expression deposits the low byte of %X into the high byte of the
+/// result and that all other bits are zero. This expression is accepted,
+/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7].
 ///
 /// This function returns true if the match was unsuccessful and false if it succeeded.
 /// On entry to the function the "OverallLeftShift" is a signed integer value
-/// indicating the number of bytes that the subexpression is later shifted. For
+/// indicating the number of bits that the subexpression is later shifted. For
 /// example, if the expression is later right shifted by 16 bits, the
-/// OverallLeftShift value would be -2 on entry. This is used to specify which
-/// byte of ByteValues is actually being set.
+/// OverallLeftShift value would be -16 on entry. This is used to specify which
+/// bits of BitValues are actually being set.
 ///
-/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding
-/// byte is masked to zero by a user. For example, in (X & 255), X will be
-/// processed with a bytemask of 1. Because bytemask is 32-bits, this limits
-/// this function to working on up to 32-byte (256 bit) values. ByteMask is
-/// always in the local (OverallLeftShift) coordinate space.
+/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding
+/// bit is masked to zero by a user. For example, in (X & 255), X will be
+/// processed with a bytemask of 255. BitMask is always in the local
+/// (OverallLeftShift) coordinate space.
/// -static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, - SmallVectorImpl<Value *> &ByteValues) { +static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask, + SmallVectorImpl<Value *> &BitValues, + SmallVectorImpl<int> &BitProvenance) { if (Instruction *I = dyn_cast<Instruction>(V)) { // If this is an or instruction, it may be an inner node of the bswap. - if (I->getOpcode() == Instruction::Or) { - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues) || - CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask, - ByteValues); - } - - // If this is a logical shift by a constant multiple of 8, recurse with - // OverallLeftShift and ByteMask adjusted. + if (I->getOpcode() == Instruction::Or) + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance) || + CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask, + BitValues, BitProvenance); + + // If this is a logical shift by a constant, recurse with OverallLeftShift + // and BitMask adjusted. if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { unsigned ShAmt = - cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); - // Ensure the shift amount is defined and of a byte value. - if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size())) + cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); + // Ensure the shift amount is defined. + if (ShAmt > BitValues.size()) return true; - unsigned ByteShift = ShAmt >> 3; + unsigned BitShift = ShAmt; if (I->getOpcode() == Instruction::Shl) { - // X << 2 -> collect(X, +2) - OverallLeftShift += ByteShift; - ByteMask >>= ByteShift; + // X << C -> collect(X, +C) + OverallLeftShift += BitShift; + BitMask = BitMask.lshr(BitShift); } else { - // X >>u 2 -> collect(X, -2) - OverallLeftShift -= ByteShift; - ByteMask <<= ByteShift; - ByteMask &= (~0U >> (32-ByteValues.size())); + // X >>u C -> collect(X, -C) + OverallLeftShift -= BitShift; + BitMask = BitMask.shl(BitShift); } - if (OverallLeftShift >= (int)ByteValues.size()) return true; - if (OverallLeftShift <= -(int)ByteValues.size()) return true; + if (OverallLeftShift >= (int)BitValues.size()) + return true; + if (OverallLeftShift <= -(int)BitValues.size()) + return true; - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } - // If this is a logical 'and' with a mask that clears bytes, clear the - // corresponding bytes in ByteMask. + // If this is a logical 'and' with a mask that clears bits, clear the + // corresponding bits in BitMask. if (I->getOpcode() == Instruction::And && isa<ConstantInt>(I->getOperand(1))) { - // Scan every byte of the and mask, seeing if the byte is either 0 or 255. - unsigned NumBytes = ByteValues.size(); - APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255); + unsigned NumBits = BitValues.size(); + APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); - for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) { - // If this byte is masked out by a later operation, we don't care what + for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) { + // If this bit is masked out by a later operation, we don't care what // the and mask is. - if ((ByteMask & (1 << i)) == 0) + if (BitMask[i] == 0) continue; - // If the AndMask is all zeros for this byte, clear the bit. 
- APInt MaskB = AndMask & Byte; + // If the AndMask is zero for this bit, clear the bit. + APInt MaskB = AndMask & Bit; if (MaskB == 0) { - ByteMask &= ~(1U << i); + BitMask.clearBit(i); continue; } - // If the AndMask is not all ones for this byte, it's not a bytezap. - if (MaskB != Byte) - return true; - - // Otherwise, this byte is kept. + // Otherwise, this bit is kept. } - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } } // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be - // the input value to the bswap. Some observations: 1) if more than one byte - // is demanded from this input, then it could not be successfully assembled - // into a byteswap. At least one of the two bytes would not be aligned with - // their ultimate destination. - if (!isPowerOf2_32(ByteMask)) return true; - unsigned InputByteNo = countTrailingZeros(ByteMask); - - // 2) The input and ultimate destinations must line up: if byte 3 of an i32 - // is demanded, it needs to go into byte 0 of the result. This means that the - // byte needs to be shifted until it lands in the right byte bucket. The - // shift amount depends on the position: if the byte is coming from the high - // part of the value (e.g. byte 3) then it must be shifted right. If from the - // low part, it must be shifted left. - unsigned DestByteNo = InputByteNo + OverallLeftShift; - if (ByteValues.size()-1-DestByteNo != InputByteNo) + // the input value to the bswap/bitreverse. To be part of a bswap or + // bitreverse we must be demanding a contiguous range of bits from it. + unsigned InputBitLen = BitMask.countPopulation(); + unsigned InputBitNo = BitMask.countTrailingZeros(); + if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo != + InputBitLen) + // Not a contiguous set range of bits! return true; - // If the destination byte value is already defined, the values are or'd - // together, which isn't a bswap (unless it's an or of the same bits). - if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V) + // We know we're moving a contiguous range of bits from the input to the + // output. Record which bits in the output came from which bits in the input. + unsigned DestBitNo = InputBitNo + OverallLeftShift; + for (unsigned I = 0; I < InputBitLen; ++I) + BitProvenance[DestBitNo + I] = InputBitNo + I; + + // If the destination bit value is already defined, the values are or'd + // together, which isn't a bswap/bitreverse (unless it's an or of the same + // bits). + if (BitValues[DestBitNo] && BitValues[DestBitNo] != V) return true; - ByteValues[DestByteNo] = V; + for (unsigned I = 0; I < InputBitLen; ++I) + BitValues[DestBitNo + I] = V; + return false; } -/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom. -/// If so, insert the new bswap intrinsic and return it. -Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { - IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); - if (!ITy || ITy->getBitWidth() % 16 || - // ByteMask only allows up to 32-byte values. - ITy->getBitWidth() > 32*8) - return nullptr; // Can only bswap pairs of bytes. Can't do vectors. +static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, + unsigned BitWidth) { + if (From % 8 != To % 8) + return false; + // Convert from bit indices to byte indices and check for a byte reversal. 
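// Editorial worked example: with BW == 32, bswap maps byte 0 to byte 3. For
// bit From == 5 (byte 0) and To == 29 (byte 3): From % 8 == To % 8 == 5, and
// after the >>= 3 conversions below the check reads 0 == 4 - 3 - 1, so this
// (From, To) pair is part of a valid byte reversal.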
+  From >>= 3;
+  To >>= 3;
+  BitWidth >>= 3;
+  return From == BitWidth - To - 1;
+}
 
-  /// ByteValues - For each byte of the result, we keep track of which value
-  /// defines each byte.
-  SmallVector<Value*, 8> ByteValues;
-  ByteValues.resize(ITy->getBitWidth()/8);
+static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
+                                               unsigned BitWidth) {
+  return From == BitWidth - To - 1;
+}
 
+/// Given an OR instruction, check to see if this is a bswap or bitreverse
+/// idiom. If so, insert the new intrinsic and return it.
+Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) {
+  IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
+  if (!ITy)
+    return nullptr;   // Can't do vectors.
+  unsigned BW = ITy->getBitWidth();
+
+  /// We keep track of which bit (BitProvenance) inside which value (BitValues)
+  /// defines each bit in the result.
+  SmallVector<Value *, 8> BitValues(BW, nullptr);
+  SmallVector<int, 8> BitProvenance(BW, -1);
 
   // Try to find all the pieces corresponding to the bswap.
-  uint32_t ByteMask = ~0U >> (32-ByteValues.size());
-  if (CollectBSwapParts(&I, 0, ByteMask, ByteValues))
+  APInt BitMask = APInt::getAllOnesValue(BitValues.size());
+  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance))
     return nullptr;
 
-  // Check to see if all of the bytes come from the same value.
-  Value *V = ByteValues[0];
-  if (!V) return nullptr;  // Didn't find a byte? Must be zero.
+  // Check to see if all of the bits come from the same value.
+  Value *V = BitValues[0];
+  if (!V) return nullptr;  // Didn't find a bit? Must be zero.
 
-  // Check to make sure that all of the bytes come from the same value.
-  for (unsigned i = 1, e = ByteValues.size(); i != e; ++i)
-    if (ByteValues[i] != V)
-      return nullptr;
-  Module *M = I.getParent()->getParent()->getParent();
-  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy);
+  if (!std::all_of(BitValues.begin(), BitValues.end(),
+                   [&](const Value *X) { return X == V; }))
+    return nullptr;
+
+  // Now, is the bit permutation correct for a bswap or a bitreverse? We can
+  // only byteswap values with an even number of bytes.
+  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
+  for (unsigned i = 0, e = BitValues.size(); i != e; ++i) {
+    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
+    OKForBitReverse &=
+        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
+  }
+
+  Intrinsic::ID Intrin;
+  if (OKForBSwap)
+    Intrin = Intrinsic::bswap;
+  else if (OKForBitReverse)
+    Intrin = Intrinsic::bitreverse;
+  else
+    return nullptr;
+
+  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy);
   return CallInst::Create(F, V);
 }
 
-/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D). Check
-/// If A is (cond?-1:0) and either B or D is ~(cond?-1,0) or (cond?0,-1), then
-/// we can simplify this expression to "cond ? C : D or B".
+/// We have an expression of the form (A&C)|(B&D). Check if A is (cond?-1:0)
+/// and either B or D is ~(cond?-1,0) or (cond?0,-1); if so, we can simplify
+/// this expression to "cond ? C : D or B".
 static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
                                          Value *C, Value *D) {
   // If A is not a select of -1/0, this cannot match.
@@ -1688,7 +1776,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
   return nullptr;
 }
 
-/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
+/// Fold (icmp)|(icmp) if possible.
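// Editorial worked example for the one-bit-apart equality fold in
// FoldOrOfICmps below: with C1 == 8 (0b1000) and C2 == 12 (0b1100),
// C1 ^ C2 == 4 is a power of two, so (A == 8 || A == 12) becomes
// (A | 4) == 12: A == 8 and A == 12 both yield 12, and any other A fails
// the compare.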
Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
                                   Instruction *CxtI) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
@@ -1905,14 +1993,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
     case ICmpInst::ICMP_EQ:
       if (LHS->getOperand(0) == RHS->getOperand(0)) {
         // if LHSCst and RHSCst differ only by one bit:
-        // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1
+        // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2
         assert(LHSCst->getValue().ule(RHSCst->getValue()));
         APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();
         if (Xor.isPowerOf2()) {
-          Value *NegCst = Builder->getInt(~Xor);
-          Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst);
-          return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst);
+          Value *Cst = Builder->getInt(Xor);
+          Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst);
+          return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst);
         }
       }
 
@@ -2020,9 +2108,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   return nullptr;
 }
 
-/// FoldOrOfFCmps - Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of
-/// instcombine, this returns a Value which should already be inserted into the
-/// function.
+/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns
+/// a Value which should already be inserted into the function.
 Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
   if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&
       RHS->getPredicate() == FCmpInst::FCMP_UNO &&
@@ -2080,7 +2167,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
   return nullptr;
 }
 
-/// FoldOrWithConstants - This helper function folds:
+/// This helper function folds:
 ///
 ///     ((A | B) & C1) | (B & C2)
 ///
@@ -2199,14 +2286,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   ConstantInt *C1 = nullptr, *C2 = nullptr;
 
   // (A | B) | C and A | (B | C)                 -> bswap if possible.
+  bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) ||
+                 match(Op1, m_Or(m_Value(), m_Value()));
   // (A >> B) | (C << D) and (A << B) | (C >> D) -> bswap if possible.
-  if (match(Op0, m_Or(m_Value(), m_Value())) ||
-      match(Op1, m_Or(m_Value(), m_Value())) ||
-      (match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
-       match(Op1, m_LogicalShift(m_Value(), m_Value())))) {
-    if (Instruction *BSwap = MatchBSwap(I))
+  bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
+                    match(Op1, m_LogicalShift(m_Value(), m_Value()));
+  // (A & B) | (C & D)                           -> bswap if possible.
+  bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) &&
+                  match(Op1, m_And(m_Value(), m_Value()));
+
+  if (OrOfOrs || OrOfShifts || OrOfAnds)
+    if (Instruction *BSwap = MatchBSwapOrBitReverse(I))
      return BSwap;
-  }
 
   // (X^C)|Y -> (X|Y)^C iff Y&C == 0
   if (Op0->hasOneUse() &&
@@ -2360,14 +2451,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
     return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
 
-  // (~A | ~B) == (~(A & B)) - De Morgan's Law
-  if (Value *Op0NotVal = dyn_castNotVal(Op0))
-    if (Value *Op1NotVal = dyn_castNotVal(Op1))
-      if (Op0->hasOneUse() && Op1->hasOneUse()) {
-        Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal,
-                                        I.getName()+".demorgan");
-        return BinaryOperator::CreateNot(And);
-      }
+  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+    return DeMorgan;
 
   // Canonicalize xor to the RHS.
bool SwappedForXor = false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6de380bcad67..e3634f269cf5 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -67,8 +67,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned CopyAlign = MI->getAlignment(); if (CopyAlign < MinAlign) { - MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), - MinAlign, false)); + MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false)); return MI; } @@ -198,12 +197,140 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return nullptr; } +static Value *SimplifyX86immshift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + return nullptr; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + LogicalShift = false; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + LogicalShift = true; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + LogicalShift = true; ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + // Simplify if count is constant. + auto Arg1 = II.getArgOperand(1); + auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); + auto CDV = dyn_cast<ConstantDataVector>(Arg1); + auto CInt = dyn_cast<ConstantInt>(Arg1); + if (!CAZ && !CDV && !CInt) + return nullptr; + + APInt Count(64, 0); + if (CDV) { + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + auto VT = cast<VectorType>(CDV->getType()); + unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); + assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); + unsigned NumSubElts = 64 / BitWidth; + + // Concatenate the sub-elements to create the 64-bit value. 
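// Editorial worked example of the concatenation in the loop below: for a
// <2 x i32> count operand {5, 0}, the loop reads the high sub-element first,
// giving Count == (0 << 32) | 5 == 5; only the low 64 bits of the 128-bit
// count vector ever contribute, matching the SSE2/AVX2 semantics noted above.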
+ for (unsigned i = 0; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); + Count = Count.shl(BitWidth); + Count |= SubElt->getValue().zextOrTrunc(64); + } + } + else if (CInt) + Count = CInt->getValue(); + + auto Vec = II.getArgOperand(0); + auto VT = cast<VectorType>(Vec->getType()); + auto SVT = VT->getElementType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If shift-by-zero then just return the original value. + if (Count == 0) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *SimplifyX86extend(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, + bool SignExtend) { + VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(II.getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign/zero extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0), + UndefValue::get(SrcTy), ShuffleMask); + return SignExtend ? Builder.CreateSExt(SV, DstTy) + : Builder.CreateZExt(SV, DstTy); +} + static Value *SimplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { VectorType *VecTy = cast<VectorType>(II.getType()); assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - + // The immediate permute control byte looks like this: // [3:0] - zero mask for each 32-bit lane // [5:4] - select one 32-bit destination lane @@ -248,12 +375,202 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II, // Replace the selected destination lane with the selected source lane. ShuffleMask[DestLane] = SourceLane + 4; } - + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); } return nullptr; } +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." 
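// Editorial worked example of the EXTRQ constant fold below: extracting
// Length == 8 bits at Index == 16 from 0x00000000AABBCCDD computes
// (Val >> 16) & 0xFF == 0xBB for the low lane and leaves the high lane
// undef; Index and Length are first truncated to their low six bits, as the
// quoted AMD text requires.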
+ APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt = Elt.lshr(Index).zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->equalsInt(0)) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. 
+ // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit /// source vectors, unless a zero bit is set. If a zero bit is set, /// then ignore that half of the mask and clear that half of the vector. @@ -289,7 +606,7 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // The high bit of the selection field chooses the 1st or 2nd operand. bool LowInputSelect = Imm & 0x02; bool HighInputSelect = Imm & 0x20; - + // The low bit of the selection field chooses the low or high half // of the selected operand. bool LowHalfSelect = Imm & 0x01; @@ -298,11 +615,11 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // Determine which operand(s) are actually in use for this instruction. Value *V0 = LowInputSelect ? 
II.getArgOperand(1) : II.getArgOperand(0); Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); - + // If needed, replace operands based on zero mask. V0 = LowHalfZero ? ZeroVector : V0; V1 = HighHalfZero ? ZeroVector : V1; - + // Permute low half of result. unsigned StartIndex = LowHalfSelect ? HalfSize : 0; for (unsigned i = 0; i < HalfSize; ++i) @@ -319,6 +636,43 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, return nullptr; } +/// Decode XOP integer vector comparison intrinsics. +static Value *SimplifyX86vpcom(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + uint64_t Imm = CInt->getZExtValue() & 0x7; + VectorType *VecTy = cast<VectorType>(II.getType()); + CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + + switch (Imm) { + case 0x0: + Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; + break; + case 0x1: + Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; + break; + case 0x2: + Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; + break; + case 0x3: + Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; + break; + case 0x4: + Pred = ICmpInst::ICMP_EQ; break; + case 0x5: + Pred = ICmpInst::ICMP_NE; break; + case 0x6: + return ConstantInt::getSigned(VecTy, 0); // FALSE + case 0x7: + return ConstantInt::getSigned(VecTy, -1); // TRUE + } + + if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), II.getArgOperand(1))) + return Builder.CreateSExtOrTrunc(Cmp, VecTy); + } + return nullptr; +} + /// visitCallInst - CallInst simplification. This mostly only handles folding /// of intrinsic instructions. For normal calls, it allows visitCallSite to do /// the heavy lifting. @@ -371,7 +725,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) { if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) if (GVSrc->isConstant()) { - Module *M = CI.getParent()->getParent()->getParent(); + Module *M = CI.getModule(); Intrinsic::ID MemCpyID = Intrinsic::memcpy; Type *Tys[3] = { CI.getArgOperand(0)->getType(), CI.getArgOperand(1)->getType(), @@ -400,6 +754,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } + auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth) + { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { @@ -427,6 +788,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::bitreverse: { + Value *IIOperand = II->getArgOperand(0); + Value *X = nullptr; + + // bitreverse(bitreverse(x)) -> x + if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) + return ReplaceInstUsesWith(CI, X); + break; + } + case Intrinsic::powi: if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // powi(x, 0) -> 1.0 @@ -669,6 +1040,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return new StoreInst(II->getArgOperand(0), Ptr); } break; + case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: @@ -682,6 +1054,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; + case Intrinsic::x86_vcvtph2ps_128: + case Intrinsic::x86_vcvtph2ps_256: { + auto Arg = 
II->getArgOperand(0); + auto ArgType = cast<VectorType>(Arg->getType()); + auto RetType = cast<VectorType>(II->getType()); + unsigned ArgWidth = ArgType->getNumElements(); + unsigned RetWidth = RetType->getNumElements(); + assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); + assert(ArgType->isIntOrIntVectorTy() && + ArgType->getScalarSizeInBits() == 16 && + "CVTPH2PS input type should be 16-bit integer vector"); + assert(RetType->getScalarType()->isFloatTy() && + "CVTPH2PS output type should be 32-bit float vector"); + + // Constant folding: Convert to generic half to single conversion. + if (isa<ConstantAggregateZero>(Arg)) + return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); + + if (isa<ConstantDataVector>(Arg)) { + auto VectorHalfAsShorts = Arg; + if (RetWidth < ArgWidth) { + SmallVector<int, 8> SubVecMask; + for (unsigned i = 0; i != RetWidth; ++i) + SubVecMask.push_back((int)i); + VectorHalfAsShorts = Builder->CreateShuffleVector( + Arg, UndefValue::get(ArgType), SubVecMask); + } + + auto VectorHalfType = + VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); + auto VectorHalfs = + Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType); + auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType); + return ReplaceInstUsesWith(*II, VectorFloats); + } + + // We only use the lowest lanes of the argument. + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { + II->setArgOperand(0, V); + return II; + } + break; + } + case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: case Intrinsic::x86_sse_cvttss2si: @@ -692,194 +1108,229 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_cvttsd2si64: { // These intrinsics only demand the 0th element of their input vectors. If // we can simplify the input based on that, do so now. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - APInt DemandedElts(VWidth, 1); - APInt UndefElts(VWidth, 0); - if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), - DemandedElts, UndefElts)) { + Value *Arg = II->getArgOperand(0); + unsigned VWidth = Arg->getType()->getVectorNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { II->setArgOperand(0, V); return II; } break; } - // Constant fold <A x Bi> << Ci. - // FIXME: We don't handle _dq because it's a shift of an i128, but is - // represented in the IR as <2 x i64>. A per element shift is wrong. - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: + // Constant fold ashr( <A x Bi>, Ci ). + // Constant fold lshr( <A x Bi>, Ci ). + // Constant fold shl( <A x Bi>, Ci ). 
+ case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: { - // Simplify if count is constant. To 0 if >= BitWidth, - // otherwise to shl/lshr. - auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1)); - auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1)); - if (!CDV && !CInt) - break; - ConstantInt *Count; - if (CDV) - Count = cast<ConstantInt>(CDV->getElementAsConstant(0)); - else - Count = CInt; - - auto Vec = II->getArgOperand(0); - auto VT = cast<VectorType>(Vec->getType()); - if (Count->getZExtValue() > - VT->getElementType()->getPrimitiveSizeInBits() - 1) - return ReplaceInstUsesWith( - CI, ConstantAggregateZero::get(Vec->getType())); - - bool isPackedShiftLeft = true; - switch (II->getIntrinsicID()) { - default : break; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break; - } - - unsigned VWidth = VT->getNumElements(); - // Get a constant vector of the same type as the first operand. - auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); - if (isPackedShiftLeft) - return BinaryOperator::CreateShl(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); - - return BinaryOperator::CreateLShr(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: { + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector + // operand to compute the shift amount. 
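Aside: the cases above all funnel into SimplifyX86immshift, whose count handling differs from IR shifts. A small self-contained C++ sketch of that behaviour (function names invented; signed >> is assumed to be an arithmetic shift, which every mainstream compiler provides and C++20 guarantees):

#include <cassert>
#include <cstdint>

// One lane of psrld: logical shift, a count >= the width yields zero.
uint32_t psrl_lane(uint32_t V, uint64_t Count) {
  return Count >= 32 ? 0 : V >> Count;
}

// One lane of psrad: arithmetic shift, the count clamps to width - 1.
int32_t psra_lane(int32_t V, uint64_t Count) {
  return V >> (Count >= 32 ? 31 : Count);
}

int main() {
  assert(psrl_lane(0xFFFFFFFFu, 40) == 0); // everything shifted out
  assert(psra_lane(-1, 40) == -1);         // clamped: sign bits remain
  assert(psrl_lane(0x80u, 4) == 0x8u);     // in-range counts are ordinary
  return 0;
}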
+ Value *Arg1 = II->getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = Arg1->getType()->getVectorNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + II->setArgOperand(1, V); + return II; + } + break; } - case Intrinsic::x86_sse41_pmovsxbw: - case Intrinsic::x86_sse41_pmovsxwd: - case Intrinsic::x86_sse41_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxbd: + case Intrinsic::x86_avx2_pmovsxbq: + case Intrinsic::x86_avx2_pmovsxbw: + case Intrinsic::x86_avx2_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxwd: + case Intrinsic::x86_avx2_pmovsxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_pmovzxbd: + case Intrinsic::x86_sse41_pmovzxbq: case Intrinsic::x86_sse41_pmovzxbw: + case Intrinsic::x86_sse41_pmovzxdq: case Intrinsic::x86_sse41_pmovzxwd: - case Intrinsic::x86_sse41_pmovzxdq: { - // pmov{s|z}x ignores the upper half of their input vectors. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - unsigned LowHalfElts = VWidth / 2; - APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts)); - APInt UndefElts(VWidth, 0); - if (Value *TmpV = SimplifyDemandedVectorElts( - II->getArgOperand(0), InputDemandedElts, UndefElts)) { - II->setArgOperand(0, TmpV); + case Intrinsic::x86_sse41_pmovzxwq: + case Intrinsic::x86_avx2_pmovzxbd: + case Intrinsic::x86_avx2_pmovzxbq: + case Intrinsic::x86_avx2_pmovzxbw: + case Intrinsic::x86_avx2_pmovzxdq: + case Intrinsic::x86_avx2_pmovzxwd: + case Intrinsic::x86_avx2_pmovzxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_insertps: + if (Value *V = SimplifyX86insertps(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CILength = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + II->setArgOperand(1, V); return II; } break; } - case Intrinsic::x86_sse41_insertps: - if (Value *V = SimplifyX86insertps(*II, *Builder)) + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. 
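Aside: the INSERTQ/INSERTQI cases below apply the same six-bit field rules in the insertion direction. A hypothetical scalar C++ model (illustrative only, not the patch's code):

#include <cassert>
#include <cstdint>

// Scalar model of INSERTQI: replace Length bits of Op0 at bit Index with
// the low Length bits of Op1. Length 0 means 64; overflow is undefined.
uint64_t insertqi_scalar(uint64_t Op0, uint64_t Op1, unsigned Len6,
                         unsigned Idx6) {
  unsigned Length = (Len6 & 0x3F) ? (Len6 & 0x3F) : 64;
  unsigned Index = Idx6 & 0x3F;
  assert(Index + Length <= 64 && "results are undefined past bit 63");
  uint64_t Low = Length == 64 ? ~0ULL : ((1ULL << Length) - 1);
  uint64_t Mask = Low << Index;
  return (Op0 & ~Mask) | ((Op1 & Low) << Index);
}

int main() {
  // Insert byte 0xEE over the second byte of 0xAABBCCDD.
  assert(insertqi_scalar(0xAABBCCDDULL, 0xEEULL, 8, 8) == 0xAABBEEDDULL);
  return 0;
}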
+ Value *Op0 = II->getArgOperand(0); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) return ReplaceInstUsesWith(*II, V); + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + Op1->getType()->getVectorNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. + if (CI11) { + APInt V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } break; - + } + case Intrinsic::x86_sse4a_insertqi: { - // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top - // ones undef - // TODO: eventually we should lower this intrinsic to IR - if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) { - if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) { - unsigned Index = CIStart->getZExtValue(); - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = CIWidth->equalsInt(0) ? 64 : CIWidth->getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if ((Index + Length) > 64) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - - if (Length == 64 && Index == 0) { - Value *Vec = II->getArgOperand(1); - Value *Undef = UndefValue::get(Vec->getType()); - const uint32_t Mask[] = { 0, 2 }; - return ReplaceInstUsesWith( - CI, - Builder->CreateShuffleVector( - Vec, Undef, ConstantDataVector::get( - II->getContext(), makeArrayRef(Mask)))); - - } else if (auto Source = - dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { - if (Source->hasOneUse() && - Source->getArgOperand(1) == II->getArgOperand(1)) { - // If the source of the insert has only one use and it's another - // insert (and they're both inserting from the same vector), try to - // bundle both together. 
- auto CISourceWidth = - dyn_cast<ConstantInt>(Source->getArgOperand(2)); - auto CISourceStart = - dyn_cast<ConstantInt>(Source->getArgOperand(3)); - if (CISourceStart && CISourceWidth) { - unsigned Start = CIStart->getZExtValue(); - unsigned Width = CIWidth->getZExtValue(); - unsigned End = Start + Width; - unsigned SourceStart = CISourceStart->getZExtValue(); - unsigned SourceWidth = CISourceWidth->getZExtValue(); - unsigned SourceEnd = SourceStart + SourceWidth; - unsigned NewStart, NewWidth; - bool ShouldReplace = false; - if (Start <= SourceStart && SourceStart <= End) { - NewStart = Start; - NewWidth = std::max(End, SourceEnd) - NewStart; - ShouldReplace = true; - } else if (SourceStart <= Start && Start <= SourceEnd) { - NewStart = SourceStart; - NewWidth = std::max(SourceEnd, End) - NewStart; - ShouldReplace = true; - } - - if (ShouldReplace) { - Constant *ConstantWidth = ConstantInt::get( - II->getArgOperand(2)->getType(), NewWidth, false); - Constant *ConstantStart = ConstantInt::get( - II->getArgOperand(3)->getType(), NewStart, false); - Value *Args[4] = { Source->getArgOperand(0), - II->getArgOperand(1), ConstantWidth, - ConstantStart }; - Module *M = CI.getParent()->getParent()->getParent(); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); - } - } - } - } - } + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + II->setArgOperand(1, V); + return II; } break; } @@ -894,7 +1345,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // This optimization is convoluted because the intrinsic is defined as // getting a vector of floats or doubles for the ps and pd versions. // FIXME: That should be changed. + + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); Value *Mask = II->getArgOperand(2); + + // fold (blend A, A, Mask) -> A + if (Op0 == Op1) + return ReplaceInstUsesWith(CI, Op0); + + // Zero Mask - select 1st argument. + if (isa<ConstantAggregateZero>(Mask)) + return ReplaceInstUsesWith(CI, Op0); + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 
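Aside: the blendv folds above and below depend only on the top bit of each mask lane, which is why an all-zero mask trivially selects the first operand. A minimal scalar model in C++ (names invented for illustration):

#include <cassert>
#include <cstdint>

// One 32-bit lane of blendvps: the mask's sign bit selects operand B.
uint32_t blendv_lane(uint32_t A, uint32_t B, uint32_t Mask) {
  return (Mask >> 31) ? B : A;
}

int main() {
  assert(blendv_lane(1, 2, 0x00000000u) == 1); // zero mask: first operand
  assert(blendv_lane(1, 2, 0x80000000u) == 2); // sign bit set: second
  assert(blendv_lane(1, 2, 0x7FFFFFFFu) == 1); // lower mask bits ignored
  return 0;
}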
if (auto C = dyn_cast<ConstantDataVector>(Mask)) { auto Tyi1 = Builder->getInt1Ty(); auto SelectorType = cast<VectorType>(Mask->getType()); @@ -917,11 +1381,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); } auto NewSelector = ConstantVector::get(Selectors); - return SelectInst::Create(NewSelector, II->getArgOperand(1), - II->getArgOperand(0), "blendv"); - } else { - break; + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); } + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: { + // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant. + auto *V = II->getArgOperand(1); + auto *VTy = cast<VectorType>(V->getType()); + unsigned NumElts = VTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32) && + "Unexpected number of elements in shuffle mask!"); + // Initialize the resulting shuffle mask to all zeroes. + uint32_t Indexes[32] = {0}; + + if (auto *Mask = dyn_cast<ConstantDataVector>(V)) { + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + int8_t Index = Mask->getElementAsInteger(I); + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index is the least significant 4 bits of the + // shuffle control byte. + Indexes[I] = (Index < 0) ? NumElts : Index & 0xF; + } + } else if (!isa<ConstantAggregateZero>(V)) + break; + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + for (unsigned I = 16; I < NumElts; ++I) + Indexes[I] += I & 0xF0; + + auto NewC = ConstantDataVector::get(V->getContext(), + makeArrayRef(Indexes, NumElts)); + auto V1 = II->getArgOperand(0); + auto V2 = Constant::getNullValue(II->getType()); + auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC); + return ReplaceInstUsesWith(CI, Shuffle); } case Intrinsic::x86_avx_vpermilvar_ps: @@ -972,6 +1475,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(*II, V); break; + case Intrinsic::x86_xop_vpcomb: + case Intrinsic::x86_xop_vpcomd: + case Intrinsic::x86_xop_vpcomq: + case Intrinsic::x86_xop_vpcomw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_xop_vpcomub: + case Intrinsic::x86_xop_vpcomud: + case Intrinsic::x86_xop_vpcomuq: + case Intrinsic::x86_xop_vpcomuw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -1115,15 +1634,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // happen when variable allocas are DCE'd. if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { if (SS->getIntrinsicID() == Intrinsic::stacksave) { - BasicBlock::iterator BI = SS; - if (&*++BI == II) + if (&*++SS->getIterator() == II) return EraseInstFromFunction(CI); } } // Scan down this block to see if there is another stack restore in the // same block without an intervening call/alloca. 
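Aside: the pshufb fold above encodes the SSSE3 lane rule directly into a shufflevector mask: bit 7 of a control byte zeroes the lane, otherwise its low four bits index the source. A standalone C++ model of one 128-bit lane (the function name is made up):

#include <cassert>
#include <cstdint>

// 16-byte pshufb: negative control bytes (bit 7 set) produce zero.
void pshufb_128(const uint8_t Src[16], const int8_t Ctl[16],
                uint8_t Dst[16]) {
  for (int I = 0; I != 16; ++I)
    Dst[I] = (Ctl[I] < 0) ? 0 : Src[Ctl[I] & 0xF];
}

int main() {
  uint8_t Src[16], Dst[16];
  int8_t Ctl[16];
  for (int I = 0; I != 16; ++I) {
    Src[I] = (uint8_t)(I * 3);
    Ctl[I] = (int8_t)(15 - I); // reverse the vector...
  }
  Ctl[0] = -1;                 // ...but force lane 0 to zero
  pshufb_128(Src, Ctl, Dst);
  assert(Dst[0] == 0 && Dst[1] == Src[14]);
  return 0;
}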
- BasicBlock::iterator BI = II; + BasicBlock::iterator BI(II); TerminatorInst *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { @@ -1153,6 +1671,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return EraseInstFromFunction(CI); break; } + case Intrinsic::lifetime_start: { + // Remove trivially empty lifetime_start/end ranges, i.e. a start + // immediately followed by an end (ignoring debuginfo or other + // lifetime markers in between). + BasicBlock::iterator BI = II->getIterator(), BE = II->getParent()->end(); + for (++BI; BI != BE; ++BI) { + if (IntrinsicInst *LTE = dyn_cast<IntrinsicInst>(BI)) { + if (isa<DbgInfoIntrinsic>(LTE) || + LTE->getIntrinsicID() == Intrinsic::lifetime_start) + continue; + if (LTE->getIntrinsicID() == Intrinsic::lifetime_end) { + if (II->getOperand(0) == LTE->getOperand(0) && + II->getOperand(1) == LTE->getOperand(1)) { + EraseInstFromFunction(*LTE); + return EraseInstFromFunction(*II); + } + continue; + } + } + break; + } + break; + } case Intrinsic::assume: { // Canonicalize assume(a && b) -> assume(a); assume(b); // Note: New assumption intrinsics created here are registered by @@ -1233,7 +1774,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } // isKnownNonNull -> nonnull attribute - if (isKnownNonNull(DerivedPtr)) + if (isKnownNonNullAt(DerivedPtr, II, DT, TLI)) II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); // isDereferenceablePointer -> deref attribute @@ -1355,9 +1896,10 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp, Value *TrampMem) { // Visit all the previous instructions in the basic block, and try to find a // init.trampoline which has a direct path to the adjust.trampoline. - for (BasicBlock::iterator I = AdjustTramp, - E = AdjustTramp->getParent()->begin(); I != E; ) { - Instruction *Inst = --I; + for (BasicBlock::iterator I = AdjustTramp->getIterator(), + E = AdjustTramp->getParent()->begin(); + I != E;) { + Instruction *Inst = &*--I; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) if (II->getIntrinsicID() == Intrinsic::init_trampoline && II->getOperand(0) == TrampMem) @@ -1400,20 +1942,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. This is helpful for inlining calls to functions with null // checks on their arguments. + SmallVector<unsigned, 4> Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { - if (!CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && - isKnownNonNull(V)) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && + isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); + if (!Indices.empty()) { + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, + Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + Changed = true; + } + // If the callee is a pointer to a function, attempt to move any casts to the // arguments of the call/invoke. 
Value *Callee = CS.getCalledValue(); @@ -1725,16 +2274,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(), attrVec); + SmallVector<OperandBundleDef, 1> OpBundles; + CS.getOperandBundlesAsDefs(OpBundles); + Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { - NC = Builder->CreateInvoke(Callee, II->getNormalDest(), - II->getUnwindDest(), Args); + NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles); NC->takeName(II); cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv()); cast<InvokeInst>(NC)->setAttributes(NewCallerPAL); } else { CallInst *CI = cast<CallInst>(Caller); - NC = Builder->CreateCall(Callee, Args); + NC = Builder->CreateCall(Callee, Args, OpBundles); NC->takeName(CI); if (CI->isTailCall()) cast<CallInst>(NC)->setTailCall(); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 48ab0eb2c1b9..da835a192322 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -21,11 +21,11 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear -/// expression. If so, decompose it, returning some value X, such that Val is +/// Analyze 'Val', seeing if it is a simple linear expression. +/// If so, decompose it, returning some value X, such that Val is /// X*Scale+Offset. /// -static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, +static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale, uint64_t &Offset) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { Offset = CI->getZExtValue(); @@ -62,7 +62,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, // where C1 is divisible by C2. unsigned SubScale; Value *SubVal = - DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); + decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); Offset += RHS->getZExtValue(); Scale = SubScale; return SubVal; @@ -76,14 +76,14 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, return Val; } -/// PromoteCastOfAllocation - If we find a cast of an allocation instruction, -/// try to eliminate the cast by moving the type information into the alloc. +/// If we find a cast of an allocation instruction, try to eliminate the cast by +/// moving the type information into the alloc. Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { PointerType *PTy = cast<PointerType>(CI.getType()); BuilderTy AllocaBuilder(*Builder); - AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); + AllocaBuilder.SetInsertPoint(&AI); // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); @@ -114,7 +114,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, unsigned ArraySizeScale; uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. - DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); + decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. 
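Aside: the divisibility requirement behind PromoteCastOfAllocation's "xform" can be sanity-checked with plain integer arithmetic. A toy C++ check, where the concrete numbers model an `alloca i8, (4*n + 8)` bitcast to i32* becoming `alloca i32, (n + 2)`; all names are illustrative:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t CastElTySize = 4;      // i32, in bytes
  const uint64_t Scale = 4, Offset = 8; // byte count = 4*n + 8
  // The transform needs scale and offset to stay whole i32 counts.
  assert(Scale % CastElTySize == 0 && Offset % CastElTySize == 0);
  for (uint64_t N = 0; N != 1000; ++N) {
    uint64_t Bytes = Scale * N + Offset;
    uint64_t NewCount = (Scale / CastElTySize) * N + Offset / CastElTySize;
    assert(Bytes == NewCount * CastElTySize); // same storage, i32 units
  }
  return 0;
}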
@@ -154,9 +154,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, return ReplaceInstUsesWith(CI, New); } -/// EvaluateInDifferentType - Given an expression that -/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually -/// insert the code to evaluate the expression. +/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns +/// true for, actually insert the code to evaluate the expression. Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { @@ -261,9 +260,9 @@ isEliminableCastPair(const CastInst *CI, ///< First cast instruction return Instruction::CastOps(Res); } -/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually -/// results in any code being generated and is interesting to optimize out. If -/// the cast can be eliminated by some other simple transformation, we prefer +/// Return true if the cast from "V to Ty" actually results in any code being +/// generated and is interesting to optimize out. +/// If the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, Type *Ty) { @@ -318,9 +317,9 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { return nullptr; } -/// CanEvaluateTruncated - Return true if we can evaluate the specified -/// expression tree as type Ty instead of its larger type, and arrive with the -/// same value. This is used by code that tries to eliminate truncates. +/// Return true if we can evaluate the specified expression tree as type Ty +/// instead of its larger type, and arrive with the same value. +/// This is used by code that tries to eliminate truncates. /// /// Ty will always be a type smaller than V. We should return true if trunc(V) /// can be computed by computing V in the smaller type. If V is an instruction, @@ -329,7 +328,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, Instruction *CxtI) { // We can always evaluate constants in another type. if (isa<Constant>(V)) @@ -359,8 +358,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, case Instruction::Or: case Instruction::Xor: // These operators can all arbitrarily be extended or truncated. 
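Aside: canEvaluateTruncated is sound for those operators because arithmetic modulo 2^N commutes with add, sub, mul and the bitwise ops, so the truncate can be pushed through them. A quick C++ spot check of that identity:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 100000; X += 97)
    for (uint32_t Y = 0; Y < 100000; Y += 89) {
      assert((uint8_t)(X + Y) == (uint8_t)((uint8_t)X + (uint8_t)Y));
      assert((uint8_t)(X * Y) == (uint8_t)((uint8_t)X * (uint8_t)Y));
      assert((uint8_t)(X ^ Y) == (uint8_t)((uint8_t)X ^ (uint8_t)Y));
    }
  return 0;
}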
- return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); case Instruction::UDiv: case Instruction::URem: { @@ -371,8 +370,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth); if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) && IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); } } break; @@ -383,7 +382,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { uint32_t BitWidth = Ty->getScalarSizeInBits(); if (CI->getLimitedValue(BitWidth) < BitWidth) - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } break; case Instruction::LShr: @@ -396,7 +395,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (IC.MaskedValueIsZero(I->getOperand(0), APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) && CI->getLimitedValue(BitWidth) < BitWidth) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } } break; @@ -410,8 +409,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return true; case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); - return CanEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && - CanEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); + return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && + canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); } case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never @@ -419,7 +418,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, // instructions with a single use. PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!CanEvaluateTruncated(IncValue, Ty, IC, CxtI)) + if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI)) return false; return true; } @@ -431,6 +430,50 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return false; } +/// Given a vector that is bitcast to an integer, optionally logically +/// right-shifted, and truncated, convert it to an extractelement. 
+/// Example (big endian): +/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32 +/// ---> +/// extractelement <4 x i32> %X, 1 +static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC, + const DataLayout &DL) { + Value *TruncOp = Trunc.getOperand(0); + Type *DestType = Trunc.getType(); + if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType)) + return nullptr; + + Value *VecInput = nullptr; + ConstantInt *ShiftVal = nullptr; + if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)), + m_LShr(m_BitCast(m_Value(VecInput)), + m_ConstantInt(ShiftVal)))) || + !isa<VectorType>(VecInput->getType())) + return nullptr; + + VectorType *VecType = cast<VectorType>(VecInput->getType()); + unsigned VecWidth = VecType->getPrimitiveSizeInBits(); + unsigned DestWidth = DestType->getPrimitiveSizeInBits(); + unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0; + + if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0)) + return nullptr; + + // If the element type of the vector doesn't match the result type, + // bitcast it to a vector type that we can extract from. + unsigned NumVecElts = VecWidth / DestWidth; + if (VecType->getElementType() != DestType) { + VecType = VectorType::get(DestType, NumVecElts); + VecInput = IC.Builder->CreateBitCast(VecInput, VecType, "bc"); + } + + unsigned Elt = ShiftAmount / DestWidth; + if (DL.isBigEndian()) + Elt = NumVecElts - 1 - Elt; + + return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); +} + Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (Instruction *Result = commonCastTransforms(CI)) return Result; @@ -441,7 +484,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // min/max. Value *LHS, *RHS; if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0))) - if (matchSelectPattern(SI, LHS, RHS) != SPF_UNKNOWN) + if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN) return nullptr; // See if we can simplify any instructions used by the input whose sole @@ -457,7 +500,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateTruncated(Src, DestTy, *this, &CI)) { + canEvaluateTruncated(Src, DestTy, *this, &CI)) { // If this cast is a truncate, evaluting in a different type always // eliminates the cast, so it is always a win. @@ -470,7 +513,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector. if (DestTy->getScalarSizeInBits() == 1) { - Constant *One = ConstantInt::get(Src->getType(), 1); + Constant *One = ConstantInt::get(SrcTy, 1); Src = Builder->CreateAnd(Src, One); Value *Zero = Constant::getNullValue(Src->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); @@ -489,31 +532,54 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // If the shift amount is larger than the size of A, then the result is // known to be zero because all the input bits got shifted out. if (Cst->getZExtValue() >= ASize) - return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType())); + return ReplaceInstUsesWith(CI, Constant::getNullValue(DestTy)); // Since we're doing an lshr and a zero extend, and know that the shift // amount is smaller than ASize, it is always safe to do the shift in A's // type, then zero extend or truncate to the result. 
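Aside: the lane arithmetic in foldVecTruncToExtElt above, pulled out in isolation. A hypothetical plain-C++ helper mirroring the Elt computation, including the big-endian flip:

#include <cassert>

// Which vector lane does trunc(lshr(bitcast, Shift)) read?
unsigned laneForShift(unsigned VecBits, unsigned EltBits, unsigned Shift,
                      bool BigEndian) {
  assert(VecBits % EltBits == 0 && Shift % EltBits == 0);
  unsigned NumElts = VecBits / EltBits;
  unsigned Elt = Shift / EltBits;
  return BigEndian ? NumElts - 1 - Elt : Elt;
}

int main() {
  // trunc(lshr(bitcast <4 x i32> to i128, 64)) to i32:
  assert(laneForShift(128, 32, 64, /*BigEndian=*/false) == 2);
  assert(laneForShift(128, 32, 64, /*BigEndian=*/true) == 1);
  return 0;
}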
Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue()); Shift->takeName(Src); - return CastInst::CreateIntegerCast(Shift, CI.getType(), false); + return CastInst::CreateIntegerCast(Shift, DestTy, false); + } + + // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type + // conversion. + // It works because bits coming from sign extension have the same value as + // the sign bit of the original value; performing ashr instead of lshr + // generates bits of the same value as the sign bit. + if (Src->hasOneUse() && + match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && + cast<Instruction>(Src)->getOperand(0)->hasOneUse()) { + const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + // This optimization can be only performed when zero bits generated by + // the original lshr aren't pulled into the value after truncation, so we + // can only shift by values smaller than the size of destination type (in + // bits). + if (Cst->getValue().ult(ASize)) { + Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); + Shift->takeName(Src); + return CastInst::CreateIntegerCast(Shift, CI.getType(), true); + } } // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest // type isn't non-native. - if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && - ShouldChangeType(Src->getType(), CI.getType()) && + if (Src->hasOneUse() && isa<IntegerType>(SrcTy) && + ShouldChangeType(SrcTy, DestTy) && match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) { - Value *NewTrunc = Builder->CreateTrunc(A, CI.getType(), A->getName()+".tr"); + Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr"); return BinaryOperator::CreateAnd(NewTrunc, - ConstantExpr::getTrunc(Cst, CI.getType())); + ConstantExpr::getTrunc(Cst, DestTy)); } + if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL)) + return I; + return nullptr; } -/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (zext icmp) to bitwise / integer operations in order to eliminate +/// the icmp. Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, bool DoXform) { // If we are just checking for a icmp eq of a single bit and zext'ing it @@ -637,8 +703,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, return nullptr; } -/// CanEvaluateZExtd - Determine if the specified value can be computed in the -/// specified wider type and produce the same low bits. If not, return false. +/// Determine if the specified value can be computed in the specified wider type +/// and produce the same low bits. If not, return false. /// /// If this function returns true, it can also return a non-zero number of bits /// (in BitsToClear) which indicates that the value it computes is correct for @@ -655,7 +721,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, /// clear the top bits anyway, doing this has no extra cost. /// /// This function works on both vectors and scalars. 
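Aside: the trunc(lshr(sext A), Cst) -> ashr fold above can be spot-checked exhaustively for i8 in plain C++ (assuming the usual arithmetic-shift behaviour of signed >>, guaranteed since C++20):

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V != 128; ++V)
    for (unsigned C = 0; C != 8; ++C) {     // shift < destination width
      int8_t A = (int8_t)V;
      uint32_t Wide = (uint32_t)(int32_t)A; // sext A to i32
      int8_t ViaLshr = (int8_t)(Wide >> C); // trunc(lshr(sext A, C))
      int8_t ViaAshr = (int8_t)(A >> C);    // ashr A, C, then trunc
      assert(ViaLshr == ViaAshr);
    }
  return 0;
}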
-static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, +static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, InstCombiner &IC, Instruction *CxtI) { BitsToClear = 0; if (isa<Constant>(V)) @@ -685,8 +751,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::Add: case Instruction::Sub: case Instruction::Mul: - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) return false; // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) @@ -713,7 +779,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote shl(x, cst) if we can promote x. Since shl overwrites the // upper bits we can reduce BitsToClear by the shift amount. if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; uint64_t ShiftAmt = Amt->getZExtValue(); BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; @@ -724,7 +790,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; BitsToClear += Amt->getZExtValue(); if (BitsToClear > V->getType()->getScalarSizeInBits()) @@ -734,8 +800,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // Cannot promote variable LSHR. return false; case Instruction::Select: - if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || + if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear are // known zero in the disagreeing side. Tmp != BitsToClear) @@ -747,10 +813,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast<PHINode>(I); - if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) return false; for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || + if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear // are known zero in the disagreeing input. Tmp != BitsToClear) @@ -787,13 +853,13 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // strange. 
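Aside: the "bits to clear" idea that drives canEvaluateZExtd, shown on concrete types. Evaluating a zext'd shl in the wide type and then masking off the bits the narrow type would have discarded reproduces the narrow computation (a hypothetical illustration, not the patch's code):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X != 0x10000u; ++X) {
    // zext i16 -> i32 of (x << 3), computed the narrow way...
    uint32_t Narrow = (uint32_t)(uint16_t)(X << 3);
    // ...and computed wide, then clearing the discarded top bits.
    uint32_t Wide = (X << 3) & 0xFFFFu;
    assert(Narrow == Wide);
  }
  return 0;
}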
unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { + canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid zero extend: " << CI); + " to avoid zero extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); @@ -897,8 +963,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { return nullptr; } -/// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp. Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1); ICmpInst::Predicate Pred = ICI->getPredicate(); @@ -985,15 +1050,14 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { return nullptr; } -/// CanEvaluateSExtd - Return true if we can take the specified value -/// and return it as type Ty without inserting any new casts and without -/// changing the value of the common low bits. This is used by code that tries -/// to promote integer operations to a wider types will allow us to eliminate -/// the extension. +/// Return true if we can take the specified value and return it as type Ty +/// without inserting any new casts and without changing the value of the common +/// low bits. This is used by code that tries to promote integer operations to +/// a wider types will allow us to eliminate the extension. /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateSExtd(Value *V, Type *Ty) { +static bool canEvaluateSExtd(Value *V, Type *Ty) { assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() && "Can't sign extend type to a smaller type"); // If this is a constant, it can be trivially promoted. @@ -1023,15 +1087,15 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { case Instruction::Sub: case Instruction::Mul: // These operators can all arbitrarily be extended if their inputs can. - return CanEvaluateSExtd(I->getOperand(0), Ty) && - CanEvaluateSExtd(I->getOperand(1), Ty); + return canEvaluateSExtd(I->getOperand(0), Ty) && + canEvaluateSExtd(I->getOperand(1), Ty); //case Instruction::Shl: TODO //case Instruction::LShr: TODO case Instruction::Select: - return CanEvaluateSExtd(I->getOperand(1), Ty) && - CanEvaluateSExtd(I->getOperand(2), Ty); + return canEvaluateSExtd(I->getOperand(1), Ty) && + canEvaluateSExtd(I->getOperand(2), Ty); case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never @@ -1039,7 +1103,7 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // instructions with a single use. PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!CanEvaluateSExtd(IncValue, Ty)) return false; + if (!canEvaluateSExtd(IncValue, Ty)) return false; return true; } default: @@ -1081,10 +1145,10 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateSExtd(Src, DestTy)) { + canEvaluateSExtd(Src, DestTy)) { // Okay, we can transform this! 
Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid sign extend: " << CI); + " to avoid sign extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, true); assert(Res->getType() == DestTy); @@ -1149,9 +1213,9 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { } -/// FitsInFPType - Return a Constant* for the specified FP constant if it fits +/// Return a Constant* for the specified floating-point constant if it fits /// in the specified FP type without changing its value. -static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { bool losesInfo; APFloat F = CFP->getValueAPF(); (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); @@ -1160,12 +1224,12 @@ static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { return nullptr; } -/// LookThroughFPExtensions - If this is an fp extension instruction, look +/// If this is a floating-point extension instruction, look /// through it until we get the source value. -static Value *LookThroughFPExtensions(Value *V) { +static Value *lookThroughFPExtensions(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::FPExt) - return LookThroughFPExtensions(I->getOperand(0)); + return lookThroughFPExtensions(I->getOperand(0)); // If this value is a constant, return the constant in the smallest FP type // that can accurately represent it. This allows us to turn @@ -1174,14 +1238,14 @@ static Value *LookThroughFPExtensions(Value *V) { if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext())) return V; // No constant folding of this. // See if the value can be truncated to half and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEhalf)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf)) return V; // See if the value can be truncated to float and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle)) return V; if (CFP->getType()->isDoubleTy()) return V; // Won't shrink. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble)) return V; // Don't try to shrink to various long double types. } @@ -1193,7 +1257,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to - // simpilify this expression to avoid one or more of the trunc/extend + // simplify this expression to avoid one or more of the trunc/extend // operations if we can do so without changing the numerical results. // // The exact manner in which the widths of the operands interact to limit @@ -1201,8 +1265,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // is explained below in the various case statements. 
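Aside: fitsInFPType's exactness test above (an APFloat conversion that must not lose information) corresponds to a simple round-trip check in ordinary C++:

#include <cassert>

int main() {
  double Exact = 0.5;   // representable in float: can be shrunk
  double Inexact = 0.1; // not exactly representable in float
  assert((double)(float)Exact == Exact);
  assert((double)(float)Inexact != Inexact);
  return 0;
}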
BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0)); if (OpI && OpI->hasOneUse()) { - Value *LHSOrig = LookThroughFPExtensions(OpI->getOperand(0)); - Value *RHSOrig = LookThroughFPExtensions(OpI->getOperand(1)); + Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0)); + Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1)); unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth(); unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth(); @@ -1307,10 +1371,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // (fptrunc (select cond, R1, Cst)) --> // (select cond, (fptrunc R1), (fptrunc Cst)) + // + // - but only if this isn't part of a min/max operation, else we'll + // ruin min/max canonical form which is to have the select and + // compare's operands be of the same type with no casts to look through. + Value *LHS, *RHS; SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)); if (SI && (isa<ConstantFP>(SI->getOperand(1)) || - isa<ConstantFP>(SI->getOperand(2)))) { + isa<ConstantFP>(SI->getOperand(2))) && + matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) { Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1), CI.getType()); Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2), @@ -1327,9 +1397,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0), CI.getType()); Type *IntrinsicType[] = { CI.getType() }; - Function *Overload = - Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(), - II->getIntrinsicID(), IntrinsicType); + Function *Overload = Intrinsic::getDeclaration( + CI.getModule(), II->getIntrinsicID(), IntrinsicType); Value *Args[] = { InnerTrunc }; return CallInst::Create(Overload, Args, II->getName()); @@ -1483,12 +1552,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false); } -/// OptimizeVectorResize - This input value (which is known to have vector type) -/// is being zero extended or truncated to the specified vector type. Try to -/// replace it with a shuffle (and vector/vector bitcast) if possible. +/// This input value (which is known to have vector type) is being zero extended +/// or truncated to the specified vector type. +/// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, +static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. @@ -1548,8 +1617,8 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { return Value / Ty->getPrimitiveSizeInBits(); } -/// CollectInsertionElements - V is a value which is inserted into a vector of -/// VecEltTy. Look through the value to see if we can decompose it into +/// V is a value which is inserted into a vector of VecEltTy. +/// Look through the value to see if we can decompose it into /// insertions into the vector. See the example in the comment for /// OptimizeIntegerToVectorInsertions for the pattern this handles. /// The type of V is always a non-zero multiple of VecEltTy's size. 
@@ -1558,7 +1627,7 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { /// /// This returns false if the pattern can't be matched or true if it can, /// filling in Elements with the elements found here. -static bool CollectInsertionElements(Value *V, unsigned Shift, +static bool collectInsertionElements(Value *V, unsigned Shift, SmallVectorImpl<Value *> &Elements, Type *VecEltTy, bool isBigEndian) { assert(isMultipleOfTypeSize(Shift, VecEltTy) && @@ -1595,7 +1664,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, // If the constant is the size of a vector element, we just need to bitcast // it to the right type so it gets properly inserted. if (NumElts == 1) - return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), + return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), Shift, Elements, VecEltTy, isBigEndian); // Okay, this is a constant that covers multiple elements. Slice it up into @@ -1611,7 +1680,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), ShiftI)); Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy, isBigEndian)) return false; } @@ -1625,19 +1694,19 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Or: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian) && - CollectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, + collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. @@ -1645,7 +1714,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, if (!CI) return false; Shift += CI->getZExtValue(); if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); } @@ -1653,8 +1722,8 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, } -/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we -/// may be doing shifts and ors to assemble the elements of the vector manually. +/// If the input is an 'or' instruction, we may be doing shifts and ors to +/// assemble the elements of the vector manually. /// Try to rip the code out and replace it with insertelements. This is to /// optimize code like this: /// @@ -1667,13 +1736,13 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, /// %tmp43 = bitcast i64 %ins35 to <2 x float> /// /// Into two insertelements that do "buildvector{%inc, %inc5}". 
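(Editorial aside, not part of the patch: the shl/or assembly that collectInsertionElements decomposes can be reproduced in plain C++. The sketch assumes a little-endian host, where lane 0 of the two-lane result is the low half of the assembled 64-bit value; the isBigEndian parameter exists because big-endian targets flip that index.)

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Elt0 = 0x11111111u, Elt1 = 0x22222222u;
  // The or/shl chain the pass pattern-matches: element 1 shifted into the
  // high half, element 0 in the low half.
  uint64_t Ins = ((uint64_t)Elt1 << 32) | Elt0;

  uint32_t Lanes[2];
  std::memcpy(Lanes, &Ins, sizeof(Lanes)); // stands in for the bitcast to <2 x i32>
  assert(Lanes[0] == Elt0 && Lanes[1] == Elt1); // buildvector{Elt0, Elt1}
  return 0;
}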
-static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, +static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); - if (!CollectInsertionElements(IntInput, 0, Elements, + if (!collectInsertionElements(IntInput, 0, Elements, DestVecTy->getElementType(), IC.getDataLayout().isBigEndian())) return nullptr; @@ -1692,63 +1761,29 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, return Result; } - -/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double -/// bitcast. The various long double bitcasts can't get in here. -static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI, InstCombiner &IC, +/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the +/// vector followed by extract element. The backend tends to handle bitcasts of +/// vectors better than bitcasts of scalars because vector registers are +/// usually not type-specific like scalar integer or scalar floating-point. +static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, + InstCombiner &IC, const DataLayout &DL) { - Value *Src = CI.getOperand(0); - Type *DestTy = CI.getType(); - - // If this is a bitcast from int to float, check to see if the int is an - // extraction from a vector. - Value *VecInput = nullptr; - // bitcast(trunc(bitcast(somevector))) - if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } - - unsigned Elt = 0; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } + // TODO: Create and use a pattern matcher for ExtractElementInst. + auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0)); + if (!ExtElt || !ExtElt->hasOneUse()) + return nullptr; - // bitcast(trunc(lshr(bitcast(somevector), cst)) - ConstantInt *ShAmt = nullptr; - if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), - m_ConstantInt(ShAmt)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 && - ShAmt->getZExtValue() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } + // The bitcast must be to a vectorizable type, otherwise we can't make a new + // type to extract from. 
+ Type *DestType = BitCast.getType(); + if (!VectorType::isValidElementType(DestType)) + return nullptr; - unsigned Elt = ShAmt->getZExtValue() / DestWidth; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } - return nullptr; + unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements(); + auto *NewVecType = VectorType::get(DestType, NumElts); + auto *NewBC = IC.Builder->CreateBitCast(ExtElt->getVectorOperand(), + NewVecType, "bc"); + return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand()); } Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { @@ -1794,11 +1829,6 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - // Try to optimize int -> float bitcasts. - if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) - if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this, DL)) - return I; - if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType()); @@ -1815,7 +1845,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast<CastInst>(Src); if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0), + if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) return I; } @@ -1823,7 +1853,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If the input is an 'or' instruction, we may be doing shifts and ors to // assemble the elements of the vector manually. Try to rip the code out // and replace it with insertelements. - if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this)) + if (Value *V = optimizeIntegerToVectorInsertions(CI, *this)) return ReplaceInstUsesWith(CI, V); } } @@ -1872,6 +1902,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } + if (Instruction *I = canonicalizeBitCastExtElt(CI, *this, DL)) + return I; + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 95bba3c7af7d..c0786afe965e 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -216,8 +216,6 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, Max = KnownOne|UnknownBits; } - - /// FoldCmpLoadFromIndexedGlobal - Called when we see this pattern: /// cmp pred (load (gep GV, ...)), cmpcst /// where GV is a global variable with a constant initializer. Try to simplify @@ -371,7 +369,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, } } - // If this element is in range, update our magic bitvector.
if (i < 64 && IsTrueForElt) MagicBitvector |= 1ULL << i; @@ -469,7 +466,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End); } - // If a magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 @@ -496,7 +492,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return nullptr; } - /// EvaluateGEPOffsetExpression - Return a value that can be used to compare /// the *offset* implied by a GEP to zero. For example, if we have &A[i], we /// want to return 'i' for "icmp ne i, 0". Note that, in general, indices can @@ -562,8 +557,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, } } - - // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. @@ -737,6 +730,83 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return nullptr; } +Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, + Value *Other) { + assert(ICI.isEquality() && "Cannot fold non-equality comparison."); + + // It would be tempting to fold away comparisons between allocas and any + // pointer not based on that alloca (e.g. an argument). However, even + // though such pointers cannot alias, they can still compare equal. + // + // But LLVM doesn't specify where allocas get their memory, so if the alloca + // doesn't escape we can argue that it's impossible to guess its value, and we + // can therefore act as if any such guesses are wrong. + // + // The code below checks that the alloca doesn't escape, and that it's only + // used in a comparison once (the current instruction). The + // single-comparison-use condition ensures that we're trivially folding all + // comparisons against the alloca consistently, and avoids the risk of + // erroneously folding a comparison of the pointer with itself. + + unsigned MaxIter = 32; // Break cycles and bound to constant-time. + + SmallVector<Use *, 32> Worklist; + for (Use &U : Alloca->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + + unsigned NumCmps = 0; + while (!Worklist.empty()) { + assert(Worklist.size() <= MaxIter); + Use *U = Worklist.pop_back_val(); + Value *V = U->getUser(); + --MaxIter; + + if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) || + isa<SelectInst>(V)) { + // Track the uses. + } else if (isa<LoadInst>(V)) { + // Loading from the pointer doesn't escape it. + continue; + } else if (auto *SI = dyn_cast<StoreInst>(V)) { + // Storing *to* the pointer is fine, but storing the pointer escapes it. + if (SI->getValueOperand() == U->get()) + return nullptr; + continue; + } else if (isa<ICmpInst>(V)) { + if (NumCmps++) + return nullptr; // Found more than one cmp. + continue; + } else if (auto *Intrin = dyn_cast<IntrinsicInst>(V)) { + switch (Intrin->getIntrinsicID()) { + // These intrinsics don't escape or compare the pointer. Memset is safe + // because we don't allow ptrtoint. Memcpy and memmove are safe because + // we don't allow stores, so src cannot point to V. 
+ case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: + continue; + default: + return nullptr; + } + } else { + return nullptr; + } + for (Use &U : V->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + } + + Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); + return ReplaceInstUsesWith( + ICI, + ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); +} + /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X". Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, @@ -851,7 +921,6 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, // to the same result value. HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false); } - } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0. if (CmpRHSV == 0) { // (X / pos) op 0 // Can't overflow. e.g. X/2 op 0 --> [-1, 2) @@ -996,7 +1065,6 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, return Res; } - // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; @@ -1074,18 +1142,22 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A, if (AP1 == AP2) return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); - // Get the distance between the highest bit that's set. int Shift; - // Both the constants are negative, take their positive to calculate log. if (IsAShr && AP1.isNegative()) - // Get the ones' complement of AP2 and AP1 when computing the distance. - Shift = (~AP2).logBase2() - (~AP1).logBase2(); + Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes(); else - Shift = AP2.logBase2() - AP1.logBase2(); + Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros(); if (Shift > 0) { - if (IsAShr ? AP1 == AP2.ashr(Shift) : AP1 == AP2.lshr(Shift)) + if (IsAShr && AP1 == AP2.ashr(Shift)) { + // There are multiple solutions if we are comparing against -1 and the LHS + // of the ashr is not a power of two. + if (AP1.isAllOnesValue() && !AP2.isPowerOf2()) + return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift)); + return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } else if (AP1 == AP2.lshr(Shift)) { return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } } // Shifting const2 will never be equal to const1. return getConstant(false); @@ -1145,6 +1217,14 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, switch (LHSI->getOpcode()) { case Instruction::Trunc: + if (RHS->isOne() && RHSV.getBitWidth() > 1) { + // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI->getOperand(0), m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } if (ICI.isEquality() && LHSI->hasOneUse()) { // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all // of the high bits truncated out of x are known. @@ -1447,9 +1527,35 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ICI.getPredicate() == ICmpInst::ICMP_EQ ? 
ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE, LHSI->getOperand(0), SubOne(RHS)); + + // (icmp eq (and %A, C), 0) -> (icmp sgt (trunc %A), -1) + // iff C is a power of 2 + if (ICI.isEquality() && LHSI->hasOneUse() && match(RHS, m_Zero())) { + if (auto *CI = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { + const APInt &AI = CI->getValue(); + int32_t ExactLogBase2 = AI.exactLogBase2(); + if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { + Type *NTy = IntegerType::get(ICI.getContext(), ExactLogBase2 + 1); + Value *Trunc = Builder->CreateTrunc(LHSI->getOperand(0), NTy); + return new ICmpInst(ICI.getPredicate() == ICmpInst::ICMP_EQ + ? ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + Trunc, Constant::getNullValue(NTy)); + } + } + } break; case Instruction::Or: { + if (RHS->isOne()) { + // icmp slt signum(V) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI, m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } + if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse()) break; Value *P, *Q; @@ -2083,11 +2189,9 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // If the pattern matches, truncate the inputs to the narrower type and // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. - Module *M = I.getParent()->getParent()->getParent(); - Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); - Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, - NewType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy *Builder = IC.Builder; @@ -2123,6 +2227,12 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, return true; }; + // If the overflow check was an add followed by a compare, the insertion point + // may be pointing to the compare. We want to insert the new instructions + // before the add in case there are uses of the add between the add and the + // compare. 
+ Builder->SetInsertPoint(&OrigI); + switch (OCF) { case OCF_INVALID: llvm_unreachable("bad overflow check kind!"); @@ -2223,7 +2333,9 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); - Instruction *MulInstr = cast<Instruction>(MulVal); + auto *MulInstr = dyn_cast<Instruction>(MulVal); + if (!MulInstr) + return nullptr; assert(MulInstr->getOpcode() == Instruction::Mul); auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)), @@ -2357,7 +2469,6 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, InstCombiner::BuilderTy *Builder = IC.Builder; Builder->SetInsertPoint(MulInstr); - Module *M = I.getParent()->getParent()->getParent(); // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) Value *MulA = A, *MulB = B; @@ -2365,8 +2476,8 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder->CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder->CreateZExt(B, MulType); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul"); IC.Worklist.Add(MulInstr); @@ -2468,7 +2579,6 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, default: return APInt::getAllOnesValue(BitWidth); } - } /// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst @@ -2905,7 +3015,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_NE: { @@ -2950,7 +3059,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_ULT: @@ -3103,7 +3211,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // comparison into the select arms, which will cause one to be // constant folded and the select turned into a bitwise or. Value *Op1 = nullptr, *Op2 = nullptr; - ConstantInt *CI = 0; + ConstantInt *CI = nullptr; if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); CI = dyn_cast<ConstantInt>(Op1); @@ -3177,6 +3285,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ICmpInst::getSwappedPredicate(I.getPredicate()), I)) return NI; + // Try to optimize equality comparisons against alloca-based pointers. + if (Op0->getType()->isPointerTy() && I.isEquality()) { + assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op1)) + return New; + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op0)) + return New; + } + // Test to see if the operands of the icmp are casted versions of other // values. If the ptr->ptr cast can be stripped off both arguments, we do so // now. 
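(Editorial aside, not part of the patch: a hypothetical C++ function showing the shape of code the FoldAllocaCmp calls above target. Local never escapes and feeds exactly one comparison, so the patch's reasoning allows the equality to fold to false, and a != form to true.)

static bool pointsAtLocal(int *P) {
  int Local = 0;      // becomes a non-escaping alloca
  return P == &Local; // single equality use: foldable to 'false'
}

int main(void) {
  int X = 0;
  return pointsAtLocal(&X) ? 1 : 0; // always 0 once the fold applies
}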
@@ -3304,6 +3423,26 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); + // icmp sgt X, (Y + -1) -> icmp sge X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); + + // icmp sle X, (Y + -1) -> icmp slt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); + + // icmp sge X, (Y + 1) -> icmp sgt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); + + // icmp slt X, (Y + 1) -> icmp sle X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); + // if C1 has greater magnitude than C2: // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y // s.t. C3 = C1 - C2 @@ -3473,6 +3612,18 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } } + + if (BO0) { + // Transform A & (L - 1) `ult` L --> L != 0 + auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); + auto BitwiseAnd = + m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); + + if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { + auto *Zero = Constant::getNullValue(BO0->getType()); + return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); + } + } } { Value *A, *B; @@ -3697,15 +3848,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); - // Check to see that the input is converted from an integer type that is small - // enough that preserves all bits. TODO: check here for "known" sign bits. - // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. - unsigned InputSize = IntTy->getScalarSizeInBits(); - - // If this is a uitofp instruction, we need an extra bit to hold the sign. bool LHSUnsigned = isa<UIToFPInst>(LHSI); - if (LHSUnsigned) - ++InputSize; if (I.isEquality()) { FCmpInst::Predicate P = I.getPredicate(); @@ -3732,13 +3875,30 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // equality compares as integer? } - // Comparisons with zero are a special case where we know we won't lose - // information. - bool IsCmpZero = RHS.isPosZero(); + // Check to see that the input is converted from an integer type that is small + // enough to preserve all bits. TODO: check here for "known" sign bits. + // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. + unsigned InputSize = IntTy->getScalarSizeInBits(); - // If the conversion would lose info, don't hack on this. - if ((int)InputSize > MantissaWidth && !IsCmpZero) - return nullptr; + // The following test does NOT adjust InputSize downwards for signed inputs, + // because the most negative value still requires all the mantissa bits + // to distinguish it from one less than that value. + if ((int)InputSize > MantissaWidth) { + // Conversion would lose accuracy. Check if loss can impact comparison. + int Exp = ilogb(RHS); + if (Exp == APFloat::IEK_Inf) { + int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics())); + if (MaxExponent < (int)InputSize - !LHSUnsigned) + // Conversion could create infinity. + return nullptr; + } else { + // Note that if RHS is zero or NaN, then Exp is negative + // and the first condition is trivially false.
+ if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned) + // Conversion could affect comparison. + return nullptr; + } + } // Otherwise, we can potentially simplify the comparison. We know that it // will always come through as an integer value and we know the constant is diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ac934f1bd85c..534f67008150 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -281,6 +281,7 @@ public: ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); + Instruction *FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, Value *Other); Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I); Instruction *commonCastTransforms(CastInst &CI); @@ -341,6 +342,7 @@ public: const unsigned SIOpd); private: + bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; @@ -360,6 +362,11 @@ private: /// \brief Try to optimize a sequence of instructions checking if an operation /// on LHS and RHS overflows. /// + /// If this overflow check is done via one of the overflow check intrinsics, + /// then CtxI has to be the call instruction calling that intrinsic. If this + /// overflow check is done by arithmetic followed by a compare, then CtxI has + /// to be the arithmetic instruction. + /// /// If a simplification is possible, stores the simplified result of the /// operation in OperationResult and result of the overflow check in /// OverflowResult, and return true. 
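(Editorial aside, not part of the patch: two of the icmp folds introduced above can be checked exhaustively at a narrow width. The sketch verifies icmp sgt X, (Y + -1) -> icmp sge X, Y, skipping the one Y whose decrement would wrap, which mirrors the NoOp1WrapProblem guard, and A & (L - 1) `ult` L -> L != 0, which needs no side condition.)

#include <cassert>

int main() {
  // Signed fold: X > Y - 1 is the same as X >= Y whenever Y - 1 cannot wrap.
  for (int X = -128; X <= 127; ++X)
    for (int Y = -127; Y <= 127; ++Y) // Y == -128 excluded: Y + -1 would wrap
      assert((X > Y - 1) == (X >= Y));

  // Unsigned fold: masking with L - 1 keeps the value below L unless L == 0,
  // in which case L - 1 is all-ones and both sides of the equivalence are false.
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned L = 0; L < 256; ++L)
      assert(((A & (L - 1)) < L) == (L != 0));
  return 0;
}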
If no simplification is possible, @@ -393,7 +400,7 @@ public: assert(New && !New->getParent() && "New instruction already inserted into a basic block!"); BasicBlock *BB = Old.getParent(); - BB->getInstList().insert(&Old, New); // Insert inst + BB->getInstList().insert(Old.getIterator(), New); // Insert inst Worklist.Add(New); return New; } @@ -539,6 +546,7 @@ private: Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); + Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN); Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); @@ -548,7 +556,7 @@ private: Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned, bool Inside); Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); - Instruction *MatchBSwap(BinaryOperator &I); + Instruction *MatchBSwapOrBitReverse(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e3179dbeece8..47406b9a1632 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" @@ -90,21 +91,23 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (CS.isCallee(&U)) continue; + unsigned DataOpNo = CS.getDataOperandNo(&U); + bool IsArgOperand = CS.isArgOperand(&U); + // Inalloca arguments are clobbered by the call. - unsigned ArgNo = CS.getArgumentNo(&U); - if (CS.isInAllocaArgument(ArgNo)) + if (IsArgOperand && CS.isInAllocaArgument(DataOpNo)) return false; // If this is a readonly/readnone call site, then we know it is just a // load (but one that potentially returns the value itself), so we can // ignore it if we know that the value isn't captured. if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) + (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo))) continue; // If this is being passed as a byval argument, the caller is making a // copy, so it is only a read of the alloca. - if (CS.isByValArgument(ArgNo)) + if (IsArgOperand && CS.isByValArgument(DataOpNo)) continue; } @@ -186,7 +189,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { // Scan to the end of the allocation instructions, to skip over a block of // allocas if possible...also skip interleaved debug info // - BasicBlock::iterator It = New; + BasicBlock::iterator It(New); while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It; @@ -367,7 +370,13 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT MDB.createRange(NonNullInt, NullInt)); } break; - + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These only directly apply if the new type is also a pointer. 
+ if (NewTy->isPointerTy()) + NewLoad->setMetadata(ID, N); + break; case LLVMContext::MD_range: // FIXME: It would be nice to propagate this in some way, but the type // conversions make it hard. If the new type is a pointer, we could @@ -418,6 +427,9 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value case LLVMContext::MD_invariant_load: case LLVMContext::MD_nonnull: case LLVMContext::MD_range: + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: // These don't apply for stores. break; } @@ -511,16 +523,46 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { if (!T->isAggregateType()) return nullptr; - assert(LI.getAlignment() && "Alignement must be set at this point"); + assert(LI.getAlignment() && "Alignment must be set at this point"); if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only has one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), ".unpack"); return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, LI.getName())); } + + // We don't want to break loads with padding here as we'd lose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return nullptr; + + auto Name = LI.getName(); + SmallString<16> LoadName = Name; + LoadName += ".unpack"; + SmallString<16> EltName = Name; + EltName += ".elt"; + auto *Addr = LI.getPointerOperand(); + Value *V = UndefValue::get(T); + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName); + auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName); + V = IC.Builder->CreateInsertValue(V, L, i); + } + + V->setName(Name); + return IC.ReplaceInstUsesWith(LI, V); } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -681,7 +723,7 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, // FIXME: If the GEP is not inbounds, and there are extra indices after the // one we'll replace, those could cause the address computation to wrap // (rendering the IsAllNonNegative() check below insufficient). We can do - // better, ignoring zero indicies (and other indicies we can prove small + // better, ignoring zero indices (and other indices we can prove small // enough not to wrap). if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds()) return false; @@ -748,19 +790,19 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations.
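(Editorial aside, not part of the patch: the per-field expansion that the new unpackLoadToAggregate path performs for a padding-free struct, rendered as source-level C++. The Pair type and helper are illustrative only.)

#include <cassert>

struct Pair { int A; int B; }; // no padding, so SL->hasPadding() is false

// One aggregate load becomes a GEP {0,i} + scalar load + insertvalue per field.
static Pair loadUnpacked(const Pair *Addr) {
  Pair V;        // counterpart of UndefValue::get(T)
  V.A = Addr->A; // GEP {0,0}, load, insertvalue 0
  V.B = Addr->B; // GEP {0,1}, load, insertvalue 1
  return V;
}

int main() {
  Pair P{1, 2};
  Pair Q = loadUnpacked(&P);
  assert(Q.A == 1 && Q.B == 2);
  return 0;
}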
- BasicBlock::iterator BBI = &LI; + BasicBlock::iterator BBI(LI); AAMDNodes AATags; - if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI, - 6, AA, &AATags)) { + if (Value *AvailableVal = + FindAvailableLoadedValue(Op, LI.getParent(), BBI, + DefMaxInstsToScan, AA, &AATags)) { if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) { unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull, + LLVMContext::MD_invariant_group, LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null}; combineMetadata(NLI, &LI, KnownIDs); }; @@ -822,7 +864,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } // load (select (cond, null, P)) -> load P - if (isa<ConstantPointerNull>(SI->getOperand(1)) && + if (isa<ConstantPointerNull>(SI->getOperand(1)) && LI.getPointerAddressSpace() == 0) { LI.setOperand(0, SI->getOperand(2)); return &LI; @@ -857,7 +899,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { /// /// \returns true if the store was successfully combined away. This indicates /// the caller must erase the store instruction. We have to let the caller erase -/// the store instruction sas otherwise there is no way to signal whether it was +/// the store instruction as otherwise there is no way to signal whether it was /// combined or not: IC.EraseInstFromFunction returns a null pointer. static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and atomic @@ -893,11 +935,38 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only has one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { V = IC.Builder->CreateExtractValue(V, 0); combineStoreToNewValue(IC, SI, V); return true; } + + // We don't want to break stores with padding here as we'd lose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return false; + + SmallString<16> EltName = V->getName(); + EltName += ".elt"; + auto *Addr = SI.getPointerOperand(); + SmallString<16> AddrName = Addr->getName(); + AddrName += ".repack"; + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), AddrName); + auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); + IC.Builder->CreateStore(Val, Ptr); + } + + return true; } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -971,9 +1040,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return &SI; } - // Don't hack volatile/atomic stores. - // FIXME: Some bits are legal for atomic stores; needs refactoring. - if (!SI.isSimple()) return nullptr; + // Don't hack volatile/ordered stores. + // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
+ if (!SI.isUnordered()) return nullptr; // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. @@ -991,7 +1060,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // Do really simple DSE, to catch cases where there are several consecutive // stores to the same location, separated by a few arithmetic operations. This // situation often occurs with bitfield accesses. - BasicBlock::iterator BBI = &SI; + BasicBlock::iterator BBI(SI); for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts; --ScanInsts) { --BBI; @@ -1005,7 +1074,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? - if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), + if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1))) { ++NumDeadStore; ++BBI; @@ -1019,9 +1088,10 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // the pointer we're loading and is producing the pointer we're storing, // then *this* store is dead (X = load P; store X -> P). if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { - if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && - LI->isSimple()) + if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) { + assert(SI.isUnordered() && "can't eliminate ordering operation"); return EraseInstFromFunction(SI); + } // Otherwise, this is a load from some other location. Stores before it // may not be dead. @@ -1047,10 +1117,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (isa<UndefValue>(Val)) return EraseInstFromFunction(SI); + // The code below needs to be audited and adjusted for unordered atomics + if (!SI.isSimple()) + return nullptr; + // If this store is the last instruction in the basic block (possibly // excepting debug info instructions), and if the block ends with an // unconditional branch, try to move it to the successor block. - BBI = &SI; + BBI = SI.getIterator(); do { ++BBI; } while (isa<DbgInfoIntrinsic>(BBI) || @@ -1106,7 +1180,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; // Verify that the other block ends in a branch and is not otherwise empty. - BasicBlock::iterator BBI = OtherBB->getTerminator(); + BasicBlock::iterator BBI(OtherBB->getTerminator()); BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); if (!OtherBr || BBI == OtherBB->begin()) return false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a554e9f628e0..7ad0efc42fb4 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -22,9 +22,9 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// simplifyValueKnownNonZero - The specific integer value is used in a context -/// where it is known to be non-zero. If this allows us to simplify the -/// computation, do so and return the new operand, otherwise return null. +/// The specific integer value is used in a context where it is known to be +/// non-zero. If this allows us to simplify the computation, do so and return +/// the new operand, otherwise return null. 
static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, Instruction &CxtI) { // If V has multiple uses, then we would have to do more analysis to determine @@ -76,8 +76,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, } -/// MultiplyOverflows - True if the multiply can not be expressed in an int -/// this size. +/// True if the multiply can not be expressed in an int this size. static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product, bool IsSigned) { bool Overflow; @@ -95,6 +94,14 @@ static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient, assert(C1.getBitWidth() == C2.getBitWidth() && "Inconsistent width of constants!"); + // Bail if we will divide by zero. + if (C2.isMinValue()) + return false; + + // Bail if we would divide INT_MIN by -1. + if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue()) + return false; + APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned); if (IsSigned) APInt::sdivrem(C1, C2, Quotient, Remainder); @@ -705,8 +712,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return Changed ? &I : nullptr; } -/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select -/// instruction. +/// Try to fold a divide or remainder of a select instruction. bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { SelectInst *SI = cast<SelectInst>(I.getOperand(1)); @@ -740,7 +746,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { return true; // Scan the current block backward, looking for other uses of SI. - BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin(); + BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin(); while (BBI != BBFront) { --BBI; @@ -754,10 +760,10 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { I != E; ++I) { if (*I == SI) { *I = SI->getOperand(NonNullOperand); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } else if (*I == SelectCond) { *I = Builder->getInt1(NonNullOperand == 1); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 460f6eb6a825..f1aa98b5e359 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "instcombine" @@ -245,7 +246,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { /// non-address-taken alloca. Doing so will cause us to not promote the alloca /// to a register. 
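(Editorial aside, not part of the patch: the two bail-outs added to IsMultiple above, restated as a hypothetical checked signed-division helper. Both rejected input pairs are exactly the ones for which the division itself has no defined result.)

#include <cstdint>
#include <optional>

static std::optional<int32_t> checkedSDiv(int32_t C1, int32_t C2) {
  if (C2 == 0)
    return std::nullopt; // division by zero
  if (C1 == INT32_MIN && C2 == -1)
    return std::nullopt; // quotient +2^31 is not representable in i32
  return C1 / C2;
}

int main() {
  return (!checkedSDiv(5, 0) && !checkedSDiv(INT32_MIN, -1) &&
          *checkedSDiv(6, 3) == 2)
             ? 0
             : 1;
}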
static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { - BasicBlock::iterator BBI = L, E = L->getParent()->end(); + BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end(); for (++BBI; BBI != E; ++BBI) if (BBI->mayWriteToMemory()) @@ -349,24 +350,40 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { Value *InVal = FirstLI->getOperand(0); NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); + LoadInst *NewLI = new LoadInst(NewPN, "", isVolatile, LoadAlignment); + + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null, + }; - // Add all operands to the new PHI. + for (unsigned ID : KnownIDs) + NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + + // Add all operands to the new PHI and combine TBAA metadata. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { - Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0); + LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i)); + combineMetadata(NewLI, LI, KnownIDs); + Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } - Value *PhiVal; if (InVal) { // The new PHI unions all of the same values together. This is really // common, so we handle it intelligently here for compile-time speed. - PhiVal = InVal; + NewLI->setOperand(0, InVal); delete NewPN; } else { InsertNewInstBefore(NewPN, PN); - PhiVal = NewPN; } // If this was a volatile load that we are merging, make sure to loop through @@ -376,17 +393,94 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { for (Value *IncValue : PN.incoming_values()) cast<LoadInst>(IncValue)->setVolatile(false); - LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment); NewLI->setDebugLoc(FirstLI->getDebugLoc()); return NewLI; } +/// TODO: This function could handle other cast types, but then it might +/// require special-casing a cast from the 'i1' type. See the comment in +/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. +Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = Phi.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + + // Early exit for the common case of a phi with two operands. These are + // handled elsewhere. See the comment below where we check the count of zexts + // and constants for more details. + unsigned NumIncomingValues = Phi.getNumIncomingValues(); + if (NumIncomingValues < 3) + return nullptr; + // Find the narrower type specified by the first zext. + Type *NarrowType = nullptr; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + NarrowType = Zext->getSrcTy(); + break; + } + } + if (!NarrowType) + return nullptr; + + // Walk the phi operands checking that we only have zexts or constants that + // we can shrink for free. Store the new operands for the new phi. + SmallVector<Value *, 4> NewIncoming; + unsigned NumZexts = 0; + unsigned NumConsts = 0; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + // All zexts must be identical and have one use. 
+ if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse()) + return nullptr; + NewIncoming.push_back(Zext->getOperand(0)); + NumZexts++; + } else if (auto *C = dyn_cast<Constant>(V)) { + // Make sure that constants can fit in the new type. + Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType); + if (ConstantExpr::getZExt(Trunc, C->getType()) != C) + return nullptr; + NewIncoming.push_back(Trunc); + NumConsts++; + } else { + // If it's not a cast or a constant, bail out. + return nullptr; + } + } + + // The more common cases of a phi with no constant operands or just one + // variable operand are handled by FoldPHIArgOpIntoPHI() and FoldOpIntoPhi() + // respectively. FoldOpIntoPhi() wants to do the opposite transform that is + // performed here. It tries to replicate a cast in the phi operand's basic + // block to expose other folding opportunities. Thus, InstCombine will + // infinite loop without this check. + if (NumConsts == 0 || NumZexts < 2) + return nullptr; + + // All incoming values are zexts or constants that are safe to truncate. + // Create a new phi node of the narrow type, phi together all of the new + // operands, and zext the result back to the original type. + PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues, + Phi.getName() + ".shrunk"); + for (unsigned i = 0; i != NumIncomingValues; ++i) + NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i)); + + InsertNewInstBefore(NewPhi, Phi); + return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); +} /// If all operands to a PHI node are the same "unary" operator and they all are /// only used by the PHI, PHI together their inputs, and do the operation once, /// to the result of the PHI. Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = PN.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); if (isa<GetElementPtrInst>(FirstInst)) @@ -740,7 +834,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { } // Otherwise, do an extract in the predecessor. - Builder->SetInsertPoint(Pred, Pred->getTerminator()); + Builder->SetInsertPoint(Pred->getTerminator()); Value *Res = InVal; if (Offset) Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(), @@ -787,6 +881,9 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC)) return ReplaceInstUsesWith(PN, V); + if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN)) + return Result; + // If all PHI operands are the same operation, pull them through the PHI, // reducing code size. 
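(Editorial aside, not part of the patch: the constant-screening rule FoldPHIArgZextsIntoPHI applies above, in plain C++ for an i32 phi being shrunk to i8. A constant may join the narrow phi only when trunc-then-zext reproduces it.)

#include <cassert>
#include <cstdint>

static bool shrinksLosslessly(uint32_t C) {
  uint8_t Trunc = static_cast<uint8_t>(C);  // ConstantExpr::getTrunc
  return static_cast<uint32_t>(Trunc) == C; // ConstantExpr::getZExt round-trip
}

int main() {
  assert(shrinksLosslessly(200));  // fits in the low 8 bits
  assert(!shrinksLosslessly(256)); // truncation would change the value
  return 0;
}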
if (isa<Instruction>(PN.getIncomingValue(0)) && diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f51442a9f36d..776704d1efa9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -38,7 +38,8 @@ getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) { } } -static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { +static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF, + bool Ordered=false) { switch (SPF) { default: llvm_unreachable("unhandled!"); @@ -51,17 +52,22 @@ static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { return ICmpInst::ICMP_SGT; case SPF_UMAX: return ICmpInst::ICMP_UGT; + case SPF_FMINNUM: + return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; + case SPF_FMAXNUM: + return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; } } static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder, SelectPatternFlavor SPF, Value *A, Value *B) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF); + assert(CmpInst::isIntPredicate(Pred)); return Builder->CreateSelect(Builder->CreateICmp(Pred, A, B), A, B); } -/// GetSelectFoldableOperands - We want to turn code that looks like this: +/// We want to turn code that looks like this: /// %C = or %A, %B /// %D = select %cond, %C, %A /// into: @@ -90,8 +96,8 @@ static unsigned GetSelectFoldableOperands(Instruction *I) { } } -/// GetSelectFoldableConstant - For the same transformation as the previous -/// function, return the identity constant that goes into the select. +/// For the same transformation as the previous function, return the identity +/// constant that goes into the select. static Constant *GetSelectFoldableConstant(Instruction *I) { switch (I->getOpcode()) { default: llvm_unreachable("This cannot happen!"); @@ -110,7 +116,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) { } } -/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI +/// Here we have (select c, TI, FI), and we know that TI and FI /// have the same opcode and only one use each. Try to simplify this. Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI) { @@ -197,8 +203,8 @@ static bool isSelect01(Constant *C1, Constant *C2) { C2I->isOne() || C2I->isAllOnesValue(); } -/// FoldSelectIntoOp - Try fold the select into one of the operands to -/// facilitate further optimization. +/// Try to fold the select into one of the operands to allow further +/// optimization. Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, Value *FalseVal) { // See the comment above GetSelectFoldableOperands for a description of the @@ -276,7 +282,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, return nullptr; } -/// foldSelectICmpAndOr - We want to turn: +/// We want to turn: /// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) /// into: /// (or (shl (and X, C1), C3), y) @@ -394,9 +400,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, return nullptr; } -/// visitSelectInstWithICmp - Visit a SelectInst that has an -/// ICmpInst as its first operand. -/// +/// Visit a SelectInst that has an ICmpInst as its first operand. 
Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { bool Changed = false; @@ -595,10 +599,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } -/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a -/// PHI node (but the two may be in different blocks). See if the true/false -/// values (V) are live in all of the predecessor blocks of the PHI. For -/// example, cases like this cannot be mapped: +/// SI is a select whose condition is a PHI node (but the two may be in +/// different blocks). See if the true/false values (V) are live in all of the +/// predecessor blocks of the PHI. For example, cases like this can't be mapped: /// /// X = phi [ C1, BB1], [C2, BB2] /// Y = add @@ -632,7 +635,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, return false; } -/// FoldSPFofSPF - We have an SPF (e.g. a min or max) of an SPF of the form: +/// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, @@ -745,10 +748,10 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, return nullptr; } -/// foldSelectICmpAnd - If one of the constants is zero (we know they can't -/// both be) and we have an icmp instruction with zero, and we have an 'and' -/// with the non-constant value and a power of two we can turn the select -/// into a shift on the result of the 'and'. +/// If one of the constants is zero (we know they can't both be) and we have an +/// icmp instruction with zero, and we have an 'and' with the non-constant value +/// and a power of two we can turn the select into a shift on the result of the +/// 'and'. static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, ConstantInt *FalseVal, InstCombiner::BuilderTy *Builder) { @@ -926,6 +929,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->SetFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal, FCI->getName() + ".inv"); @@ -967,6 +972,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->SetFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal, FCI->getName() + ".inv"); @@ -1054,35 +1061,50 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } // See if we can fold the select into one of our operands. - if (SI.getType()->isIntOrIntVectorTy()) { + if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) { if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal)) return FoldI; Value *LHS, *RHS, *LHS2, *RHS2; Instruction::CastOps CastOp; - SelectPatternFlavor SPF = matchSelectPattern(&SI, LHS, RHS, &CastOp); + SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); + auto SPF = SPR.Flavor; - if (SPF) { + if (SelectPatternResult::isMinOrMax(SPF)) { // Canonicalize so that type casts are outside select patterns. 
if (LHS->getType()->getPrimitiveSizeInBits() != SI.getType()->getPrimitiveSizeInBits()) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); - Value *Cmp = Builder->CreateICmp(Pred, LHS, RHS); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered); + + Value *Cmp; + if (CmpInst::isIntPredicate(Pred)) { + Cmp = Builder->CreateICmp(Pred, LHS, RHS); + } else { + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); + Builder->SetFastMathFlags(FMF); + Cmp = Builder->CreateFCmp(Pred, LHS, RHS); + } + Value *NewSI = Builder->CreateCast(CastOp, Builder->CreateSelect(Cmp, LHS, RHS), SI.getType()); return ReplaceInstUsesWith(SI, NewSI); } + } + if (SPF) { // MAX(MAX(a, b), a) -> MAX(a, b) // MIN(MIN(a, b), a) -> MIN(a, b) // MAX(MIN(a, b), a) -> a // MIN(MAX(a, b), a) -> a - if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2)) + // ABS(ABS(a)) -> ABS(a) + // NABS(NABS(a)) -> NABS(a) + if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) return R; - if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2)) + if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2, SI, SPF, LHS)) return R; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index d04ed58b014f..0c7defa5fff8 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -55,7 +55,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return nullptr; } -/// CanEvaluateShifted - See if we can compute the specified value, but shifted +/// See if we can compute the specified value, but shifted /// logically to the left or right by some number of bits. This should return /// true if the expression can be computed for the same cost as the current /// expression tree. This is used to eliminate extraneous shifting from things @@ -184,7 +184,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, } } -/// GetShiftedValue - When CanEvaluateShifted returned true for an expression, +/// When CanEvaluateShifted returned true for an expression, /// this value inserts the new computation that produces the shifted value. static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, InstCombiner &IC, const DataLayout &DL) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 80628b23f111..743d51483ea1 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -410,9 +410,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If this is a select as part of a min/max pattern, don't simplify any // further in case we break the structure. 
     Value *LHS, *RHS;
-    if (matchSelectPattern(I, LHS, RHS) != SPF_UNKNOWN)
+    if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
       return nullptr;
-
+
     if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,
                              RHSKnownOne, Depth + 1) ||
         SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero,
@@ -1057,7 +1057,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts);
     if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) {
       for (unsigned i = 0; i < VWidth; i++) {
-        if (CV->getAggregateElement(i)->isNullValue())
+        Constant *CElt = CV->getAggregateElement(i);
+        // Method isNullValue always returns false when called on a
+        // ConstantExpr. If CElt is a ConstantExpr then skip it in order to
+        // avoid propagating incorrect information.
+        if (isa<ConstantExpr>(CElt))
+          continue;
+        if (CElt->isNullValue())
           LeftDemanded.clearBit(i);
         else
           RightDemanded.clearBit(i);
@@ -1082,6 +1088,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     if (!VTy) break;
     unsigned InVWidth = VTy->getNumElements();
     APInt InputDemandedElts(InVWidth, 0);
+    UndefElts2 = APInt(InVWidth, 0);
     unsigned Ratio;

     if (VWidth == InVWidth) {
@@ -1089,29 +1096,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
       // elements as are demanded of us.
       Ratio = 1;
       InputDemandedElts = DemandedElts;
-    } else if (VWidth > InVWidth) {
-      // Untested so far.
-      break;
-
-      // If there are more elements in the result than there are in the source,
-      // then an input element is live if any of the corresponding output
-      // elements are live.
-      Ratio = VWidth/InVWidth;
-      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+    } else if ((VWidth % InVWidth) == 0) {
+      // If the number of elements in the output is a multiple of the number of
+      // elements in the input then an input element is live if any of the
+      // corresponding output elements are live.
+      Ratio = VWidth / InVWidth;
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
         if (DemandedElts[OutIdx])
-          InputDemandedElts.setBit(OutIdx/Ratio);
-      }
-    } else {
-      // Untested so far.
-      break;
-
-      // If there are more elements in the source than there are in the result,
-      // then an input element is live if the corresponding output element is
-      // live.
-      Ratio = InVWidth/VWidth;
+          InputDemandedElts.setBit(OutIdx / Ratio);
+    } else if ((InVWidth % VWidth) == 0) {
+      // If the number of elements in the input is a multiple of the number of
+      // elements in the output then an input element is live if the
+      // corresponding output element is live.
+      Ratio = InVWidth / VWidth;
       for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
-        if (DemandedElts[InIdx/Ratio])
+        if (DemandedElts[InIdx / Ratio])
           InputDemandedElts.setBit(InIdx);
+    } else {
+      // Unsupported so far.
+      break;
     }

     // div/rem demand all inputs, because they don't want divide by zero.
@@ -1122,24 +1125,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
       MadeChange = true;
     }

-    UndefElts = UndefElts2;
-    if (VWidth > InVWidth) {
-      llvm_unreachable("Unimp");
-      // If there are more elements in the result than there are in the source,
-      // then an output element is undef if the corresponding input element is
-      // undef.
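// Illustrative sketch, not from the patch itself: the demanded-elements
// mapping above, restated over a plain uint64_t bitmask instead of APInt
// (assumes at most 64 lanes). The function name is hypothetical.
#include <cstdint>

static uint64_t demandedInputElts(uint64_t DemandedOut, unsigned VWidth,
                                  unsigned InVWidth) {
  uint64_t DemandedIn = 0;
  if (VWidth % InVWidth == 0) {
    // e.g. bitcast <2 x i64> to <4 x i32>: input element i feeds output
    // elements [i*Ratio, i*Ratio + Ratio), so it is live if any of them is.
    unsigned Ratio = VWidth / InVWidth;
    for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
      if (DemandedOut & (1ULL << OutIdx))
        DemandedIn |= 1ULL << (OutIdx / Ratio);
  } else if (InVWidth % VWidth == 0) {
    // e.g. bitcast <4 x i32> to <2 x i64>: output element InIdx/Ratio is
    // assembled from input element InIdx, so liveness propagates downward.
    unsigned Ratio = InVWidth / VWidth;
    for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
      if (DemandedOut & (1ULL << (InIdx / Ratio)))
        DemandedIn |= 1ULL << InIdx;
  }
  // If neither width divides the other, the transform above bails out.
  return DemandedIn;
}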
+ if (VWidth == InVWidth) { + UndefElts = UndefElts2; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an output element is undef if the + // corresponding input element is undef. for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) - if (UndefElts2[OutIdx/Ratio]) + if (UndefElts2[OutIdx / Ratio]) + UndefElts.setBit(OutIdx); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an output element is undef if all of the + // corresponding input elements are undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); + if (SubUndef.countPopulation() == Ratio) UndefElts.setBit(OutIdx); - } else if (VWidth < InVWidth) { + } + } else { llvm_unreachable("Unimp"); - // If there are more elements in the source than there are in the result, - // then a result element is undef if all of the corresponding input - // elements are undef. - UndefElts = ~0ULL >> (64-VWidth); // Start out all undef. - for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (!UndefElts2[InIdx]) // Not undef? - UndefElts.clearBit(InIdx/Ratio); // Clear undef bit. } break; } @@ -1237,6 +1242,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // like undef&0. The result is known zero, not undef. UndefElts &= UndefElts2; break; + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. + case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2); + break; } break; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 273047279e90..e25639ae943b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -22,10 +22,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// CheapToScalarize - Return true if the value is cheaper to scalarize than it -/// is to leave as a vector operation. isConstant indicates whether we're -/// extracting one known element. If false we're extracting a variable index. -static bool CheapToScalarize(Value *V, bool isConstant) { +/// Return true if the value is cheaper to scalarize than it is to leave as a +/// vector operation. isConstant indicates whether we're extracting one known +/// element. If false we're extracting a variable index. 
+static bool cheapToScalarize(Value *V, bool isConstant) { if (Constant *C = dyn_cast<Constant>(V)) { if (isConstant) return true; @@ -50,13 +50,13 @@ static bool CheapToScalarize(Value *V, bool isConstant) { return true; if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) if (BO->hasOneUse() && - (CheapToScalarize(BO->getOperand(0), isConstant) || - CheapToScalarize(BO->getOperand(1), isConstant))) + (cheapToScalarize(BO->getOperand(0), isConstant) || + cheapToScalarize(BO->getOperand(1), isConstant))) return true; if (CmpInst *CI = dyn_cast<CmpInst>(I)) if (CI->hasOneUse() && - (CheapToScalarize(CI->getOperand(0), isConstant) || - CheapToScalarize(CI->getOperand(1), isConstant))) + (cheapToScalarize(CI->getOperand(0), isConstant) || + cheapToScalarize(CI->getOperand(1), isConstant))) return true; return false; @@ -82,7 +82,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || - !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) + !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true)) return nullptr; // Create a scalar PHI node that will replace the vector PHI node @@ -115,8 +115,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { Instruction *pos = dyn_cast<Instruction>(PHIInVal); BasicBlock::iterator InsertPos; if (pos && !isa<PHINode>(pos)) { - InsertPos = pos; - ++InsertPos; + InsertPos = ++pos->getIterator(); } else { InsertPos = inBB->getFirstInsertionPt(); } @@ -137,7 +136,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If vector val is constant with all elements the same, replace EI with // that element. We handle a known element # below. if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) - if (CheapToScalarize(C, false)) + if (cheapToScalarize(C, false)) return ReplaceInstUsesWith(EI, C->getAggregateElement(0U)); // If extracting a specified index from the vector, see if we can recursively @@ -163,7 +162,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } } - // If the this extractelement is directly using a bitcast from a vector of + // If this extractelement is directly using a bitcast from a vector of // the same number of elements, see if we can find the source element from // it. In this case, we will end up needing to bitcast the scalars. if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { @@ -184,10 +183,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) { // Push extractelement into predecessor operation if legal and - // profitable to do so + // profitable to do so. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { if (I->hasOneUse() && - CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { + cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { Value *newEI0 = Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1), EI.getName()+".lhs"); @@ -230,8 +229,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { SrcIdx, false)); } } else if (CastInst *CI = dyn_cast<CastInst>(I)) { - // Canonicalize extractelement(cast) -> cast(extractelement) - // bitcasts can change the number of vector elements and they cost nothing + // Canonicalize extractelement(cast) -> cast(extractelement). 
+ // Bitcasts can change the number of vector elements, and they cost + // nothing. if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { Value *EE = Builder->CreateExtractElement(CI->getOperand(0), EI.getIndexOperand()); @@ -245,7 +245,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // fight the vectorizer. // If we are extracting an element from a vector select or a select on - // vectors, a select on the scalars extracted from the vector arguments. + // vectors, create a select on the scalars extracted from the vector + // arguments. Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); @@ -275,10 +276,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { return nullptr; } -/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns -/// elements from either LHS or RHS, return the shuffle mask and true. -/// Otherwise, return false. -static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, +/// If V is a shuffle of values that ONLY returns elements from either LHS or +/// RHS, return the shuffle mask and true. Otherwise, return false. +static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, SmallVectorImpl<Constant*> &Mask) { assert(LHS->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); @@ -315,7 +315,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector. // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted undef. Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext())); return true; @@ -330,7 +330,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted value. if (EI->getOperand(0) == LHS) { Mask[InsertedIdx % NumElts] = @@ -352,6 +352,48 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, return false; } +/// If we have insertion into a vector that is wider than the vector that we +/// are extracting from, try to widen the source vector to allow a single +/// shufflevector to replace one or more insert/extract pairs. +static void replaceExtractElements(InsertElementInst *InsElt, + ExtractElementInst *ExtElt, + InstCombiner &IC) { + VectorType *InsVecType = InsElt->getType(); + VectorType *ExtVecType = ExtElt->getVectorOperandType(); + unsigned NumInsElts = InsVecType->getVectorNumElements(); + unsigned NumExtElts = ExtVecType->getVectorNumElements(); + + // The inserted-to vector must be wider than the extracted-from vector. + if (InsVecType->getElementType() != ExtVecType->getElementType() || + NumExtElts >= NumInsElts) + return; + + // Create a shuffle mask to widen the extended-from vector using undefined + // values. The mask selects all of the values of the original vector followed + // by as many undefined values as needed to create a vector of the same length + // as the inserted-to vector. 
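// Illustrative sketch, not from the patch itself: the mask the comment above
// describes is an identity selection of the narrow vector's lanes, padded out
// with "don't care" lanes. -1 stands in for an undef lane in this sketch, and
// the function name is hypothetical.
#include <vector>

static std::vector<int> makeExtendMask(unsigned NumExtElts,
                                       unsigned NumInsElts) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumExtElts; ++i)
    Mask.push_back(static_cast<int>(i)); // keep the original lanes in place
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    Mask.push_back(-1);                  // trailing lanes are undefined
  return Mask;
}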
+ SmallVector<Constant *, 16> ExtendMask; + IntegerType *IntType = Type::getInt32Ty(InsElt->getContext()); + for (unsigned i = 0; i < NumExtElts; ++i) + ExtendMask.push_back(ConstantInt::get(IntType, i)); + for (unsigned i = NumExtElts; i < NumInsElts; ++i) + ExtendMask.push_back(UndefValue::get(IntType)); + + Value *ExtVecOp = ExtElt->getVectorOperand(); + auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), + ConstantVector::get(ExtendMask)); + + // Replace all extracts from the original narrow vector with extracts from + // the new wide vector. + WideVec->insertBefore(ExtElt); + for (User *U : ExtVecOp->users()) { + if (ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U)) { + auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1)); + NewExt->insertAfter(WideVec); + IC.ReplaceInstUsesWith(*OldExt, NewExt); + } + } +} /// We are building a shuffle to create V, which is a sequence of insertelement, /// extractelement pairs. If PermittedRHS is set, then we must either use it or @@ -363,9 +405,10 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, /// often been chosen carefully to be efficiently implementable on the target. typedef std::pair<Value *, Value *> ShuffleOps; -static ShuffleOps CollectShuffleElements(Value *V, +static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<Constant *> &Mask, - Value *PermittedRHS) { + Value *PermittedRHS, + InstCombiner &IC) { assert(V->getType()->isVectorTy() && "Invalid shuffle!"); unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); @@ -396,10 +439,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // otherwise we'd end up with a shuffle of three inputs. if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { Value *RHS = EI->getOperand(0); - ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); + ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC); assert(LR.second == nullptr || LR.second == RHS); if (LR.first->getType() != RHS->getType()) { + // Although we are giving up for now, see if we can create extracts + // that match the inserts for another round of combining. + replaceExtractElements(IEI, EI, IC); + // We tried our best, but we can't find anything compatible with RHS // further up the chain. Return a trivial shuffle. for (unsigned i = 0; i < NumElts; ++i) @@ -429,14 +476,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // If this insertelement is a chain that comes from exactly these two // vectors, return the vector and the effective shuffle. if (EI->getOperand(0)->getType() == PermittedRHS->getType() && - CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, + collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, Mask)) return std::make_pair(EI->getOperand(0), PermittedRHS); } } } - // Otherwise, can't do anything fancy. Return an identity vector. + // Otherwise, we can't do anything fancy. Return an identity vector. for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); return std::make_pair(V, nullptr); @@ -512,7 +559,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { // (and any insertelements it points to), into one big shuffle. 
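// Illustrative example, not from the patch itself: the kind of chain this
// collapses, shown as IR in comments. A pair of inserts fed by extracts from
// one source vector becomes a single shuffle:
//   %e0 = extractelement <4 x float> %v, i32 2
//   %i0 = insertelement <4 x float> undef, float %e0, i32 0
//   %e1 = extractelement <4 x float> %v, i32 3
//   %i1 = insertelement <4 x float> %i0, float %e1, i32 1
// folds to
//   %i1 = shufflevector <4 x float> %v, <4 x float> undef,
//                       <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>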
if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) { SmallVector<Constant*, 16> Mask; - ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr); + ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this); // The proposed shuffle may be trivial, in which case we shouldn't // perform the combine. @@ -588,8 +635,8 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::GetElementPtr: { - for (int i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) + for (Value *Operand : I->operands()) { + if (!CanEvaluateShuffled(Operand, Mask, Depth-1)) return false; } return true; @@ -617,7 +664,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, /// Rebuild a new instruction just like 'I' but with the new operands given. /// In the event of type mismatch, the type of the operands is correct. -static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { +static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) { // We don't want to use the IRBuilder here because we want the replacement // instructions to appear next to 'I', not the builder's insertion point. switch (I->getOpcode()) { @@ -760,7 +807,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { NeedsRebuild |= (V != I->getOperand(i)); } if (NeedsRebuild) { - return BuildNew(I, NewOps); + return buildNew(I, NewOps); } return I; } @@ -792,7 +839,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } -static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, +static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, bool &isLHSID, bool &isRHSID) { isLHSID = isRHSID = true; @@ -891,7 +938,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (VWidth == LHSWidth) { // Analyze the shuffle, are the LHS or RHS and identity shuffles? bool isLHSID, isRHSID; - RecognizeIdentityMask(Mask, isLHSID, isRHSID); + recognizeIdentityMask(Mask, isLHSID, isRHSID); // Eliminate identity shuffles. if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); @@ -1177,7 +1224,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If the result mask is an identity, replace uses of this instruction with // corresponding argument. 
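// Illustrative example, not from the patch itself: an identity shuffle of the
// kind recognizeIdentityMask flags, IR in comments:
//   %s = shufflevector <4 x i32> %a, <4 x i32> %b,
//                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// reads all four lanes straight out of %a (isLHSID), so every use of %s can
// simply be replaced with %a.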
bool isLHSID, isRHSID; - RecognizeIdentityMask(newMask, isLHSID, isRHSID); + recognizeIdentityMask(newMask, isLHSID, isRHSID); if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS); if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fd34a244f271..7c46cfd28fc9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,8 +42,9 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -79,14 +80,12 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(Builder, DL, GEP); } -/// ShouldChangeType - Return true if it is desirable to convert a computation -/// from 'From' to 'To'. We don't want to convert from a legal to an illegal -/// type for example, or from a smaller to a larger illegal type. -bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { - assert(From->isIntegerTy() && To->isIntegerTy()); - - unsigned FromWidth = From->getPrimitiveSizeInBits(); - unsigned ToWidth = To->getPrimitiveSizeInBits(); +/// Return true if it is desirable to convert an integer computation from a +/// given bit width to a new bit width. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(unsigned FromWidth, + unsigned ToWidth) const { bool FromLegal = DL.isLegalInteger(FromWidth); bool ToLegal = DL.isLegalInteger(ToWidth); @@ -103,6 +102,17 @@ bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { return true; } +/// Return true if it is desirable to convert a computation from 'From' to 'To'. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { + assert(From->isIntegerTy() && To->isIntegerTy()); + + unsigned FromWidth = From->getPrimitiveSizeInBits(); + unsigned ToWidth = To->getPrimitiveSizeInBits(); + return ShouldChangeType(FromWidth, ToWidth); +} + // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does @@ -156,27 +166,26 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) { I.setFastMathFlags(FMF); } -/// SimplifyAssociativeOrCommutative - This performs a few simplifications for -/// operators which are associative or commutative: -// -// Commutative operators: -// -// 1. Order operands such that they are listed from right (least complex) to -// left (most complex). This puts constants before unary operators before -// binary operators. -// -// Associative operators: -// -// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. -// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. -// -// Associative and commutative operators: -// -// 4. 
Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. -// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. -// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" -// if C1 and C2 are constants. -// +/// This performs a few simplifications for operators that are associative or +/// commutative: +/// +/// Commutative operators: +/// +/// 1. Order operands such that they are listed from right (least complex) to +/// left (most complex). This puts constants before unary operators before +/// binary operators. +/// +/// Associative operators: +/// +/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. +/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. +/// +/// Associative and commutative operators: +/// +/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. +/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. +/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" +/// if C1 and C2 are constants. bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Instruction::BinaryOps Opcode = I.getOpcode(); bool Changed = false; @@ -322,7 +331,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { } while (1); } -/// LeftDistributesOverRight - Whether "X LOp (Y ROp Z)" is always equal to +/// Return whether "X LOp (Y ROp Z)" is always equal to /// "(X LOp Y) ROp (X LOp Z)". static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -361,7 +370,7 @@ static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, } } -/// RightDistributesOverLeft - Whether "(X LOp Y) ROp Z" is always equal to +/// Return whether "(X LOp Y) ROp Z" is always equal to /// "(X ROp Z) LOp (Y ROp Z)". static bool RightDistributesOverLeft(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -519,7 +528,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, if (isa<OverflowingBinaryOperator>(Op1)) HasNSW &= Op1->hasNoSignedWrap(); - // We can propogate 'nsw' if we know that + // We can propagate 'nsw' if we know that // %Y = mul nsw i16 %X, C // %Z = add nsw i16 %Y, %X // => @@ -537,11 +546,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, return SimplifiedInst; } -/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations -/// which some other binary operation distributes over either by factorizing -/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this -/// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is -/// a win). Returns the simplified value, or null if it didn't simplify. +/// This tries to simplify binary operations which some other binary operation +/// distributes over either by factorizing out common terms +/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in +/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). +/// Returns the simplified value, or null if it didn't simplify. 
Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); @@ -623,12 +632,38 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { } } + // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0)) + // (op (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (op c, d))) + if (auto *SI0 = dyn_cast<SelectInst>(LHS)) { + if (auto *SI1 = dyn_cast<SelectInst>(RHS)) { + if (SI0->getCondition() == SI1->getCondition()) { + Value *SI = nullptr; + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect(SI0->getCondition(), + Builder->CreateBinOp(TopLevelOpcode, + SI0->getTrueValue(), + SI1->getTrueValue()), + V); + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(), + SI1->getTrueValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect( + SI0->getCondition(), V, + Builder->CreateBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue())); + if (SI) { + SI->takeName(&I); + return SI; + } + } + } + } + return nullptr; } -// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction -// if the LHS is a constant zero (which is the 'negate' form). -// +/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a +/// constant zero (which is the 'negate' form). Value *InstCombiner::dyn_castNegVal(Value *V) const { if (BinaryOperator::isNeg(V)) return BinaryOperator::getNegArgument(V); @@ -644,10 +679,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { return nullptr; } -// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the -// instruction if the LHS is a constant negative zero (which is the 'negate' -// form). -// +/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is +/// a constant negative zero (which is the 'negate' form). Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { if (BinaryOperator::isFNeg(V, IgnoreZeroSign)) return BinaryOperator::getFNegArgument(V); @@ -700,10 +733,10 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, llvm_unreachable("Unknown binary instruction type!"); } -// FoldOpIntoSelect - Given an instruction with a select as one operand and a -// constant as the other operand, try to fold the binary operator into the -// select arguments. This also works for Cast instructions, which obviously do -// not have a second operand. +/// Given an instruction with a select as one operand and a constant as the +/// other operand, try to fold the binary operator into the select arguments. +/// This also works for Cast instructions, which obviously do not have a second +/// operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions if (!SI->hasOneUse()) return nullptr; @@ -752,10 +785,9 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return nullptr; } -/// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which -/// has a PHI node as operand #0, see if we can fold the instruction into the -/// PHI (which is only possible if all operands to the PHI are constants). 
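// Illustrative example, not from the patch itself: a worked instance of the
// new select fold above, IR in comments. With a shared condition and false
// arms whose combination simplifies:
//   %l = select i1 %c, i32 %x, i32 0
//   %r = select i1 %c, i32 %y, i32 0
//   %o = add i32 %l, %r
// the false arms give "add i32 0, 0", which simplifies to 0, so the whole
// expression becomes
//   %s = add i32 %x, %y
//   %o = select i1 %c, i32 %s, i32 0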
-/// +/// Given a binary operator, cast instruction, or select which has a PHI node as +/// operand #0, see if we can fold the instruction into the PHI (which is only +/// possible if all operands to the PHI are constants). Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast<PHINode>(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); @@ -819,7 +851,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { NewPN->takeName(PN); // If we are going to have to insert a new computation, do so right before the - // predecessors terminator. + // predecessor's terminator. if (NonConstBB) Builder->SetInsertPoint(NonConstBB->getTerminator()); @@ -893,10 +925,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return ReplaceInstUsesWith(I, NewPN); } -/// FindElementAtOffset - Given a pointer type and a constant offset, determine -/// whether or not there is a sequence of GEP indices into the pointed type that -/// will land us at the specified offset. If so, fill them into NewIndices and -/// return the resultant element type, otherwise return null. +/// Given a pointer type and a constant offset, determine whether or not there +/// is a sequence of GEP indices into the pointed type that will land us at the +/// specified offset. If so, fill them into NewIndices and return the resultant +/// element type, otherwise return null. Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, SmallVectorImpl<Value *> &NewIndices) { Type *Ty = PtrTy->getElementType(); @@ -965,8 +997,8 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) { return true; } -/// Descale - Return a value X such that Val = X * Scale, or null if none. If -/// the multiplication is known not to overflow then NoSignedWrap is set. +/// Return a value X such that Val = X * Scale, or null if none. +/// If the multiplication is known not to overflow, then NoSignedWrap is set. Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!"); assert(cast<IntegerType>(Val->getType())->getBitWidth() == @@ -1008,11 +1040,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // 0'th operand of Val. std::pair<Instruction*, unsigned> Parent; - // RequireNoSignedWrap - Set if the transform requires a descaling at deeper - // levels that doesn't overflow. + // Set if the transform requires a descaling at deeper levels that doesn't + // overflow. bool RequireNoSignedWrap = false; - // logScale - log base 2 of the scale. Negative if not a power of 2. + // Log base 2 of the scale. Negative if not a power of 2. int32_t logScale = Scale.exactLogBase2(); for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down @@ -1213,16 +1245,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { /// specified one but with other operands. 
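// Illustrative example, not from the patch itself: a worked instance of
// Descale, IR in comments. Descaling %v below by Scale == 4:
//   %v = mul nsw i32 %t, 12
// yields an X with %v == X * 4, namely
//   %x = mul nsw i32 %t, 3
// and NoSignedWrap is set because the original multiply carried 'nsw'.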
static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS, InstCombiner::BuilderTy *B) { - Value *BORes = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); - if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BORes)) { - if (isa<OverflowingBinaryOperator>(NewBO)) { - NewBO->setHasNoSignedWrap(Inst.hasNoSignedWrap()); - NewBO->setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap()); - } - if (isa<PossiblyExactOperator>(NewBO)) - NewBO->setIsExact(Inst.isExact()); - } - return BORes; + Value *BO = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); + // If LHS and RHS are constant, BO won't be a binary operator. + if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO)) + NewBO->copyIRFlags(&Inst); + return BO; } /// \brief Makes transformation of binary operation specific for vector types. @@ -1256,9 +1283,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { LShuf->getMask() == RShuf->getMask()) { Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0), RShuf->getOperand(0), Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask()); - return Res; } } @@ -1294,18 +1320,11 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { } if (MayChange) { Constant *C2 = ConstantVector::get(C2M); - Value *NewLHS, *NewRHS; - if (isa<Constant>(LHS)) { - NewLHS = C2; - NewRHS = Shuffle->getOperand(0); - } else { - NewLHS = Shuffle->getOperand(0); - NewRHS = C2; - } + Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0); + Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2; Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(Inst.getType()), Shuffle->getMask()); - return Res; } } @@ -1323,7 +1342,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. bool MadeChange = false; - Type *IntPtrTy = DL.getIntPtrType(GEP.getPointerOperandType()); + Type *IntPtrTy = + DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; @@ -1333,21 +1353,25 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!SeqTy) continue; + // Index type should have the same width as IntPtr + Type *IndexTy = (*I)->getType(); + Type *NewIndexType = IndexTy->isVectorTy() ? + VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy; + // If the element type has zero size then any index over it is equivalent // to an index of zero, so replace it with zero if it is not zero already. if (SeqTy->getElementType()->isSized() && DL.getTypeAllocSize(SeqTy->getElementType()) == 0) if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { - *I = Constant::getNullValue(IntPtrTy); + *I = Constant::getNullValue(NewIndexType); MadeChange = true; } - Type *IndexTy = (*I)->getType(); - if (IndexTy != IntPtrTy) { + if (IndexTy != NewIndexType) { // If we are using a wider index than needed for this platform, shrink // it to what we need. If narrower, sign-extend it to what we need. // This explicit cast can make subsequent optimizations more obvious. 
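// Illustrative example, not from the patch itself: the shape of the vector
// fold above, IR in comments. When both operands are the same shuffle of two
// sources, the binary operator moves below the shuffle:
//   %l = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %r = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %o = add nsw <4 x i32> %l, %r
// becomes
//   %s = add nsw <4 x i32> %a, %b
//   %o = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// with 'nsw' carried over by the copyIRFlags call in CreateBinOpAsGiven.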
- *I = Builder->CreateIntCast(*I, IntPtrTy, true); + *I = Builder->CreateIntCast(*I, NewIndexType, true); MadeChange = true; } } @@ -1421,8 +1445,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } - GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + // If not all GEPs are identical we'll have to create a new PHI node. + // Check that the old PHI node has only one use so that it will get + // removed. + if (DI != -1 && !PN->hasOneUse()) + return nullptr; + GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. @@ -1432,11 +1461,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // All the GEPs feeding the PHI differ at a single offset. Clone a GEP // into the current block so it can be merged, and create a new PHI to // set that index. - Instruction *InsertPt = Builder->GetInsertPoint(); - Builder->SetInsertPoint(PN); - PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), - PN->getNumOperands()); - Builder->SetInsertPoint(InsertPt); + PHINode *NewPN; + { + IRBuilderBase::InsertPointGuard Guard(*Builder); + Builder->SetInsertPoint(PN); + NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + } for (auto &I : PN->operands()) NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), @@ -1790,7 +1821,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { I->takeName(BCI); - BCI->getParent()->getInstList().insert(BCI, I); + BCI->getParent()->getInstList().insert(BCI->getIterator(), I); ReplaceInstUsesWith(*BCI, I); } return &GEP; @@ -1931,7 +1962,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) { // Replace invoke with a NOP intrinsic to maintain the original CFG - Module *M = II->getParent()->getParent()->getParent(); + Module *M = II->getModule(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), None, "", II->getParent()); @@ -2280,9 +2311,10 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { } if (LoadInst *L = dyn_cast<LoadInst>(Agg)) // If the (non-volatile) load only has one use, we can rewrite this to a - // load from a GEP. This reduces the size of the load. - // FIXME: If a load is used only by extractvalue instructions then this - // could be done regardless of having multiple uses. + // load from a GEP. This reduces the size of the load. If a load is used + // only by extractvalue instructions then this either must have been + // optimized before, or it is a struct with padding, in which case we + // don't want to do the transformation as it loses padding knowledge. if (L->isSimple() && L->hasOneUse()) { // extractvalue has integer indices, getelementptr has Value*s. Convert. SmallVector<Value*, 4> Indices; @@ -2294,7 +2326,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // We need to insert these at the location of the old load, not at that of // the extractvalue. 
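// Illustrative example, not from the patch itself: a worked instance of the
// rewrite above, IR in comments. A single-use, non-volatile load feeding an
// extractvalue:
//   %agg = load { i32, i64 }, { i32, i64 }* %p
//   %v   = extractvalue { i32, i64 } %agg, 1
// shrinks to a load of just the selected field:
//   %gep = getelementptr inbounds { i32, i64 }, { i32, i64 }* %p, i64 0, i32 1
//   %v   = load i64, i64* %gep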
-    Builder->SetInsertPoint(L->getParent(), L);
+    Builder->SetInsertPoint(L);
     Value *GEP = Builder->CreateInBoundsGEP(L->getType(),
                                             L->getPointerOperand(), Indices);
     // Returning the load directly will cause the main loop to insert it in
@@ -2312,7 +2344,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
   return nullptr;
 }

-/// isCatchAll - Return 'true' if the given typeinfo will match anything.
+/// Return 'true' if the given typeinfo will match anything.
 static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
   switch (Personality) {
   case EHPersonality::GNU_C:
@@ -2330,6 +2362,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
   case EHPersonality::MSVC_X86SEH:
   case EHPersonality::MSVC_Win64SEH:
   case EHPersonality::MSVC_CXX:
+  case EHPersonality::CoreCLR:
     return TypeInfo->isNullValue();
   }
   llvm_unreachable("invalid enum");
@@ -2441,10 +2474,24 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
           SawCatchAll = true;
           break;
         }
-        if (AlreadyCaught.count(TypeInfo))
-          // Already caught by an earlier clause, so having it in the filter
-          // is pointless.
-          continue;
+
+        // Even if we've seen a type in a catch clause, we don't want to
+        // remove it from the filter. An unexpected type handler may be
+        // set up for a call site which throws an exception of the same
+        // type caught. In order for the exception thrown by the unexpected
+        // handler to propagate correctly, the filter must be correctly
+        // described for the call site.
+        //
+        // Example:
+        //
+        // void unexpected() { throw 1;}
+        // void foo() throw (int) {
+        //   std::set_unexpected(unexpected);
+        //   try {
+        //     throw 2.0;
+        //   } catch (int i) {}
+        // }
+
         // There is no point in having multiple copies of the same typeinfo in
         // a filter, so only add it if we didn't already.
         if (SeenInFilter.insert(TypeInfo).second)
@@ -2637,15 +2684,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
   return nullptr;
 }

-/// TryToSinkInstruction - Try to move the specified instruction from its
-/// current block into the beginning of DestBlock, which can only happen if it's
-/// safe to move the instruction past all of the instructions between it and the
-/// end of its block.
+/// Try to move the specified instruction from its current block into the
+/// beginning of DestBlock, which can only happen if it's safe to move the
+/// instruction past all of the instructions between it and the end of its
+/// block.
 static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
   assert(I->hasOneUse() && "Invariants didn't hold!");

   // Cannot move control-flow-involving, volatile loads, vaarg, etc.
-  if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() ||
+  if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
       isa<TerminatorInst>(I))
     return false;

@@ -2654,17 +2701,24 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
       &DestBlock->getParent()->getEntryBlock())
     return false;

+  // Do not sink convergent call instructions.
+  if (auto *CI = dyn_cast<CallInst>(I)) {
+    if (CI->isConvergent())
+      return false;
+  }
+
   // We can only sink load instructions if there is nothing between the load and
   // the end of block that could change the value.
   if (I->mayReadFromMemory()) {
-    for (BasicBlock::iterator Scan = I, E = I->getParent()->end();
+    for (BasicBlock::iterator Scan = I->getIterator(),
+                              E = I->getParent()->end();
          Scan != E; ++Scan)
       if (Scan->mayWriteToMemory())
         return false;
   }

   BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
-  I->moveBefore(InsertPos);
+  I->moveBefore(&*InsertPos);
   ++NumSunkInst;
   return true;
 }
@@ -2698,6 +2752,27 @@ bool InstCombiner::run() {
       }
     }

+    // In general, it is possible for computeKnownBits to determine all bits in a
+    // value even when the operands are not all constants.
+    if (!I->use_empty() && I->getType()->isIntegerTy()) {
+      unsigned BitWidth = I->getType()->getScalarSizeInBits();
+      APInt KnownZero(BitWidth, 0);
+      APInt KnownOne(BitWidth, 0);
+      computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I);
+      if ((KnownZero | KnownOne).isAllOnesValue()) {
+        Constant *C = ConstantInt::get(I->getContext(), KnownOne);
+        DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C <<
+                        " from: " << *I << '\n');
+
+        // Add operands to the worklist.
+        ReplaceInstUsesWith(*I, C);
+        ++NumConstProp;
+        EraseInstFromFunction(*I);
+        MadeIRChange = true;
+        continue;
+      }
+    }
+
     // See if we can trivially sink this instruction to a successor basic block.
     if (I->hasOneUse()) {
       BasicBlock *BB = I->getParent();
@@ -2738,7 +2813,7 @@ bool InstCombiner::run() {
     }

     // Now that we have an instruction, try combining it to simplify it.
-    Builder->SetInsertPoint(I->getParent(), I);
+    Builder->SetInsertPoint(I);
     Builder->SetCurrentDebugLocation(I->getDebugLoc());

 #ifndef NDEBUG
@@ -2768,7 +2843,7 @@ bool InstCombiner::run() {

       // Insert the new instruction into the basic block...
       BasicBlock *InstParent = I->getParent();
-      BasicBlock::iterator InsertPos = I;
+      BasicBlock::iterator InsertPos = I->getIterator();

       // If we replace a PHI with something that isn't a PHI, fix up the
       // insertion point.
@@ -2801,8 +2876,8 @@ bool InstCombiner::run() {
   return MadeIRChange;
 }

-/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding
-/// all reachable code to the worklist.
+/// Walk the function in depth-first order, adding all reachable code to the
+/// worklist.
 ///
 /// This has a couple of tricks to make the code faster and more powerful. In
 /// particular, we constant fold and DCE instructions as we go, to avoid adding
@@ -2829,7 +2904,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
       continue;

     for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
-      Instruction *Inst = BBI++;
+      Instruction *Inst = &*BBI++;

       // DCE instruction if trivially dead.
       if (isInstructionTriviallyDead(Inst, TLI)) {
@@ -2900,8 +2975,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
       }
     }

-    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
-      Worklist.push_back(TI->getSuccessor(i));
+    for (BasicBlock *SuccBB : TI->successors())
+      Worklist.push_back(SuccBB);
   } while (!Worklist.empty());

   // Once we've found all of the instructions to add to instcombine's worklist,
   // add them in reverse order. This way instcombine will visit from the top
   // of the function down. This jives well with the way that it adds all uses
   // of instructions to the worklist after doing a transformation, thus avoiding
   // some N^2 behavior in pathological cases.
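// Illustrative example, not from the patch itself: a worked instance of the
// all-bits-known fold added to run() above, IR in comments:
//   %m = mul i8 %x, 16     ; the low four bits are always zero
//   %v = and i8 %m, 15     ; masks to exactly those bits
// computeKnownBits proves KnownZero == 0xFF for %v, so every bit is known
// and %v folds to the constant 0 even though %x is not a constant.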
-  ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0],
-                             InstrsForInstCombineWorklist.size());
+  ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist);

   return MadeIRChange;
 }
@@ -2930,13 +3004,13 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
   // track of which blocks we visit.
   SmallPtrSet<BasicBlock *, 64> Visited;
   MadeIRChange |=
-      AddReachableCodeToWorklist(F.begin(), DL, Visited, ICWorklist, TLI);
+      AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI);

   // Do a quick scan over the function. If we find any blocks that are
   // unreachable, remove any instructions inside of them. This prevents
   // the instcombine code from having to deal with some bad special cases.
   for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    if (Visited.count(BB))
+    if (Visited.count(&*BB))
       continue;

     // Delete the instructions backwards, as it has a reduced likelihood of
     // having to update as many def-use and use-def chains.
     Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
     while (EndInst != BB->begin()) {
       // Delete the next to last instruction.
-      BasicBlock::iterator I = EndInst;
-      Instruction *Inst = --I;
-      if (!Inst->use_empty())
+      Instruction *Inst = &*--EndInst->getIterator();
+      if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
         Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
-      if (isa<LandingPadInst>(Inst)) {
+      if (Inst->isEHPad()) {
         EndInst = Inst;
         continue;
       }
@@ -2956,7 +3029,8 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
         ++NumDeadInst;
         MadeIRChange = true;
       }
-      Inst->eraseFromParent();
+      if (!Inst->getType()->isTokenTy())
+        Inst->eraseFromParent();
     }
   }

@@ -2968,8 +3042,6 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
                                 AliasAnalysis *AA, AssumptionCache &AC,
                                 TargetLibraryInfo &TLI, DominatorTree &DT,
                                 LoopInfo *LI = nullptr) {
-  // Minimizing size?
-  bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize);
   auto &DL = F.getParent()->getDataLayout();

   /// Builder - This is an IRBuilder that automatically inserts new
@@ -2992,7 +3064,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
     if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist))
       Changed = true;

-    InstCombiner IC(Worklist, &Builder, MinimizeSize,
+    InstCombiner IC(Worklist, &Builder, F.optForMinSize(),
                     AA, &AC, &TLI, &DT, DL, LI);
     if (IC.run())
       Changed = true;
@@ -3046,11 +3118,12 @@ public:

 void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
-  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<AAResultsWrapperPass>();
   AU.addRequired<AssumptionCacheTracker>();
   AU.addRequired<TargetLibraryInfoWrapperPass>();
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
+  AU.addPreserved<GlobalsAAWrapperPass>();
 }

 bool InstructionCombiningPass::runOnFunction(Function &F) {
@@ -3058,7 +3131,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
     return false;

   // Required analyses.
- auto AA = &getAnalysis<AliasAnalysis>(); + auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -3076,7 +3149,8 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index e7ef9f96edc2..a9df5e5898ae 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -90,7 +91,9 @@ static const char *const kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *const kAsanInitName = "__asan_init_v5"; +static const char *const kAsanInitName = "__asan_init"; +static const char *const kAsanVersionCheckName = + "__asan_version_mismatch_check_v6"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; @@ -119,6 +122,10 @@ static const unsigned kAllocaRzSize = 32; static cl::opt<bool> ClEnableKasan( "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClRecover( + "asan-recover", + cl::desc("Enable recovery mode (continue-after-error)."), + cl::Hidden, cl::init(false)); // This flag may need to be replaced with -f[no-]asan-reads. 
static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", @@ -177,7 +184,7 @@ static cl::opt<std::string> ClMemoryAccessCallbackPrefix( cl::init("__asan_")); static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas", cl::desc("instrument dynamic allocas"), - cl::Hidden, cl::init(false)); + cl::Hidden, cl::init(true)); static cl::opt<bool> ClSkipPromotableAllocas( "asan-skip-promotable-allocas", cl::desc("Do not instrument promotable allocas"), cl::Hidden, @@ -273,6 +280,11 @@ class GlobalsMetadata { GlobalsMetadata() : inited_(false) {} + void reset() { + inited_ = false; + Entries.clear(); + } + void init(Module &M) { assert(!inited_); inited_ = true; @@ -321,7 +333,7 @@ struct ShadowMapping { static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, bool IsKasan) { - bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; + bool IsAndroid = TargetTriple.isAndroid(); bool IsIOS = TargetTriple.isiOS(); bool IsFreeBSD = TargetTriple.isOSFreeBSD(); bool IsLinux = TargetTriple.isOSLinux(); @@ -338,6 +350,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, ShadowMapping Mapping; if (LongSize == 32) { + // Android is always PIE, which means that the beginning of the address + // space is always available. if (IsAndroid) Mapping.Offset = 0; else if (IsMIPS32) @@ -376,7 +390,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, // OR-ing shadow offset if more efficient (at least on x86) if the offset // is a power of two, but on ppc64 we have to use add since the shadow // offset is not necessary 1/8-th of the address space. - Mapping.OrShadowOffset = !IsPPC64 && !(Mapping.Offset & (Mapping.Offset - 1)); + Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 + && !(Mapping.Offset & (Mapping.Offset - 1)); return Mapping; } @@ -389,8 +404,9 @@ static size_t RedzoneSizeForScale(int MappingScale) { /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public FunctionPass { - explicit AddressSanitizer(bool CompileKernel = false) - : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan) { + explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false) + : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan), + Recover(Recover || ClRecover) { initializeAddressSanitizerPass(*PassRegistry::getPassRegistry()); } const char *getPassName() const override { @@ -437,7 +453,9 @@ struct AddressSanitizer : public FunctionPass { Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool runOnFunction(Function &F) override; bool maybeInsertAsanInitAtFunctionEntry(Function &F); + void markEscapedLocalAllocas(Function &F); bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; static char ID; // Pass identification, replacement for typeid DominatorTree &getDominatorTree() const { return *DT; } @@ -450,10 +468,21 @@ struct AddressSanitizer : public FunctionPass { bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr, uint64_t TypeSize) const; + /// Helper to cleanup per-function state. 
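// Illustrative sketch, not from the patch itself: the shadow address
// computation that this mapping feeds (the real lowering lives in
// memToShadow; the function name here is hypothetical).
#include <cstdint>

static uint64_t shadowAddr(uint64_t Addr, int Scale, uint64_t Offset,
                           bool OrShadowOffset) {
  uint64_t Shadow = Addr >> Scale; // one shadow byte covers 2^Scale bytes
  // OR is cheaper than ADD on x86 when Offset is a power of two, but the
  // change above excludes aarch64 as well as ppc64 from the OR form.
  return OrShadowOffset ? (Shadow | Offset) : (Shadow + Offset);
}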
+ struct FunctionStateRAII { + AddressSanitizer *Pass; + FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) { + assert(Pass->ProcessedAllocas.empty() && + "last pass forgot to clear cache"); + } + ~FunctionStateRAII() { Pass->ProcessedAllocas.clear(); } + }; + LLVMContext *C; Triple TargetTriple; int LongSize; bool CompileKernel; + bool Recover; Type *IntptrTy; ShadowMapping Mapping; DominatorTree *DT; @@ -477,8 +506,10 @@ struct AddressSanitizer : public FunctionPass { class AddressSanitizerModule : public ModulePass { public: - explicit AddressSanitizerModule(bool CompileKernel = false) - : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan) {} + explicit AddressSanitizerModule(bool CompileKernel = false, + bool Recover = false) + : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), + Recover(Recover || ClRecover) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid const char *getPassName() const override { return "AddressSanitizerModule"; } @@ -496,6 +527,7 @@ class AddressSanitizerModule : public ModulePass { GlobalsMetadata GlobalsMD; bool CompileKernel; + bool Recover; Type *IntptrTy; LLVMContext *C; Triple TargetTriple; @@ -525,6 +557,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { ShadowMapping Mapping; SmallVector<AllocaInst *, 16> AllocaVec; + SmallSetVector<AllocaInst *, 16> NonInstrumentedStaticAllocaVec; SmallVector<Instruction *, 8> RetVec; unsigned StackAlignment; @@ -545,12 +578,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { SmallVector<AllocaInst *, 1> DynamicAllocaVec; SmallVector<IntrinsicInst *, 1> StackRestoreVec; AllocaInst *DynamicAllocaLayout = nullptr; + IntrinsicInst *LocalEscapeCall = nullptr; // Maps Value to an AllocaInst from which the Value is originated. typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy; AllocaForValueMapTy AllocaForValue; - bool HasNonEmptyInlineAsm; + bool HasNonEmptyInlineAsm = false; + bool HasReturnsTwiceCall = false; std::unique_ptr<CallInst> EmptyInlineAsm; FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) @@ -562,7 +597,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping), StackAlignment(1 << Mapping.Scale), - HasNonEmptyInlineAsm(false), EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {} bool runOnFunction() { @@ -596,9 +630,24 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, Value *SavedStack) { IRBuilder<> IRB(InstBefore); + Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy); + // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we + // need to adjust extracted SP to compute the address of the most recent + // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for + // this purpose. 
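// For the @llvm.stackrestore case, the sequence built below is roughly the
// following (illustrative IR only; assumes IntptrTy is i64 and %lsp is the
// value loaded from DynamicAllocaLayout):
//
//   %off = call i64 @llvm.get.dynamic.area.offset.i64()
//   %sp  = ptrtoint i8* %savedstack to i64
//   %top = add i64 %sp, %off
//   call void @__asan_allocas_unpoison(i64 %lsp, i64 %top)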
+ if (!isa<ReturnInst>(InstBefore)) { + Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, + {IntptrTy}); + + Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + + DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), + DynamicAreaOffset); + } + IRB.CreateCall(AsanAllocasUnpoisonFunc, - {IRB.CreateLoad(DynamicAllocaLayout), - IRB.CreatePtrToInt(SavedStack, IntptrTy)}); + {IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr}); } // Unpoison dynamic allocas redzones. @@ -625,7 +674,10 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { /// \brief Collect Alloca instructions we want (and can) handle. void visitAllocaInst(AllocaInst &AI) { - if (!ASan.isInterestingAlloca(AI)) return; + if (!ASan.isInterestingAlloca(AI)) { + if (AI.isStaticAlloca()) NonInstrumentedStaticAllocaVec.insert(&AI); + return; + } StackAlignment = std::max(StackAlignment, AI.getAlignment()); if (ASan.isDynamicAlloca(AI)) @@ -639,6 +691,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void visitIntrinsicInst(IntrinsicInst &II) { Intrinsic::ID ID = II.getIntrinsicID(); if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II); + if (ID == Intrinsic::localescape) LocalEscapeCall = &II; if (!ClCheckLifetime) return; if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end) return; @@ -660,9 +713,13 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AllocaPoisonCallVec.push_back(APC); } - void visitCallInst(CallInst &CI) { - HasNonEmptyInlineAsm |= - CI.isInlineAsm() && !CI.isIdenticalTo(EmptyInlineAsm.get()); + void visitCallSite(CallSite CS) { + Instruction *I = CS.getInstruction(); + if (CallInst *CI = dyn_cast<CallInst>(I)) { + HasNonEmptyInlineAsm |= + CI->isInlineAsm() && !CI->isIdenticalTo(EmptyInlineAsm.get()); + HasReturnsTwiceCall |= CI->canReturnTwice(); + } } // ---------------------- Helpers. @@ -689,7 +746,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { Instruction *ThenTerm, Value *ValueIfFalse); }; -} // namespace +} // anonymous namespace char AddressSanitizer::ID = 0; INITIALIZE_PASS_BEGIN( @@ -697,12 +754,15 @@ INITIALIZE_PASS_BEGIN( "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( AddressSanitizer, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) -FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel) { - return new AddressSanitizer(CompileKernel); +FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel, + bool Recover) { + assert(!CompileKernel || Recover); + return new AddressSanitizer(CompileKernel, Recover); } char AddressSanitizerModule::ID = 0; @@ -711,8 +771,10 @@ INITIALIZE_PASS( "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
"ModulePass", false, false) -ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel) { - return new AddressSanitizerModule(CompileKernel); +ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel, + bool Recover) { + assert(!CompileKernel || Recover); + return new AddressSanitizerModule(CompileKernel, Recover); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -799,8 +861,10 @@ bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) { getAllocaSizeInBytes(&AI) > 0 && // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. - (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI) || - isDynamicAlloca(AI))); + (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + // inalloca allocas are not treated as static, and we don't want + // dynamic alloca instrumentation for them as well. + !AI.isUsedWithInAlloca()); ProcessedAllocas[&AI] = IsInteresting; return IsInteresting; @@ -868,10 +932,8 @@ static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) { } else { return false; } - if (!isPointerOperand(I->getOperand(0)) || - !isPointerOperand(I->getOperand(1))) - return false; - return true; + return isPointerOperand(I->getOperand(0)) && + isPointerOperand(I->getOperand(1)); } bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { @@ -919,7 +981,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL)); - if (G != NULL && (!ClInitializers || GlobalIsLinkerInitialized(G)) && + if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) && isSafeAccess(ObjSizeVis, Addr, TypeSize)) { NumOptimizedAccessesToGlobalVar++; return; @@ -1041,13 +1103,17 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, BasicBlock *NextBB = CheckTerm->getSuccessor(0); IRB.SetInsertPoint(CheckTerm); Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); - BasicBlock *CrashBlock = + if (Recover) { + CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false); + } else { + BasicBlock *CrashBlock = BasicBlock::Create(*C, "", NextBB->getParent(), NextBB); - CrashTerm = new UnreachableInst(*C, CrashBlock); - BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); - ReplaceInstWithInst(CheckTerm, NewTerm); + CrashTerm = new UnreachableInst(*C, CrashBlock); + BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); + ReplaceInstWithInst(CheckTerm, NewTerm); + } } else { - CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true); + CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover); } Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite, @@ -1084,7 +1150,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment( void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName) { // Set up the arguments to our poison/unpoison functions. - IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt()); + IRBuilder<> IRB(&GlobalInit.front(), + GlobalInit.front().getFirstInsertionPt()); // Add a call to poison all external globals before the given function starts. 
Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy); @@ -1147,6 +1214,14 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { // Do not instrument globals from special LLVM sections. if (Section.find("__llvm") != StringRef::npos) return false; + // Do not instrument function pointers to initialization and termination + // routines: dynamic linker will not properly handle redzones. + if (Section.startswith(".preinit_array") || + Section.startswith(".init_array") || + Section.startswith(".fini_array")) { + return false; + } + // Callbacks put into the CRT initializer/terminator sections // should not be instrumented. // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 @@ -1162,10 +1237,7 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { bool TAAParsed; std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier( Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize); - if (!ErrorCode.empty()) { - assert(false && "Invalid section specifier."); - return false; - } + assert(ErrorCode.empty() && "Invalid section specifier."); // Ignore the globals from the __OBJC section. The ObjC runtime assumes // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to @@ -1383,13 +1455,11 @@ void AddressSanitizer::initializeCallbacks(Module &M) { const std::string TypeStr = AccessIsWrite ? "store" : "load"; const std::string ExpStr = Exp ? "exp_" : ""; const std::string SuffixStr = CompileKernel ? "N" : "_n"; - const std::string EndingStr = CompileKernel ? "_noabort" : ""; - const Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr; - // TODO(glider): for KASan builds add _noabort to error reporting - // functions and make them actually noabort (remove the UnreachableInst). + const std::string EndingStr = Recover ? "_noabort" : ""; + Type *ExpType = Exp ? 
Type::getInt32Ty(*C) : nullptr; AsanErrorCallbackSized[AccessIsWrite][Exp] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( - kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr, + kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr + EndingStr, IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr)); AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1400,7 +1470,7 @@ void AddressSanitizer::initializeCallbacks(Module &M) { const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex); AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( - kAsanReportErrorTemplate + ExpStr + Suffix, + kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr, IRB.getVoidTy(), IntptrTy, ExpType, nullptr)); AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1448,15 +1518,20 @@ bool AddressSanitizer::doInitialization(Module &M) { if (!CompileKernel) { std::tie(AsanCtorFunction, AsanInitFunction) = - createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName, kAsanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}); + createSanitizerCtorAndInitFunctions( + M, kAsanModuleCtorName, kAsanInitName, + /*InitArgTypes=*/{}, /*InitArgs=*/{}, kAsanVersionCheckName); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); } Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel); return true; } +bool AddressSanitizer::doFinalization(Module &M) { + GlobalsMD.reset(); + return false; +} + bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // For each NSObject descendant having a +load method, this method is invoked // by the ObjC runtime before any of the static constructors is called. @@ -1466,13 +1541,41 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // We cannot just ignore these methods, because they may call other // instrumented functions. if (F.getName().find(" load]") != std::string::npos) { - IRBuilder<> IRB(F.begin()->begin()); + IRBuilder<> IRB(&F.front(), F.front().begin()); IRB.CreateCall(AsanInitFunction, {}); return true; } return false; } +void AddressSanitizer::markEscapedLocalAllocas(Function &F) { + // Find the one possible call to llvm.localescape and pre-mark allocas passed + // to it as uninteresting. This assumes we haven't started processing allocas + // yet. This check is done up front because iterating the use list in + // isInterestingAlloca would be algorithmically slower. + assert(ProcessedAllocas.empty() && "must process localescape before allocas"); + + // Try to get the declaration of llvm.localescape. If it's not in the module, + // we can exit early. + if (!F.getParent()->getFunction("llvm.localescape")) return; + + // Look for a call to llvm.localescape in the entry block. It can't be in + // any other block. + for (Instruction &I : F.getEntryBlock()) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (II && II->getIntrinsicID() == Intrinsic::localescape) { + // We found a call. Mark all the allocas passed in as uninteresting.
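// For reference, such a call looks like the following (illustrative IR);
// every operand must be a pointer to a static alloca in the entry block,
// which is why stripPointerCasts below is expected to reach an AllocaInst:
//
//   %a = alloca i32
//   %b = alloca i8
//   call void (...) @llvm.localescape(i32* %a, i8* %b)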
+ for (Value *Arg : II->arg_operands()) { + AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts()); + assert(AI && AI->isStaticAlloca() && + "non-static alloca arg to localescape"); + ProcessedAllocas[AI] = false; + } + break; + } + } +} + bool AddressSanitizer::runOnFunction(Function &F) { if (&F == AsanCtorFunction) return false; if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; @@ -1488,6 +1591,12 @@ bool AddressSanitizer::runOnFunction(Function &F) { if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false; + FunctionStateRAII CleanupObj(this); + + // We can't instrument allocas used with llvm.localescape. Only static allocas + // can be passed to that intrinsic. + markEscapedLocalAllocas(F); + // We want to instrument every address only once per basic block (unless there // are calls between uses). SmallSet<Value *, 16> TempsToInstrument; @@ -1715,6 +1824,16 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() { void FunctionStackPoisoner::poisonStack() { assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); + // Insert poison calls for lifetime intrinsics for alloca. + bool HavePoisonedAllocas = false; + for (const auto &APC : AllocaPoisonCallVec) { + assert(APC.InsBefore); + assert(APC.AI); + IRBuilder<> IRB(APC.InsBefore); + poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); + HavePoisonedAllocas |= APC.DoPoison; + } + if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) { // Handle dynamic allocas. createDynamicAllocasInitStorage(); @@ -1723,7 +1842,7 @@ void FunctionStackPoisoner::poisonStack() { unpoisonDynamicAllocas(); } - if (AllocaVec.size() == 0) return; + if (AllocaVec.empty()) return; int StackMallocIdx = -1; DebugLoc EntryDebugLocation; @@ -1734,6 +1853,19 @@ void FunctionStackPoisoner::poisonStack() { IRBuilder<> IRB(InsBefore); IRB.SetCurrentDebugLocation(EntryDebugLocation); + // Make sure non-instrumented allocas stay in the entry block. Otherwise, + // debug info is broken, because only entry-block allocas are treated as + // regular stack slots. + auto InsBeforeB = InsBefore->getParent(); + assert(InsBeforeB == &F.getEntryBlock()); + for (BasicBlock::iterator I(InsBefore); I != InsBeforeB->end(); ++I) + if (auto *AI = dyn_cast<AllocaInst>(I)) + if (NonInstrumentedStaticAllocaVec.count(AI) > 0) + AI->moveBefore(InsBefore); + + // If we have a call to llvm.localescape, keep it in the entry block. + if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore); + SmallVector<ASanStackVariableDescription, 16> SVD; SVD.reserve(AllocaVec.size()); for (AllocaInst *AI : AllocaVec) { @@ -1751,10 +1883,15 @@ void FunctionStackPoisoner::poisonStack() { uint64_t LocalStackSize = L.FrameSize; bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel && LocalStackSize <= kMaxStackMallocSize; - // Don't do dynamic alloca or stack malloc in presence of inline asm: - // too often it makes assumptions on which registers are available. - bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm; - DoStackMalloc &= !HasNonEmptyInlineAsm; + bool DoDynamicAlloca = ClDynamicAllocaStack; + // Don't do dynamic alloca or stack malloc if: + // 1) There is inline asm: too often it makes assumptions on which registers + // are available. + // 2) There is a returns_twice call (typically setjmp), which is + // optimization-hostile, and doesn't play well with introduced indirect + // register-relative calculation of local variable addresses. 
+ DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall; + DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall; Value *StaticAlloca = DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false); @@ -1804,16 +1941,6 @@ void FunctionStackPoisoner::poisonStack() { DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca; } - // Insert poison calls for lifetime intrinsics for alloca. - bool HavePoisonedAllocas = false; - for (const auto &APC : AllocaPoisonCallVec) { - assert(APC.InsBefore); - assert(APC.AI); - IRBuilder<> IRB(APC.InsBefore); - poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); - HavePoisonedAllocas |= APC.DoPoison; - } - // Replace Alloca instructions with base+offset. for (const auto &Desc : SVD) { AllocaInst *AI = Desc.AI; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index f6858034d79e..fd3dfd9af033 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -106,7 +106,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { } ++ChecksAdded; - Instruction *Inst = Builder->GetInsertPoint(); + BasicBlock::iterator Inst = Builder->GetInsertPoint(); BasicBlock *OldBB = Inst->getParent(); BasicBlock *Cont = OldBB->splitBasicBlock(Inst); OldBB->getTerminator()->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h new file mode 100644 index 000000000000..c47fdbf68996 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h @@ -0,0 +1,217 @@ +//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a Union-find algorithm to compute the Minimum Spanning +// Tree for a given CFG. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <string> +#include <utility> +#include <vector> + +namespace llvm { + +#define DEBUG_TYPE "cfgmst" + +/// \brief A union-find based Minimum Spanning Tree for CFG +/// +/// Implements a Union-find algorithm to compute the Minimum Spanning Tree +/// for a given CFG. +template <class Edge, class BBInfo> class CFGMST { +public: + Function &F; + + // Store all the edges in the CFG. It may contain some stale edges + // when Removed is set. + std::vector<std::unique_ptr<Edge>> AllEdges; + + // This map records the auxiliary information for each BB. + DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos; + + // Find the root group of G and compress the path from G to the root. + BBInfo *findAndCompressGroup(BBInfo *G) { + if (G->Group != G) + G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group)); + return static_cast<BBInfo *>(G->Group); + } + + // Union BB1 and BB2 into the same group and return true.
+ // Returns false if BB1 and BB2 are already in the same group. + bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) { + BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1)); + BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2)); + + if (BB1G == BB2G) + return false; + + // Make the smaller-rank tree a direct child of the root of the higher-rank tree. + if (BB1G->Rank < BB2G->Rank) + BB1G->Group = BB2G; + else { + BB2G->Group = BB1G; + // If the ranks are the same, increment the rank of one root by one. + if (BB1G->Rank == BB2G->Rank) + BB1G->Rank++; + } + return true; + } + + // Given a BB, return its auxiliary information. + BBInfo &getBBInfo(const BasicBlock *BB) const { + auto It = BBInfos.find(BB); + assert(It->second.get() != nullptr); + return *It->second.get(); + } + + // Traverse the CFG using a stack. Find all the edges and assign the weight. + // Edges with large weights will be put into the MST first, so they are less + // likely to be instrumented. + void buildEdges() { + DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n"); + + const BasicBlock *BB = &(F.getEntryBlock()); + uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2); + // Add a fake edge to the entry. + addEdge(nullptr, BB, EntryWeight); + + // Special handling for single BB functions. + if (succ_empty(BB)) { + addEdge(BB, nullptr, EntryWeight); + return; + } + + static const uint32_t CriticalEdgeMultiplier = 1000; + + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + uint64_t BBWeight = + (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2); + uint64_t Weight = 2; + if (int successors = TI->getNumSuccessors()) { + for (int i = 0; i != successors; ++i) { + BasicBlock *TargetBB = TI->getSuccessor(i); + bool Critical = isCriticalEdge(TI, i); + uint64_t scaleFactor = BBWeight; + if (Critical) { + if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier) + scaleFactor *= CriticalEdgeMultiplier; + else + scaleFactor = UINT64_MAX; + } + if (BPI != nullptr) + Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor); + addEdge(&*BB, TargetBB, Weight).IsCritical = Critical; + DEBUG(dbgs() << " Edge: from " << BB->getName() << " to " + << TargetBB->getName() << " w=" << Weight << "\n"); + } + } else { + addEdge(&*BB, nullptr, BBWeight); + DEBUG(dbgs() << " Edge: from " << BB->getName() << " to exit" + << " w = " << BBWeight << "\n"); + } + } + } + + // Sort CFG edges based on their weight. + void sortEdgesByWeight() { + std::stable_sort(AllEdges.begin(), AllEdges.end(), + [](const std::unique_ptr<Edge> &Edge1, + const std::unique_ptr<Edge> &Edge2) { + return Edge1->Weight > Edge2->Weight; + }); + } + + // Traverse all the edges and compute the Minimum Weight Spanning Tree + // using the union-find algorithm. + void computeMinimumSpanningTree() { + // First, put all critical edges whose Dest is a landing pad into the MST. + // This works around insufficient support for splitting critical edges + // when the destination BB is a landing pad. + for (auto &Ei : AllEdges) { + if (Ei->Removed) + continue; + if (Ei->IsCritical) { + if (Ei->DestBB && Ei->DestBB->isLandingPad()) { + if (unionGroups(Ei->SrcBB, Ei->DestBB)) + Ei->InMST = true; + } + } + } + + for (auto &Ei : AllEdges) { + if (Ei->Removed) + continue; + if (unionGroups(Ei->SrcBB, Ei->DestBB)) + Ei->InMST = true; + } + } +
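// Taken together, buildEdges, sortEdgesByWeight and computeMinimumSpanningTree
// form a Kruskal-style construction: edges are visited heaviest-first and
// unionGroups doubles as the cycle test. A small worked example (not from this
// file): in a diamond CFG with edges A->B (w=8), A->C (w=2), B->D (w=8) and
// C->D (w=2), the two weight-8 edges enter the MST first and A->C then
// connects C; C->D finds C and D already in one group, stays out of the MST,
// and is therefore the edge left to be instrumented.
+ // Dump the debug information about the instrumentation.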
+ void dumpEdges(raw_ostream &OS, const Twine &Message) const { + if (!Message.str().empty()) + OS << Message << "\n"; + OS << " Number of Basic Blocks: " << BBInfos.size() << "\n"; + for (auto &BI : BBInfos) { + const BasicBlock *BB = BI.first; + OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " " + << BI.second->infoString() << "\n"; + } + + OS << " Number of Edges: " << AllEdges.size() + << " (*: Instrument, C: CriticalEdge, -: Removed)\n"; + uint32_t Count = 0; + for (auto &EI : AllEdges) + OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->" + << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n"; + } + + // Add an edge to AllEdges with weight W. + Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) { + uint32_t Index = BBInfos.size(); + auto Iter = BBInfos.end(); + bool Inserted; + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); + if (Inserted) { + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + Index++; + } + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); + if (Inserted) + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + AllEdges.emplace_back(new Edge(Src, Dest, W)); + return *AllEdges.back(); + } + + BranchProbabilityInfo *BPI; + BlockFrequencyInfo *BFI; + +public: + CFGMST(Function &Func, BranchProbabilityInfo *BPI_ = nullptr, + BlockFrequencyInfo *BFI_ = nullptr) + : F(Func), BPI(BPI_), BFI(BFI_) { + buildEdges(); + sortEdgesByWeight(); + computeMinimumSpanningTree(); + } +}; + +#undef DEBUG_TYPE // "cfgmst" +} // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2de6e1afaba9..d459fc50d136 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -72,6 +72,11 @@ using namespace llvm; +// External symbol to be used when generating the shadow address for +// architectures with multiple VMAs. Instead of using a constant integer +// the runtime will set the external mask based on the VMA range. +static const char *const kDFSanExternShadowPtrMask = "__dfsan_shadow_ptr_mask"; + // The -dfsan-preserve-alignment flag controls whether this pass assumes that // alignment requirements provided by the input IR are correct. 
For example, // if the input IR contains a load with alignment 8, this flag will cause @@ -124,6 +129,7 @@ static cl::opt<bool> ClDebugNonzeroLabels( "load or return with a nonzero label"), cl::Hidden); + namespace { StringRef GetGlobalTypeString(const GlobalValue &G) { @@ -231,6 +237,7 @@ class DataFlowSanitizer : public ModulePass { void *(*GetRetvalTLSPtr)(); Constant *GetArgTLS; Constant *GetRetvalTLS; + Constant *ExternalShadowMask; FunctionType *DFSanUnionFnTy; FunctionType *DFSanUnionLoadFnTy; FunctionType *DFSanUnimplementedFnTy; @@ -248,7 +255,7 @@ class DataFlowSanitizer : public ModulePass { DFSanABIList ABIList; DenseMap<Value *, Function *> UnwrappedFnMap; AttributeSet ReadOnlyNoneAttrs; - DenseMap<const Function *, DISubprogram *> FunctionDIs; + bool DFSanRuntimeShadowMask; Value *getShadowAddress(Value *Addr, Instruction *Pos); bool isInstrumented(const Function *F); @@ -362,7 +369,8 @@ llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles, DataFlowSanitizer::DataFlowSanitizer( const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(), void *(*getRetValTLS)()) - : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), + DFSanRuntimeShadowMask(false) { std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), ClABIListFiles.end()); @@ -420,6 +428,8 @@ bool DataFlowSanitizer::doInitialization(Module &M) { bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el; + bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 || + TargetTriple.getArch() == llvm::Triple::aarch64_be; const DataLayout &DL = M.getDataLayout(); @@ -434,6 +444,9 @@ bool DataFlowSanitizer::doInitialization(Module &M) { ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); else if (IsMIPS64) ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); + // AArch64 supports multiple VMAs and the shadow mask is set at runtime. 
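// In either mode, the shadow address computed by getShadowAddress (later in
// this file) is, schematically,
//
//   shadow(addr) = (addr & mask) * ShadowPtrMul
//
// where mask is the compile-time ShadowPtrMask constant on x86_64 and mips64
// but, on AArch64, a value loaded at runtime from the external
// __dfsan_shadow_ptr_mask global, since the VMA size is only known then.
// (A sketch of the arithmetic, not code from the patch.)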
+ else if (IsAArch64) + DFSanRuntimeShadowMask = true; else report_fatal_error("unsupported triple"); @@ -578,7 +591,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true); Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) - DFSF.ValShadowMap[ValAI] = ShadowAI; + DFSF.ValShadowMap[&*ValAI] = &*ShadowAI; DFSanVisitor(DFSF).visitCallInst(*CI); if (!FT->getReturnType()->isVoidTy()) new StoreInst(DFSF.getShadow(RI->getReturnValue()), @@ -592,8 +605,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (ABIList.isIn(M, "skip")) return false; - FunctionDIs = makeSubprogramMap(M); - if (!GetArgTLSPtr) { Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); @@ -606,6 +617,9 @@ bool DataFlowSanitizer::runOnModule(Module &M) { G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); } + ExternalShadowMask = + Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy); + DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy); if (Function *F = dyn_cast<Function>(DFSanUnionFn)) { F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); @@ -643,16 +657,16 @@ bool DataFlowSanitizer::runOnModule(Module &M) { std::vector<Function *> FnsToInstrument; llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI; - for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { - if (!i->isIntrinsic() && - i != DFSanUnionFn && - i != DFSanCheckedUnionFn && - i != DFSanUnionLoadFn && - i != DFSanUnimplementedFn && - i != DFSanSetLabelFn && - i != DFSanNonzeroLabelFn && - i != DFSanVarargWrapperFn) - FnsToInstrument.push_back(&*i); + for (Function &i : M) { + if (!i.isIntrinsic() && + &i != DFSanUnionFn && + &i != DFSanCheckedUnionFn && + &i != DFSanUnionLoadFn && + &i != DFSanUnimplementedFn && + &i != DFSanSetLabelFn && + &i != DFSanNonzeroLabelFn && + &i != DFSanVarargWrapperFn) + FnsToInstrument.push_back(&i); } // Give function aliases prefixes when necessary, and build wrappers where the @@ -710,7 +724,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { NewFArg = NewF->arg_begin(), FArgEnd = F.arg_end(); FArg != FArgEnd; ++FArg, ++NewFArg) { - FArg->replaceAllUsesWith(NewFArg); + FArg->replaceAllUsesWith(&*NewFArg); } NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); @@ -750,11 +764,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) { ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); F.replaceAllUsesWith(WrappedFnCst); - // Patch the pointer to LLVM function in debug info descriptor. 
- auto DI = FunctionDIs.find(&F); - if (DI != FunctionDIs.end()) - DI->second->replaceFunction(&F); - UnwrappedFnMap[WrappedFnCst] = &F; *i = NewF; @@ -842,7 +851,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (Instruction *I = dyn_cast<Instruction>(V)) Pos = I->getNextNode(); else - Pos = DFSF.F->getEntryBlock().begin(); + Pos = &DFSF.F->getEntryBlock().front(); while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) Pos = Pos->getNextNode(); IRBuilder<> IRB(Pos); @@ -864,7 +873,7 @@ Value *DFSanFunction::getArgTLSPtr() { if (DFS.ArgTLS) return ArgTLSPtr = DFS.ArgTLS; - IRBuilder<> IRB(F->getEntryBlock().begin()); + IRBuilder<> IRB(&F->getEntryBlock().front()); return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {}); } @@ -874,7 +883,7 @@ Value *DFSanFunction::getRetvalTLS() { if (DFS.RetvalTLS) return RetvalTLSPtr = DFS.RetvalTLS; - IRBuilder<> IRB(F->getEntryBlock().begin()); + IRBuilder<> IRB(&F->getEntryBlock().front()); return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {}); } @@ -906,7 +915,7 @@ Value *DFSanFunction::getShadow(Value *V) { Function::arg_iterator i = F->arg_begin(); while (ArgIdx--) ++i; - Shadow = i; + Shadow = &*i; assert(Shadow->getType() == DFS.ShadowTy); break; } @@ -928,9 +937,15 @@ void DFSanFunction::setShadow(Instruction *I, Value *Shadow) { Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { assert(Addr != RetvalTLS && "Reinstrumenting?"); IRBuilder<> IRB(Pos); + Value *ShadowPtrMaskValue; + if (DFSanRuntimeShadowMask) + ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask); + else + ShadowPtrMaskValue = ShadowPtrMask; return IRB.CreateIntToPtr( IRB.CreateMul( - IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask), + IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), + IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)), ShadowPtrMul), ShadowPtrTy); } @@ -991,7 +1006,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { Call->addAttribute(2, Attribute::ZExt); BasicBlock *Tail = BI->getSuccessor(0); - PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin()); + PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); Phi->addIncoming(Call, Call->getParent()); Phi->addIncoming(V1, Head); @@ -1105,7 +1120,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow); BasicBlock *Head = Pos->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(Pos); + BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator()); if (DomTreeNode *OldNode = DT.getNode(Head)) { std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); @@ -1475,8 +1490,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if (FT->isVarArg()) { auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy, CS.arg_size() - FT->getNumParams()); - auto *LabelVAAlloca = new AllocaInst(LabelVATy, "labelva", - DFSF.F->getEntryBlock().begin()); + auto *LabelVAAlloca = new AllocaInst( + LabelVATy, "labelva", &DFSF.F->getEntryBlock().front()); for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) { auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n); @@ -1490,7 +1505,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if (!DFSF.LabelReturnAlloca) { DFSF.LabelReturnAlloca = new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn", - DFSF.F->getEntryBlock().begin()); + &DFSF.F->getEntryBlock().front()); } Args.push_back(DFSF.LabelReturnAlloca); } @@ -1529,13 +1544,14 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if 
(!CS.getType()->isVoidTy()) { if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { if (II->getNormalDest()->getSinglePredecessor()) { - Next = II->getNormalDest()->begin(); + Next = &II->getNormalDest()->front(); } else { BasicBlock *NewBB = SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); - Next = NewBB->begin(); + Next = &NewBB->front(); } } else { + assert(CS->getIterator() != CS->getParent()->end()); Next = CS->getNextNode(); } @@ -1568,7 +1584,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { unsigned VarArgSize = CS.arg_size() - FT->getNumParams(); ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize); AllocaInst *VarArgShadow = - new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin()); + new AllocaInst(VarArgArrayTy, "", &DFSF.F->getEntryBlock().front()); Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0)); for (unsigned n = 0; i != e; ++i, ++n) { IRB.CreateStore( diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 9a3ed5c04efc..fa939aee252a 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -138,6 +138,7 @@ namespace { Module *M; LLVMContext *Ctx; SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; + DenseMap<DISubprogram *, Function *> FnMap; }; } @@ -309,13 +310,12 @@ namespace { // object users can construct, the blocks and lines will be rooted here. class GCOVFunction : public GCOVRecord { public: - GCOVFunction(const DISubprogram *SP, raw_ostream *os, uint32_t Ident, - bool UseCfgChecksum, bool ExitBlockBeforeBody) + GCOVFunction(const DISubprogram *SP, Function *F, raw_ostream *os, + uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody) : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0), ReturnBlock(1, os) { this->os = os; - Function *F = SP->getFunction(); DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); uint32_t i = 0; @@ -347,8 +347,8 @@ namespace { std::string EdgeDestinations; raw_string_ostream EDOS(EdgeDestinations); Function *F = Blocks.begin()->first->getParent(); - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = getBlock(I); + for (BasicBlock &I : *F) { + GCOVBlock &Block = getBlock(&I); for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) EDOS << Block.OutEdges[i]->Number; } @@ -389,8 +389,8 @@ namespace { // Emit edges between blocks. if (Blocks.empty()) return; Function *F = Blocks.begin()->first->getParent(); - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = getBlock(I); + for (BasicBlock &I : *F) { + GCOVBlock &Block = getBlock(&I); if (Block.OutEdges.empty()) continue; writeBytes(EdgeTag, 4); @@ -405,9 +405,8 @@ namespace { } // Emit lines for each block. 
- for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - getBlock(I).writeOut(); - } + for (BasicBlock &I : *F) + getBlock(&I).writeOut(); } private: @@ -451,6 +450,12 @@ bool GCOVProfiler::runOnModule(Module &M) { this->M = &M; Ctx = &M.getContext(); + FnMap.clear(); + for (Function &F : M) { + if (DISubprogram *SP = F.getSubprogram()) + FnMap[SP] = &F; + } + if (Options.EmitNotes) emitProfileNotes(); if (Options.EmitData) return emitProfileArcs(); return false; @@ -495,7 +500,7 @@ void GCOVProfiler::emitProfileNotes() { unsigned FunctionIdent = 0; for (auto *SP : CU->getSubprograms()) { - Function *F = SP->getFunction(); + Function *F = FnMap[SP]; if (!F) continue; if (!functionHasLines(F)) continue; @@ -507,13 +512,13 @@ void GCOVProfiler::emitProfileNotes() { ++It; EntryBlock.splitBasicBlock(It); - Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++, + Funcs.push_back(make_unique<GCOVFunction>(SP, F, &out, FunctionIdent++, Options.UseCfgChecksum, Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - GCOVBlock &Block = Func.getBlock(BB); + GCOVBlock &Block = Func.getBlock(&*BB); TerminatorInst *TI = BB->getTerminator(); if (int successors = TI->getNumSuccessors()) { for (int i = 0; i != successors; ++i) { @@ -574,7 +579,7 @@ bool GCOVProfiler::emitProfileArcs() { auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; for (auto *SP : CU->getSubprograms()) { - Function *F = SP->getFunction(); + Function *F = FnMap[SP]; if (!F) continue; if (!functionHasLines(F)) continue; if (!Result) Result = true; @@ -605,7 +610,7 @@ bool GCOVProfiler::emitProfileArcs() { int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors(); if (Successors) { if (Successors == 1) { - IRBuilder<> Builder(BB->getFirstInsertionPt()); + IRBuilder<> Builder(&*BB->getFirstInsertionPt()); Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Edge); Value *Count = Builder.CreateLoad(Counter); @@ -625,7 +630,7 @@ bool GCOVProfiler::emitProfileArcs() { Count = Builder.CreateAdd(Count, Builder.getInt64(1)); Builder.CreateStore(Count, Counter); } else { - ComplexEdgePreds.insert(BB); + ComplexEdgePreds.insert(&*BB); for (int i = 0; i != Successors; ++i) ComplexEdgeSuccs.insert(TI->getSuccessor(i)); } @@ -641,13 +646,13 @@ bool GCOVProfiler::emitProfileArcs() { GlobalVariable *EdgeState = getEdgeStateValue(); for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { - IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt()); + IRBuilder<> Builder(&*ComplexEdgePreds[i + 1]->getFirstInsertionPt()); Builder.CreateStore(Builder.getInt32(i), EdgeState); } for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { // Call runtime to perform increment. 
- IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt()); + IRBuilder<> Builder(&*ComplexEdgeSuccs[i + 1]->getFirstInsertionPt()); Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); @@ -731,8 +736,8 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( IRBuilder<> Builder(Succ); Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Edge + i); - EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) + - (Preds.idFor(BB)-1)] = cast<Constant>(Counter); + EdgeTable[((Succs.idFor(Succ) - 1) * Preds.size()) + + (Preds.idFor(&*BB) - 1)] = cast<Constant>(Counter); } } Edge += Successors; @@ -901,7 +906,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { // uint32_t pred = *predecessor; // if (pred == 0xffffffff) return; - Argument *Arg = Fn->arg_begin(); + Argument *Arg = &*Fn->arg_begin(); Arg->setName("predecessor"); Value *Pred = Builder.CreateLoad(Arg, "pred"); Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff)); @@ -912,7 +917,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { // uint64_t *counter = counters[pred]; // if (!counter) return; Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty()); - Arg = std::next(Fn->arg_begin()); + Arg = &*std::next(Fn->arg_begin()); Arg->setName("counters"); Value *GEP = Builder.CreateGEP(Type::getInt64PtrTy(*Ctx), Arg, ZExtPred); Value *Counter = Builder.CreateLoad(GEP, "counter"); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 712bf8edc7ea..92e41ee27c09 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -7,18 +7,18 @@ // //===----------------------------------------------------------------------===// // -// This pass lowers instrprof_increment intrinsics emitted by a frontend for -// profiling. It also builds the data structures and initialization code needed -// for updating execution counts and emitting the profile at runtime. +// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling. +// It also builds the data structures and initialization code needed for +// updating execution counts and emitting the profile at runtime. // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Instrumentation.h" - #include "llvm/ADT/Triple.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -49,7 +49,15 @@ public: private: InstrProfOptions Options; Module *M; - DenseMap<GlobalVariable *, GlobalVariable *> RegionCounters; + typedef struct PerFunctionProfileData { + uint32_t NumValueSites[IPVK_Last+1]; + GlobalVariable* RegionCounters; + GlobalVariable* DataVar; + PerFunctionProfileData() : RegionCounters(nullptr), DataVar(nullptr) { + memset(NumValueSites, 0, sizeof(uint32_t) * (IPVK_Last+1)); + } + } PerFunctionProfileData; + DenseMap<GlobalVariable *, PerFunctionProfileData> ProfileDataMap; std::vector<Value *> UsedVars; bool isMachO() const { @@ -58,24 +66,30 @@ private: /// Get the section name for the counter variables. StringRef getCountersSection() const { - return isMachO() ?
"__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts"; + return getInstrProfCountersSectionName(isMachO()); } /// Get the section name for the name variables. StringRef getNameSection() const { - return isMachO() ? "__DATA,__llvm_prf_names" : "__llvm_prf_names"; + return getInstrProfNameSectionName(isMachO()); } /// Get the section name for the profile data variables. StringRef getDataSection() const { - return isMachO() ? "__DATA,__llvm_prf_data" : "__llvm_prf_data"; + return getInstrProfDataSectionName(isMachO()); } /// Get the section name for the coverage mapping data. StringRef getCoverageSection() const { - return isMachO() ? "__DATA,__llvm_covmap" : "__llvm_covmap"; + return getInstrProfCoverageSectionName(isMachO()); } + /// Count the number of instrumented value sites for the function. + void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins); + + /// Replace instrprof_value_profile with a call to the runtime library. + void lowerValueProfileInst(InstrProfValueProfileInst *Ins); + /// Replace instrprof_increment with an increment of the appropriate value. void lowerIncrement(InstrProfIncrementInst *Inc); @@ -117,20 +131,37 @@ bool InstrProfiling::runOnModule(Module &M) { bool MadeChange = false; this->M = &M; - RegionCounters.clear(); + ProfileDataMap.clear(); UsedVars.clear(); + // We did not know how many value sites there would be inside + // the instrumented function. This counts the number of instrumented + // target value sites to record as a field in the profile data variable. for (Function &F : M) for (BasicBlock &BB : F) for (auto I = BB.begin(), E = BB.end(); I != E;) - if (auto *Inc = dyn_cast<InstrProfIncrementInst>(I++)) { + if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I++)) + computeNumValueSiteCounts(Ind); + + for (Function &F : M) + for (BasicBlock &BB : F) + for (auto I = BB.begin(), E = BB.end(); I != E;) { + auto Instr = I++; + if (auto *Inc = dyn_cast<InstrProfIncrementInst>(Instr)) { lowerIncrement(Inc); MadeChange = true; + } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { + lowerValueProfileInst(Ind); + MadeChange = true; } - if (GlobalVariable *Coverage = M.getNamedGlobal("__llvm_coverage_mapping")) { + } + + if (GlobalVariable *Coverage = + M.getNamedGlobal(getCoverageMappingVarName())) { lowerCoverageData(Coverage); MadeChange = true; } + if (!MadeChange) return false; @@ -141,10 +172,59 @@ bool InstrProfiling::runOnModule(Module &M) { return true; } +static Constant *getOrInsertValueProfilingCall(Module &M) { + LLVMContext &Ctx = M.getContext(); + auto *ReturnTy = Type::getVoidTy(M.getContext()); + Type *ParamTypes[] = { +#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType +#include "llvm/ProfileData/InstrProfData.inc" + }; + auto *ValueProfilingCallTy = + FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false); + return M.getOrInsertFunction(getInstrProfValueProfFuncName(), + ValueProfilingCallTy); +} + +void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { + + GlobalVariable *Name = Ind->getName(); + uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); + uint64_t Index = Ind->getIndex()->getZExtValue(); + auto It = ProfileDataMap.find(Name); + if (It == ProfileDataMap.end()) { + PerFunctionProfileData PD; + PD.NumValueSites[ValueKind] = Index + 1; + ProfileDataMap[Name] = PD; + } else if (It->second.NumValueSites[ValueKind] <= Index) + It->second.NumValueSites[ValueKind] = Index + 1; +} + +void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind)
{ + + GlobalVariable *Name = Ind->getName(); + auto It = ProfileDataMap.find(Name); + assert(It != ProfileDataMap.end() && It->second.DataVar && + "value profiling detected in function with no counter increment"); + + GlobalVariable *DataVar = It->second.DataVar; + uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); + uint64_t Index = Ind->getIndex()->getZExtValue(); + for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind) + Index += It->second.NumValueSites[Kind]; + + IRBuilder<> Builder(Ind); + Value* Args[3] = {Ind->getTargetValue(), + Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), + Builder.getInt32(Index)}; + Ind->replaceAllUsesWith( + Builder.CreateCall(getOrInsertValueProfilingCall(*M), Args)); + Ind->eraseFromParent(); +} + void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { GlobalVariable *Counters = getOrCreateRegionCounters(Inc); - IRBuilder<> Builder(Inc->getParent(), *Inc); + IRBuilder<> Builder(Inc); uint64_t Index = Inc->getIndex()->getZExtValue(); Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); Value *Count = Builder.CreateLoad(Addr, "pgocount"); @@ -172,9 +252,10 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) { GlobalVariable *Name = cast<GlobalVariable>(V); // If we have region counters for this name, we've already handled it. - auto It = RegionCounters.find(Name); - if (It != RegionCounters.end()) - continue; + auto It = ProfileDataMap.find(Name); + if (It != ProfileDataMap.end()) + if (It->second.RegionCounters) + continue; // Move the name variable to the right section. Name->setSection(getNameSection()); @@ -183,69 +264,108 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) { } /// Get the name of a profiling variable for a particular function. -static std::string getVarName(InstrProfIncrementInst *Inc, StringRef VarName) { - auto *Arr = cast<ConstantDataArray>(Inc->getName()->getInitializer()); - StringRef Name = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString(); - return ("__llvm_profile_" + VarName + "_" + Name).str(); +static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) { + StringRef NamePrefix = getInstrProfNameVarPrefix(); + StringRef Name = Inc->getName()->getName().substr(NamePrefix.size()); + return (Prefix + Name).str(); +} + +static inline bool shouldRecordFunctionAddr(Function *F) { + // Check the linkage + if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() && + !F->hasAvailableExternallyLinkage()) + return true; + // Check for uses of this function other than direct calls or invokes to it. + return F->hasAddressTaken(); +} + +static inline Comdat *getOrCreateProfileComdat(Module &M, + InstrProfIncrementInst *Inc) { + // COFF format requires a COMDAT section to have a key symbol with the same + // name. A linker targeting COFF also requires that a COMDAT section that + // another section is associated with must precede the associating section. + // For this reason, we must choose the name var's name as the name of the + // comdat. + StringRef ComdatPrefix = (Triple(M.getTargetTriple()).isOSBinFormatCOFF() + ? getInstrProfNameVarPrefix() + : getInstrProfComdatPrefix()); + return M.getOrInsertComdat(StringRef(getVarName(Inc, ComdatPrefix))); } GlobalVariable * InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { - GlobalVariable *Name = Inc->getName(); - auto It = RegionCounters.find(Name); - if (It != RegionCounters.end()) - return It->second; - - // Move the name variable to the right section.
Make sure it is placed in the - // same comdat as its associated function. Otherwise, we may get multiple - // counters for the same function in certain cases. + GlobalVariable *NamePtr = Inc->getName(); + auto It = ProfileDataMap.find(NamePtr); + PerFunctionProfileData PD; + if (It != ProfileDataMap.end()) { + if (It->second.RegionCounters) + return It->second.RegionCounters; + PD = It->second; + } + + // Move the name variable to the right section. Place them in a COMDAT group + // if the associated function is a COMDAT. This makes sure that + // only one copy of the COMDAT function's counters is emitted after + // linking. Function *Fn = Inc->getParent()->getParent(); - Name->setSection(getNameSection()); - Name->setAlignment(1); - Name->setComdat(Fn->getComdat()); + Comdat *ProfileVarsComdat = nullptr; + if (Fn->hasComdat()) + ProfileVarsComdat = getOrCreateProfileComdat(*M, Inc); + NamePtr->setSection(getNameSection()); + NamePtr->setAlignment(1); + NamePtr->setComdat(ProfileVarsComdat); uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); LLVMContext &Ctx = M->getContext(); ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); // Create the counters variable. - auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), - Constant::getNullValue(CounterTy), - getVarName(Inc, "counters")); - Counters->setVisibility(Name->getVisibility()); - Counters->setSection(getCountersSection()); - Counters->setAlignment(8); - Counters->setComdat(Fn->getComdat()); - - RegionCounters[Inc->getName()] = Counters; + auto *CounterPtr = + new GlobalVariable(*M, CounterTy, false, NamePtr->getLinkage(), + Constant::getNullValue(CounterTy), + getVarName(Inc, getInstrProfCountersVarPrefix())); + CounterPtr->setVisibility(NamePtr->getVisibility()); + CounterPtr->setSection(getCountersSection()); + CounterPtr->setAlignment(8); + CounterPtr->setComdat(ProfileVarsComdat); // Create data variable. - auto *NameArrayTy = Name->getType()->getPointerElementType(); - auto *Int32Ty = Type::getInt32Ty(Ctx); - auto *Int64Ty = Type::getInt64Ty(Ctx); auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); - auto *Int64PtrTy = Type::getInt64PtrTy(Ctx); - - Type *DataTypes[] = {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy}; + auto *Int16Ty = Type::getInt16Ty(Ctx); + auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last+1); + Type *DataTypes[] = { + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, + #include "llvm/ProfileData/InstrProfData.inc" + }; + auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes)); + + Constant *FunctionAddr = shouldRecordFunctionAddr(Fn) ?
+ ConstantExpr::getBitCast(Fn, Int8PtrTy) : + ConstantPointerNull::get(Int8PtrTy); + + Constant *Int16ArrayVals[IPVK_Last+1]; + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + Constant *DataVals[] = { - ConstantInt::get(Int32Ty, NameArrayTy->getArrayNumElements()), - ConstantInt::get(Int32Ty, NumCounters), - ConstantInt::get(Int64Ty, Inc->getHash()->getZExtValue()), - ConstantExpr::getBitCast(Name, Int8PtrTy), - ConstantExpr::getBitCast(Counters, Int64PtrTy)}; - auto *Data = new GlobalVariable(*M, DataTy, true, Name->getLinkage(), + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init, + #include "llvm/ProfileData/InstrProfData.inc" + }; + auto *Data = new GlobalVariable(*M, DataTy, false, NamePtr->getLinkage(), ConstantStruct::get(DataTy, DataVals), - getVarName(Inc, "data")); - Data->setVisibility(Name->getVisibility()); + getVarName(Inc, getInstrProfDataVarPrefix())); + Data->setVisibility(NamePtr->getVisibility()); Data->setSection(getDataSection()); - Data->setAlignment(8); - Data->setComdat(Fn->getComdat()); + Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT); + Data->setComdat(ProfileVarsComdat); + + PD.RegionCounters = CounterPtr; + PD.DataVar = Data; + ProfileDataMap[NamePtr] = PD; // Mark the data variable as used so that it isn't stripped out. UsedVars.push_back(Data); - return Counters; + return CounterPtr; } void InstrProfiling::emitRegistration() { @@ -253,20 +373,24 @@ void InstrProfiling::emitRegistration() { if (Triple(M->getTargetTriple()).isOSDarwin()) return; + // Use linker script magic to get data/cnts/name start/end. + if (Triple(M->getTargetTriple()).isOSLinux() || + Triple(M->getTargetTriple()).isOSFreeBSD()) + return; + // Construct the function. auto *VoidTy = Type::getVoidTy(M->getContext()); auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext()); auto *RegisterFTy = FunctionType::get(VoidTy, false); auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage, - "__llvm_profile_register_functions", M); + getInstrProfRegFuncsName(), M); RegisterF->setUnnamedAddr(true); - if (Options.NoRedZone) - RegisterF->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) RegisterF->addFnAttr(Attribute::NoRedZone); auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false); auto *RuntimeRegisterF = Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage, - "__llvm_profile_register_function", M); + getInstrProfRegFuncName(), M); IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF)); for (Value *Data : UsedVars) @@ -275,26 +399,27 @@ void InstrProfiling::emitRegistration() { } void InstrProfiling::emitRuntimeHook() { - const char *const RuntimeVarName = "__llvm_profile_runtime"; - const char *const RuntimeUserName = "__llvm_profile_runtime_user"; - // If the module's provided its own runtime, we don't need to do anything. - if (M->getGlobalVariable(RuntimeVarName)) + // We expect the linker to be invoked with -u<hook_var> flag for linux, + // for which case there is no need to emit the user function. + if (Triple(M->getTargetTriple()).isOSLinux()) return; + // If the module's provided its own runtime, we don't need to do anything. + if (M->getGlobalVariable(getInstrProfRuntimeHookVarName())) return; + // Declare an external variable that will pull in the runtime initialization. 
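// The emitted hook is morally equivalent to this C-level sketch (symbol names
// are the ones used in the code above; the body is illustrative):
//
//   extern int __llvm_profile_runtime;       // defined by the profile runtime
//   int __llvm_profile_runtime_user(void) {  // linkonce_odr, hidden, noinline
//     return __llvm_profile_runtime;         // forces the runtime to link in
//   }
//
// On Linux the driver is expected to pass -u<hook_var> to the linker instead,
// which is why the user function is not emitted there.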
auto *Int32Ty = Type::getInt32Ty(M->getContext()); auto *Var = new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, - nullptr, RuntimeVarName); + nullptr, getInstrProfRuntimeHookVarName()); // Make a function that uses it. - auto *User = - Function::Create(FunctionType::get(Int32Ty, false), - GlobalValue::LinkOnceODRLinkage, RuntimeUserName, M); + auto *User = Function::Create(FunctionType::get(Int32Ty, false), + GlobalValue::LinkOnceODRLinkage, + getInstrProfRuntimeHookVarUseFuncName(), M); User->addFnAttr(Attribute::NoInline); - if (Options.NoRedZone) - User->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) User->addFnAttr(Attribute::NoRedZone); User->setVisibility(GlobalValue::HiddenVisibility); IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User)); @@ -330,26 +455,23 @@ void InstrProfiling::emitUses() { LLVMUsed = new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage, ConstantArray::get(ATy, MergedVars), "llvm.used"); - LLVMUsed->setSection("llvm.metadata"); } void InstrProfiling::emitInitialization() { std::string InstrProfileOutput = Options.InstrProfileOutput; - Constant *RegisterF = M->getFunction("__llvm_profile_register_functions"); - if (!RegisterF && InstrProfileOutput.empty()) - return; + Constant *RegisterF = M->getFunction(getInstrProfRegFuncsName()); + if (!RegisterF && InstrProfileOutput.empty()) return; // Create the initialization function. auto *VoidTy = Type::getVoidTy(M->getContext()); - auto *F = - Function::Create(FunctionType::get(VoidTy, false), - GlobalValue::InternalLinkage, "__llvm_profile_init", M); + auto *F = Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::InternalLinkage, + getInstrProfInitFuncName(), M); F->setUnnamedAddr(true); F->addFnAttr(Attribute::NoInline); - if (Options.NoRedZone) - F->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) F->addFnAttr(Attribute::NoRedZone); // Add the basic block and the necessary calls. IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); @@ -358,9 +480,8 @@ void InstrProfiling::emitInitialization() { if (!InstrProfileOutput.empty()) { auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); auto *SetNameTy = FunctionType::get(VoidTy, Int8PtrTy, false); - auto *SetNameF = - Function::Create(SetNameTy, GlobalValue::ExternalLinkage, - "__llvm_profile_override_default_filename", M); + auto *SetNameF = Function::Create(SetNameTy, GlobalValue::ExternalLinkage, + getInstrProfFileOverriderFuncName(), M); // Create variable for profile name. Constant *ProfileNameConst = diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index 27505859100b..a05a5fa09f9a 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -12,12 +12,47 @@ // //===----------------------------------------------------------------------===// -#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm-c/Initialization.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" using namespace llvm; +/// Moves I before IP. Returns new insert point. +static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) { + // If I is IP, move the insert point down. + if (I == IP) + return ++IP; + // Otherwise, move I before IP and return IP. 
+  I->moveBefore(&*IP);
+  return IP;
+}
+
+/// Instrumentation passes often insert conditional checks into entry blocks.
+/// Call this function before splitting the entry block to move instructions
+/// that must remain in the entry block up before the split point. Static
+/// allocas and llvm.localescape calls, for example, must remain in the entry
+/// block.
+BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
+                                                    BasicBlock::iterator IP) {
+  assert(&BB.getParent()->getEntryBlock() == &BB);
+  for (auto I = IP, E = BB.end(); I != E; ++I) {
+    bool KeepInEntry = false;
+    if (auto *AI = dyn_cast<AllocaInst>(I)) {
+      if (AI->isStaticAlloca())
+        KeepInEntry = true;
+    } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->getIntrinsicID() == llvm::Intrinsic::localescape)
+        KeepInEntry = true;
+    }
+    if (KeepInEntry)
+      IP = moveBeforeInsertPoint(I, IP);
+  }
+  return IP;
+}
+
 /// initializeInstrumentation - Initialize all passes in the Instrumentation
 /// library.
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
@@ -25,6 +60,8 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingPass(Registry);
   initializeGCOVProfilerPass(Registry);
+  initializePGOInstrumentationGenPass(Registry);
+  initializePGOInstrumentationUsePass(Registry);
   initializeInstrProfilingPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 286a56330248..5a7bce5a5413 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -148,7 +148,7 @@ static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
        cl::desc("poison uninitialized stack variables with a call"),
        cl::Hidden, cl::init(false));
 static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
-       cl::desc("poison uninitialized stack variables with the given patter"),
+       cl::desc("poison uninitialized stack variables with the given pattern"),
        cl::Hidden, cl::init(0xff));
 static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
        cl::desc("poison undef temps"),
@@ -222,10 +222,17 @@ static const MemoryMapParams Linux_I386_MemoryMapParams = {
 // x86_64 Linux
 static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
+#ifdef MSAN_LINUX_X86_64_OLD_MAPPING
   0x400000000000,  // AndMask
   0,               // XorMask (not used)
   0,               // ShadowBase (not used)
   0x200000000000,  // OriginBase
+#else
+  0,               // AndMask (not used)
+  0x500000000000,  // XorMask
+  0,               // ShadowBase (not used)
+  0x100000000000,  // OriginBase
+#endif
 };

 // mips64 Linux
@@ -244,6 +251,14 @@ static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
   0x1C0000000000,  // OriginBase
 };

+// aarch64 Linux
+static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
+  0,               // AndMask (not used)
+  0x06000000000,   // XorMask
+  0,               // ShadowBase (not used)
+  0x01000000000,   // OriginBase
+};
+
 // i386 FreeBSD
 static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
   0x000180000000,  // AndMask
@@ -266,15 +281,20 @@ static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
 };

 static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
-  NULL,
+  nullptr,
   &Linux_MIPS64_MemoryMapParams,
 };

 static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
-  NULL,
+  nullptr,
   &Linux_PowerPC64_MemoryMapParams,
 };
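+// For example, with the new x86_64 mapping above (AndMask unused,
+// XorMask = 0x500000000000), the shadow offset computed by
+// getShadowPtrOffset() below reduces to a single XOR (illustrative address):
+//   shadow(0x7fff80001000) = 0x7fff80001000 ^ 0x500000000000
+//                          = 0x2fff80001000
+// The origin address is then roughly this offset plus OriginBase
+// (0x100000000000), subject to alignment.
+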
+static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = { + nullptr, + &Linux_AArch64_MemoryMapParams, +}; + static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = { &FreeBSD_I386_MemoryMapParams, &FreeBSD_X86_64_MemoryMapParams, @@ -353,8 +373,9 @@ class MemorySanitizer : public FunctionPass { friend struct MemorySanitizerVisitor; friend struct VarArgAMD64Helper; friend struct VarArgMIPS64Helper; + friend struct VarArgAArch64Helper; }; -} // namespace +} // anonymous namespace char MemorySanitizer::ID = 0; INITIALIZE_PASS(MemorySanitizer, "msan", @@ -377,7 +398,6 @@ static GlobalVariable *createPrivateNonConstGlobalForString(Module &M, GlobalValue::PrivateLinkage, StrConst, ""); } - /// \brief Insert extern declaration of runtime-provided functions and globals. void MemorySanitizer::initializeCallbacks(Module &M) { // Only do this once. @@ -496,6 +516,10 @@ bool MemorySanitizer::doInitialization(Module &M) { case Triple::ppc64le: MapParams = Linux_PowerPC_MemoryMapParams.bits64; break; + case Triple::aarch64: + case Triple::aarch64_be: + MapParams = Linux_ARM_MemoryMapParams.bits64; + break; default: report_fatal_error("unsupported architecture"); } @@ -697,7 +721,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Cmp = IRB.CreateICmpNE( ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = SplitBlockAndInsertIfThen( - Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); IRBuilder<> IRBNew(CheckTerm); paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), getOriginPtr(Addr, IRBNew, Alignment), StoreSize, @@ -893,16 +917,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Offset = (Addr & ~AndMask) ^ XorMask Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) { + Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy); + uint64_t AndMask = MS.MapParams->AndMask; - assert(AndMask != 0 && "AndMask shall be specified"); - Value *OffsetLong = - IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, ~AndMask)); + if (AndMask) + OffsetLong = + IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask)); uint64_t XorMask = MS.MapParams->XorMask; - if (XorMask != 0) - OffsetLong = IRB.CreateXor(OffsetLong, - ConstantInt::get(MS.IntptrTy, XorMask)); + if (XorMask) + OffsetLong = + IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask)); return OffsetLong; } @@ -1339,6 +1364,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } void visitBitCastInst(BitCastInst &I) { + // Special case: if this is the bitcast (there is exactly 1 allowed) between + // a musttail call and a ret, don't instrument. New instructions are not + // allowed after a musttail call. 
+ if (auto *CI = dyn_cast<CallInst>(I.getOperand(0))) + if (CI->isMustTailCall()) + return; IRBuilder<> IRB(&I); setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I))); setOrigin(&I, getOrigin(&I, 0)); @@ -1570,18 +1601,24 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Type *EltTy = Ty->getSequentialElementType(); SmallVector<Constant *, 16> Elements; for (unsigned Idx = 0; Idx < NumElements; ++Idx) { - ConstantInt *Elt = - dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx)); - APInt V = Elt->getValue(); - APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); - Elements.push_back(ConstantInt::get(EltTy, V2)); + if (ConstantInt *Elt = + dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) { + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + Elements.push_back(ConstantInt::get(EltTy, V2)); + } else { + Elements.push_back(ConstantInt::get(EltTy, 1)); + } } ShadowMul = ConstantVector::get(Elements); } else { - ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg); - APInt V = Elt->getValue(); - APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); - ShadowMul = ConstantInt::get(Elt->getType(), V2); + if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) { + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + ShadowMul = ConstantInt::get(Ty, V2); + } else { + ShadowMul = ConstantInt::get(Ty, 1); + } } IRBuilder<> IRB(&I); @@ -1730,25 +1767,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Instrument signed relational comparisons. /// - /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by - /// propagating the highest bit of the shadow. Everything else is delegated - /// to handleShadowOr(). + /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest + /// bit of the shadow. Everything else is delegated to handleShadowOr(). 
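+  ///
+  /// For example, the result of (x < 0) depends only on the sign bit of x,
+  /// so the result's shadow is taken to be "the sign bit of x's shadow is
+  /// set", computed below as (shadow(x) <s 0); the lower shadow bits cannot
+  /// affect the comparison and are ignored.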
void handleSignedRelationalComparison(ICmpInst &I) { - Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); - Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); - Value* op = nullptr; - CmpInst::Predicate pre = I.getPredicate(); - if (constOp0 && constOp0->isNullValue() && - (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) { - op = I.getOperand(1); - } else if (constOp1 && constOp1->isNullValue() && - (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) { + Constant *constOp; + Value *op = nullptr; + CmpInst::Predicate pre; + if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) { op = I.getOperand(0); + pre = I.getPredicate(); + } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) { + op = I.getOperand(1); + pre = I.getSwappedPredicate(); + } else { + handleShadowOr(I); + return; } - if (op) { + + if ((constOp->isNullValue() && + (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) || + (constOp->isAllOnesValue() && + (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) { IRBuilder<> IRB(&I); - Value* Shadow = - IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt"); + Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), + "_msprop_icmp_s"); setShadow(&I, Shadow); setOrigin(&I, getOrigin(op)); } else { @@ -1860,25 +1902,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { VAHelper->visitVACopyInst(I); } - enum IntrinsicKind { - IK_DoesNotAccessMemory, - IK_OnlyReadsMemory, - IK_WritesMemory - }; - - static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) { - const int DoesNotAccessMemory = IK_DoesNotAccessMemory; - const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory; - const int OnlyReadsMemory = IK_OnlyReadsMemory; - const int OnlyAccessesArgumentPointees = IK_WritesMemory; - const int UnknownModRefBehavior = IK_WritesMemory; -#define GET_INTRINSIC_MODREF_BEHAVIOR -#define ModRefBehavior IntrinsicKind -#include "llvm/IR/Intrinsics.gen" -#undef ModRefBehavior -#undef GET_INTRINSIC_MODREF_BEHAVIOR - } - /// \brief Handle vector store-like intrinsics. /// /// Instrument intrinsics that look like a simple SIMD store: writes memory, @@ -1978,17 +2001,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (NumArgOperands == 0) return false; - Intrinsic::ID iid = I.getIntrinsicID(); - IntrinsicKind IK = getIntrinsicKind(iid); - bool OnlyReadsMemory = IK == IK_OnlyReadsMemory; - bool WritesMemory = IK == IK_WritesMemory; - assert(!(OnlyReadsMemory && WritesMemory)); - if (NumArgOperands == 2 && I.getArgOperand(0)->getType()->isPointerTy() && I.getArgOperand(1)->getType()->isVectorTy() && I.getType()->isVoidTy() && - WritesMemory) { + !I.onlyReadsMemory()) { // This looks like a vector store. return handleVectorStoreIntrinsic(I); } @@ -1996,12 +2013,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (NumArgOperands == 1 && I.getArgOperand(0)->getType()->isPointerTy() && I.getType()->isVectorTy() && - OnlyReadsMemory) { + I.onlyReadsMemory()) { // This looks like a vector load. return handleVectorLoadIntrinsic(I); } - if (!OnlyReadsMemory && !WritesMemory) + if (I.doesNotAccessMemory()) if (maybeHandleSimpleNomemIntrinsic(I)) return true; @@ -2493,13 +2510,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Now, get the shadow for the RetVal. if (!I.getType()->isSized()) return; + // Don't emit the epilogue for musttail call returns. 
+ if (CS.isCall() && cast<CallInst>(&I)->isMustTailCall()) return; IRBuilder<> IRBBefore(&I); // Until we have full dynamic coverage, make sure the retval shadow is 0. Value *Base = getShadowPtrForRetval(&I, IRBBefore); IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); - Instruction *NextInsn = nullptr; + BasicBlock::iterator NextInsn; if (CS.isCall()) { - NextInsn = I.getNextNode(); + NextInsn = ++I.getIterator(); + assert(NextInsn != I.getParent()->end()); } else { BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest(); if (!NormalDest->getSinglePredecessor()) { @@ -2511,10 +2531,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return; } NextInsn = NormalDest->getFirstInsertionPt(); - assert(NextInsn && + assert(NextInsn != NormalDest->end() && "Could not find insertion point for retval shadow load"); } - IRBuilder<> IRBAfter(NextInsn); + IRBuilder<> IRBAfter(&*NextInsn); Value *RetvalShadow = IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter), kShadowTLSAlignment, "_msret"); @@ -2523,10 +2543,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter))); } + bool isAMustTailRetVal(Value *RetVal) { + if (auto *I = dyn_cast<BitCastInst>(RetVal)) { + RetVal = I->getOperand(0); + } + if (auto *I = dyn_cast<CallInst>(RetVal)) { + return I->isMustTailCall(); + } + return false; + } + void visitReturnInst(ReturnInst &I) { IRBuilder<> IRB(&I); Value *RetVal = I.getReturnValue(); if (!RetVal) return; + // Don't emit the epilogue for musttail call returns. + if (isAMustTailRetVal(RetVal)) return; Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); if (CheckReturnValue) { insertShadowCheck(RetVal, &I); @@ -2653,6 +2685,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, getCleanOrigin()); } + void visitCatchSwitchInst(CatchSwitchInst &I) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + + void visitFuncletPadInst(FuncletPadInst &I) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + void visitGetElementPtrInst(GetElementPtrInst &I) { handleShadowOr(I); } @@ -2696,6 +2738,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Nothing to do here. } + void visitCleanupReturnInst(CleanupReturnInst &CRI) { + DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n"); + // Nothing to do here. + } + + void visitCatchReturnInst(CatchReturnInst &CRI) { + DEBUG(dbgs() << "CatchReturn: " << CRI << "\n"); + // Nothing to do here. + } + void visitInstruction(Instruction &I) { // Everything else: stop propagating and check for poisoned shadow. if (ClDumpStrictInstructions) @@ -2808,6 +2860,8 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVAStartInst(VAStartInst &I) override { + if (F.getCallingConv() == CallingConv::X86_64_Win64) + return; IRBuilder<> IRB(&I); VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); @@ -2820,6 +2874,8 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVACopyInst(VACopyInst &I) override { + if (F.getCallingConv() == CallingConv::X86_64_Win64) + return; IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); @@ -2979,6 +3035,242 @@ struct VarArgMIPS64Helper : public VarArgHelper { } }; + +/// \brief AArch64-specific implementation of VarArgHelper. 
+struct VarArgAArch64Helper : public VarArgHelper {
+  static const unsigned kAArch64GrArgSize = 56;
+  static const unsigned kAArch64VrArgSize = 128;
+
+  static const unsigned AArch64GrBegOffset = 0;
+  static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
+  // Make VR space aligned to 16 bytes.
+  static const unsigned AArch64VrBegOffset = AArch64GrEndOffset + 8;
+  static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
+                                             + kAArch64VrArgSize;
+  static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
+
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy;
+  Value *VAArgOverflowSize;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
+                      MemorySanitizerVisitor &MSV)
+    : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+      VAArgOverflowSize(nullptr) {}
+
+  enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+  ArgKind classifyArgument(Value* arg) {
+    Type *T = arg->getType();
+    if (T->isFPOrFPVectorTy())
+      return AK_FloatingPoint;
+    if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+        || (T->isPointerTy()))
+      return AK_GeneralPurpose;
+    return AK_Memory;
+  }
+
+  // The instrumentation stores the argument shadow in a non-ABI-specific
+  // format because it does not know which arguments are named (Clang, as in
+  // the x86_64 case, lowers va_args in the frontend, and this pass only sees
+  // the low-level code that deals with va_list internals).
+  // The first seven GR registers are saved in the first 56 bytes of the
+  // va_arg TLS array, followed by the first 8 FP/SIMD registers, and then
+  // the remaining arguments.
+  // Using constant offsets within the va_arg TLS array allows fast copying
+  // in the finalization instrumentation.
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+    unsigned GrOffset = AArch64GrBegOffset;
+    unsigned VrOffset = AArch64VrBegOffset;
+    unsigned OverflowOffset = AArch64VAEndOffset;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      ArgKind AK = classifyArgument(A);
+      if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
+        AK = AK_Memory;
+      if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset)
+        AK = AK_Memory;
+      Value *Base;
+      switch (AK) {
+        case AK_GeneralPurpose:
+          Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset);
+          GrOffset += 8;
+          break;
+        case AK_FloatingPoint:
+          Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset);
+          VrOffset += 16;
+          break;
+        case AK_Memory:
+          uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+          Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset);
+          OverflowOffset += RoundUpToAlignment(ArgSize, 8);
+          break;
+      }
+      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+    }
+    Constant *OverflowSize =
+      ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
+    IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// Compute the shadow address for a given va_arg.
+  Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+                                   int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) override {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants (size of va_list).
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */32, /* alignment */8, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) override {
+    IRBuilder<> IRB(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants (size of va_list).
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */32, /* alignment */8, false);
+  }
+
+  // Retrieve a va_list field of 'void*' size.
+  Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+    Value *SaveAreaPtrPtr =
+      IRB.CreateIntToPtr(
+        IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                      ConstantInt::get(MS.IntptrTy, offset)),
+        Type::getInt64PtrTy(*MS.C));
+    return IRB.CreateLoad(SaveAreaPtrPtr);
+  }
+
+  // Retrieve a va_list field of 'int' size.
+  Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+    Value *SaveAreaPtr =
+      IRB.CreateIntToPtr(
+        IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                      ConstantInt::get(MS.IntptrTy, offset)),
+        Type::getInt32PtrTy(*MS.C));
+    Value *SaveArea32 = IRB.CreateLoad(SaveAreaPtr);
+    return IRB.CreateSExt(SaveArea32, MS.IntptrTy);
+  }
+
+  void finalizeInstrumentation() override {
+    assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+      VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+      Value *CopySize =
+        IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
+                      VAArgOverflowSize);
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+    }
+
+    Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
+    Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize);
+
+    // Instrument va_start, copy va_list shadow from the backup copy of
+    // the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+
+      Value *VAListTag = OrigInst->getArgOperand(0);
+
+      // The variadic ABI for AArch64 creates two areas to save the incoming
+      // argument registers (one for the 64-bit general-purpose registers
+      // x0-x7 and another for the 128-bit FP/SIMD registers v0-v7).
+      // We then need to propagate the shadow arguments to both regions
+      // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
+      // The remaining arguments are saved on shadow for 'va::stack'.
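+      //
+      // For reference, the AAPCS64 va_list these fixed offsets refer to is
+      // laid out as follows (sketch):
+      //   struct va_list {
+      //     void *__stack;   // offset 0
+      //     void *__gr_top;  // offset 8
+      //     void *__vr_top;  // offset 16
+      //     int   __gr_offs; // offset 24
+      //     int   __vr_offs; // offset 28
+      //   };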
+      // One caveat is that only the unnamed arguments should be propagated,
+      // but the call site instrumentation saves 'all' the arguments. So when
+      // copying the shadow values from the va_arg TLS array we need to
+      // adjust the offsets for both the GR and VR fields based on the
+      // __{gr,vr}_offs values (which account for the incoming named
+      // arguments).

+      // Read the stack pointer from the va_list.
+      Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0);
+
+      // Read both the __gr_top and __gr_offs and add them up.
+      Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8);
+      Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24);
+
+      Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea);
+
+      // Read both the __vr_top and __vr_offs and add them up.
+      Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16);
+      Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28);
+
+      Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea);
+
+      // We do not know how many named arguments were used; at the call site
+      // all the arguments were saved. Since __gr_offs is defined as
+      // '0 - ((8 - named_gr) * 8)', the idea is to propagate only the
+      // variadic arguments by skipping the shadow bytes that belong to the
+      // named arguments.
+      Value *GrRegSaveAreaShadowPtrOff =
+        IRB.CreateAdd(GrArgSize, GrOffSaveArea);
+
+      Value *GrRegSaveAreaShadowPtr =
+        MSV.getShadowPtr(GrRegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+
+      Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+                                              GrRegSaveAreaShadowPtrOff);
+      Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
+
+      IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, GrSrcPtr, GrCopySize, 8);
+
+      // Again, but for FP/SIMD values.
+      Value *VrRegSaveAreaShadowPtrOff =
+        IRB.CreateAdd(VrArgSize, VrOffSaveArea);
+
+      Value *VrRegSaveAreaShadowPtr =
+        MSV.getShadowPtr(VrRegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+
+      Value *VrSrcPtr = IRB.CreateInBoundsGEP(
+        IRB.getInt8Ty(),
+        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+                              IRB.getInt32(AArch64VrBegOffset)),
+        VrRegSaveAreaShadowPtrOff);
+      Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
+
+      IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, VrSrcPtr, VrCopySize, 8);
+
+      // And finally for the remaining arguments.
+      Value *StackSaveAreaShadowPtr =
+        MSV.getShadowPtr(StackSaveAreaPtr, IRB.getInt8Ty(), IRB);
+
+      Value *StackSrcPtr =
+        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+                              IRB.getInt32(AArch64VAEndOffset));
+
+      IRB.CreateMemCpy(StackSaveAreaShadowPtr, StackSrcPtr,
+                       VAArgOverflowSize, 16);
+    }
+  }
+};
+
 /// \brief A no-op implementation of VarArgHelper.
 struct VarArgNoOpHelper : public VarArgHelper {
   VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
@@ -3003,11 +3295,13 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
   else if (TargetTriple.getArch() == llvm::Triple::mips64 ||
            TargetTriple.getArch() == llvm::Triple::mips64el)
     return new VarArgMIPS64Helper(Func, Msan, Visitor);
+  else if (TargetTriple.getArch() == llvm::Triple::aarch64)
+    return new VarArgAArch64Helper(Func, Msan, Visitor);
   else
     return new VarArgNoOpHelper(Func, Msan, Visitor);
 }

-} // namespace
+} // anonymous namespace

 bool MemorySanitizer::runOnFunction(Function &F) {
   if (&F == MsanCtorFunction)
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
new file mode 100644
index 000000000000..4b59b93b325f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -0,0 +1,718 @@
+//===-- PGOInstrumentation.cpp - MST-based PGO Instrumentation ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements PGO instrumentation using a minimum spanning tree based
+// on the following paper:
+//   [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points
+//   for program frequency counts. BIT Numerical Mathematics 1973, Volume 13,
+//   Issue 3, pp 313-322
+// The idea of the algorithm is based on the fact that for each node (except
+// for the entry and exit), the sum of incoming edge counts equals the sum of
+// outgoing edge counts. The count of an edge on the spanning tree can be
+// derived from the counts of edges not on the spanning tree. Knuth proves
+// this method instruments the minimum number of edges.
+//
+// The minimal spanning tree here is actually a maximum weight tree -- on-tree
+// edges have higher frequencies (more likely to execute). The idea is to
+// instrument those less frequently executed edges to reduce the runtime
+// overhead of instrumented binaries.
+//
+// This file contains two passes:
+// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge
+// count profile, and
+// (2) Pass PGOInstrumentationUse which reads the edge count profile and
+// annotates the branch weights.
+// To get precise counter information, these two passes need to be invoked at
+// the same compilation point (so they see the same IR). For pass
+// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
+// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc and
+// the profile is opened at the module level and passed to each PGOUseFunc
+// instance.
+// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put
+// in class FuncPGOInstrumentation.
+//
+// Class PGOEdge represents a CFG edge and some auxiliary information. Class
+// BBInfo contains auxiliary information for each BB. These two classes are
+// used in pass PGOInstrumentationGen. Classes PGOUseEdge and UseBBInfo are
+// the derived classes of PGOEdge and BBInfo, respectively. They contain extra
+// data structures used in populating profile counters.
+// The MST implementation is in Class CFGMST (CFGMST.h).
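+//
+// For illustration, take a diamond CFG (entry -> then/else -> exit) with a
+// fake exit->entry edge closing the flow: 5 edges over 4 nodes. A spanning
+// tree covers 3 of the edges, so only the 2 off-tree edges (say, the two
+// branch edges) receive counters; every other count follows from the
+// identity that incoming counts equal outgoing counts at each node.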
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "CFGMST.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/JamCRC.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-instrumentation"
+
+STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
+STATISTIC(NumOfPGOEdge, "Number of edges.");
+STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
+STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
+STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
+STATISTIC(NumOfPGOMismatch, "Number of functions having mismatched profiles.");
+STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
+
+// Command line option to specify the file to read profile from. This is
+// mainly used for testing.
+static cl::opt<std::string>
+    PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
+                       cl::value_desc("filename"),
+                       cl::desc("Specify the path of profile data file. This "
+                                "is mainly for test purposes."));
+
+namespace {
+class PGOInstrumentationGen : public ModulePass {
+public:
+  static char ID;
+
+  PGOInstrumentationGen() : ModulePass(ID) {
+    initializePGOInstrumentationGenPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override {
+    return "PGOInstrumentationGenPass";
+  }
+
+private:
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+  }
+};
+
+class PGOInstrumentationUse : public ModulePass {
+public:
+  static char ID;
+
+  // Provide the profile filename as the parameter.
+  PGOInstrumentationUse(std::string Filename = "")
+      : ModulePass(ID), ProfileFileName(Filename) {
+    if (!PGOTestProfileFile.empty())
+      ProfileFileName = PGOTestProfileFile;
+    initializePGOInstrumentationUsePass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override {
+    return "PGOInstrumentationUsePass";
+  }
+
+private:
+  std::string ProfileFileName;
+  std::unique_ptr<IndexedInstrProfReader> PGOReader;
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+  }
+};
+} // end anonymous namespace
+
+char PGOInstrumentationGen::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOInstrumentationGen, "pgo-instr-gen",
+                      "PGO instrumentation.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationGen, "pgo-instr-gen",
+                    "PGO instrumentation.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationGenPass() {
+  return new PGOInstrumentationGen();
+}
+
+char PGOInstrumentationUse::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOInstrumentationUse, "pgo-instr-use",
+                      "Read PGO instrumentation profile.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationUse, "pgo-instr-use",
+                    "Read PGO instrumentation profile.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationUsePass(StringRef Filename) {
+  return new PGOInstrumentationUse(Filename.str());
+}
+
+namespace {
+/// \brief An MST based instrumentation for PGO
+///
+/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
+/// at the function level.
+struct PGOEdge {
+  // This class implements the CFG edges. Note the CFG can be a multi-graph.
+  // So there might be multiple edges with the same SrcBB and DestBB.
+  const BasicBlock *SrcBB;
+  const BasicBlock *DestBB;
+  uint64_t Weight;
+  bool InMST;
+  bool Removed;
+  bool IsCritical;
+  PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1)
+      : SrcBB(Src), DestBB(Dest), Weight(W), InMST(false), Removed(false),
+        IsCritical(false) {}
+  // Return the information string of an edge.
+  const std::string infoString() const {
+    return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") +
+            (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str();
+  }
+};
+
+// This class stores the auxiliary information for each BB.
+struct BBInfo {
+  BBInfo *Group;
+  uint32_t Index;
+  uint32_t Rank;
+
+  BBInfo(unsigned IX) : Group(this), Index(IX), Rank(0) {}
+
+  // Return the information string of this object.
+  const std::string infoString() const {
+    return (Twine("Index=") + Twine(Index)).str();
+  }
+};
+
+// This class implements the code shared by the PGO instrumentation and
+// profile-use passes.
+template <class Edge, class BBInfo> class FuncPGOInstrumentation {
+private:
+  Function &F;
+  void computeCFGHash();
+
+public:
+  std::string FuncName;
+  GlobalVariable *FuncNameVar;
+  // CFG hash value for this function.
+  uint64_t FunctionHash;
+
+  // The Minimum Spanning Tree of the function CFG.
+  CFGMST<Edge, BBInfo> MST;
+
+  // Given an edge, find the BB that will be instrumented.
+  // Return nullptr if there is no BB to be instrumented.
+  BasicBlock *getInstrBB(Edge *E);
+
+  // Return the auxiliary BB information.
+  BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
+
+  // Dump edges and BB information.
+  void dumpInfo(std::string Str = "") const {
+    MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
+                              Twine(FunctionHash) + "\t" + Str);
+  }
+
+  FuncPGOInstrumentation(Function &Func, bool CreateGlobalVar = false,
+                         BranchProbabilityInfo *BPI = nullptr,
+                         BlockFrequencyInfo *BFI = nullptr)
+      : F(Func), FunctionHash(0), MST(F, BPI, BFI) {
+    FuncName = getPGOFuncName(F);
+    computeCFGHash();
+    DEBUG(dumpInfo("after CFGMST"));
+
+    NumOfPGOBB += MST.BBInfos.size();
+    for (auto &E : MST.AllEdges) {
+      if (E->Removed)
+        continue;
+      NumOfPGOEdge++;
+      if (!E->InMST)
+        NumOfPGOInstrument++;
+    }
+
+    if (CreateGlobalVar)
+      FuncNameVar = createPGOFuncNameVar(F, FuncName);
+  }
+};
+
+// Compute Hash value for the CFG: the lower 32 bits are the CRC32 of the
+// index values of each BB's successors. The higher 32 bits record the number
+// of edges.
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
+  std::vector<char> Indexes;
+  JamCRC JC;
+  for (auto &BB : F) {
+    const TerminatorInst *TI = BB.getTerminator();
+    for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
+      BasicBlock *Succ = TI->getSuccessor(I);
+      uint32_t Index = getBBInfo(Succ).Index;
+      for (int J = 0; J < 4; J++)
+        Indexes.push_back((char)(Index >> (J * 8)));
+    }
+  }
+  JC.update(Indexes);
+  FunctionHash = (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
+}
+
+// Given a CFG edge E to be instrumented, find which BB to place the
+// instrumentation code in. The function will split the critical edge if
+// necessary.
+template <class Edge, class BBInfo>
+BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
+  if (E->InMST || E->Removed)
+    return nullptr;
+
+  BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
+  BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
+  // For a fake edge, instrument the real BB.
+  if (SrcBB == nullptr)
+    return DestBB;
+  if (DestBB == nullptr)
+    return SrcBB;
+
+  // Instrument the SrcBB if it has a single successor,
+  // otherwise, the DestBB if this is not a critical edge.
+  TerminatorInst *TI = SrcBB->getTerminator();
+  if (TI->getNumSuccessors() <= 1)
+    return SrcBB;
+  if (!E->IsCritical)
+    return DestBB;
+
+  // For a critical edge, we have to split. Instrument the newly
+  // created BB.
+  NumOfPGOSplit++;
+  DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> "
+               << getBBInfo(DestBB).Index << "\n");
+  unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
+  BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum);
+  assert(InstrBB && "Critical edge is not split");
+
+  E->Removed = true;
+  return InstrBB;
+}
+
+// Visit all edges and instrument the edges not in the MST.
+// Critical edges will be split.
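+// For each instrumented edge, the loop below emits a call to the
+// llvm.instrprof.increment intrinsic, along the lines of (illustrative):
+//   call void @llvm.instrprof.increment(i8* <FuncNameVar>, i64 <FunctionHash>,
+//                                       i32 <NumCounters>, i32 <Index>)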
+static void instrumentOneFunc(Function &F, Module *M, + BranchProbabilityInfo *BPI, + BlockFrequencyInfo *BFI) { + unsigned NumCounters = 0; + FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, true, BPI, BFI); + for (auto &E : FuncInfo.MST.AllEdges) { + if (!E->InMST && !E->Removed) + NumCounters++; + } + + uint32_t I = 0; + for (auto &E : FuncInfo.MST.AllEdges) { + BasicBlock *InstrBB = FuncInfo.getInstrBB(E.get()); + if (!InstrBB) + continue; + + IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt()); + assert(Builder.GetInsertPoint() != InstrBB->end() && + "Cannot get the Instrumentation point"); + Type *I8PtrTy = Type::getInt8PtrTy(M->getContext()); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment), + {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); + } +} + +// This class represents a CFG edge in profile use compilation. +struct PGOUseEdge : public PGOEdge { + bool CountValid; + uint64_t CountValue; + PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : PGOEdge(Src, Dest, W), CountValid(false), CountValue(0) {} + + // Set edge count value + void setEdgeCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string for this object. + const std::string infoString() const { + if (!CountValid) + return PGOEdge::infoString(); + return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +typedef SmallVector<PGOUseEdge *, 2> DirectEdges; + +// This class stores the auxiliary information for each BB. +struct UseBBInfo : public BBInfo { + uint64_t CountValue; + bool CountValid; + int32_t UnknownCountInEdge; + int32_t UnknownCountOutEdge; + DirectEdges InEdges; + DirectEdges OutEdges; + UseBBInfo(unsigned IX) + : BBInfo(IX), CountValue(0), CountValid(false), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + UseBBInfo(unsigned IX, uint64_t C) + : BBInfo(IX), CountValue(C), CountValid(true), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + + // Set the profile count value for this BB. + void setBBInfoCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string of this object. + const std::string infoString() const { + if (!CountValid) + return BBInfo::infoString(); + return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +// Sum up the count values for all the edges. +static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) { + uint64_t Total = 0; + for (auto &E : Edges) { + if (E->Removed) + continue; + Total += E->CountValue; + } + return Total; +} + +class PGOUseFunc { +private: + Function &F; + Module *M; + // This member stores the shared information with class PGOGenFunc. + FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo; + + // Return the auxiliary BB information. + UseBBInfo &getBBInfo(const BasicBlock *BB) const { + return FuncInfo.getBBInfo(BB); + } + + // The maximum count value in the profile. This is only used in PGO use + // compilation. + uint64_t ProgramMaxCount; + + // Find the Instrumented BB and set the value. + void setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile); + + // Set the edge counter value for the unknown edge -- there should be only + // one unknown edge. 
+  void setEdgeCount(DirectEdges &Edges, uint64_t Value);
+
+  // Return the FuncName string.
+  const std::string getFuncName() const { return FuncInfo.FuncName; }
+
+  // Set the hot/cold inline hints based on the count values.
+  // FIXME: This function should be removed once the functionality in
+  // the inliner is implemented.
+  void applyFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
+    if (ProgramMaxCount == 0)
+      return;
+    // Threshold of the hot functions.
+    const BranchProbability HotFunctionThreshold(1, 100);
+    // Threshold of the cold functions.
+    const BranchProbability ColdFunctionThreshold(2, 10000);
+    if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount))
+      F.addFnAttr(llvm::Attribute::InlineHint);
+    else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount))
+      F.addFnAttr(llvm::Attribute::Cold);
+  }
+
+public:
+  PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr,
+             BlockFrequencyInfo *BFI = nullptr)
+      : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI) {}
+
+  // Read counts for the instrumented BBs from the profile.
+  bool readCounters(IndexedInstrProfReader *PGOReader);
+
+  // Populate the counts for all BBs.
+  void populateCounters();
+
+  // Set the branch weights based on the count values.
+  void setBranchWeights();
+};
+
+// Visit all the edges and assign the count values for the instrumented edges
+// and the BBs.
+void PGOUseFunc::setInstrumentedCounts(
+    const std::vector<uint64_t> &CountFromProfile) {
+
+  // Use a worklist as we will update the vector during the iteration.
+  std::vector<PGOUseEdge *> WorkList;
+  for (auto &E : FuncInfo.MST.AllEdges)
+    WorkList.push_back(E.get());
+
+  uint32_t I = 0;
+  for (auto &E : WorkList) {
+    BasicBlock *InstrBB = FuncInfo.getInstrBB(E);
+    if (!InstrBB)
+      continue;
+    uint64_t CountValue = CountFromProfile[I++];
+    if (!E->Removed) {
+      getBBInfo(InstrBB).setBBInfoCount(CountValue);
+      E->setEdgeCount(CountValue);
+      continue;
+    }
+
+    // Need to add two new edges.
+    BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
+    BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
+    // Add new edge of SrcBB->InstrBB.
+    PGOUseEdge &NewEdge = FuncInfo.MST.addEdge(SrcBB, InstrBB, 0);
+    NewEdge.setEdgeCount(CountValue);
+    // Add new edge of InstrBB->DestBB.
+    PGOUseEdge &NewEdge1 = FuncInfo.MST.addEdge(InstrBB, DestBB, 0);
+    NewEdge1.setEdgeCount(CountValue);
+    NewEdge1.InMST = true;
+    getBBInfo(InstrBB).setBBInfoCount(CountValue);
+  }
+}
+
+// Set the count value for the unknown edge. There should be one and only one
+// unknown edge in the Edges vector.
+void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
+  for (auto &E : Edges) {
+    if (E->CountValid)
+      continue;
+    E->setEdgeCount(Value);
+
+    getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+    getBBInfo(E->DestBB).UnknownCountInEdge--;
+    return;
+  }
+  llvm_unreachable("Cannot find the unknown count edge");
+}
+
+// Read the profile from ProfileFileName and assign the values to the
+// instrumented BBs and edges. This function also updates ProgramMaxCount.
+// Return true if the profile is successfully read, and false on errors.
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
+  auto &Ctx = M->getContext();
+  ErrorOr<InstrProfRecord> Result =
+      PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
+  if (std::error_code EC = Result.getError()) {
+    if (EC == instrprof_error::unknown_function)
+      NumOfPGOMissing++;
+    else if (EC == instrprof_error::hash_mismatch ||
+             EC == llvm::instrprof_error::malformed)
+      NumOfPGOMismatch++;
+
+    std::string Msg = EC.message() + std::string(" ") + F.getName().str();
+    Ctx.diagnose(
+        DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+    return false;
+  }
+  std::vector<uint64_t> &CountFromProfile = Result.get().Counts;
+
+  NumOfPGOFunc++;
+  DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
+  uint64_t ValueSum = 0;
+  for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
+    DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
+    ValueSum += CountFromProfile[I];
+  }
+
+  DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
+
+  getBBInfo(nullptr).UnknownCountOutEdge = 2;
+  getBBInfo(nullptr).UnknownCountInEdge = 2;
+
+  setInstrumentedCounts(CountFromProfile);
+  ProgramMaxCount = PGOReader->getMaximumFunctionCount();
+  return true;
+}
+
+// Populate the counters from instrumented BBs to all BBs.
+// At the end of this operation, all BBs should have a valid count value.
+void PGOUseFunc::populateCounters() {
+  // First, set up the Count variable for all BBs.
+  for (auto &E : FuncInfo.MST.AllEdges) {
+    if (E->Removed)
+      continue;
+
+    const BasicBlock *SrcBB = E->SrcBB;
+    const BasicBlock *DestBB = E->DestBB;
+    UseBBInfo &SrcInfo = getBBInfo(SrcBB);
+    UseBBInfo &DestInfo = getBBInfo(DestBB);
+    SrcInfo.OutEdges.push_back(E.get());
+    DestInfo.InEdges.push_back(E.get());
+    SrcInfo.UnknownCountOutEdge++;
+    DestInfo.UnknownCountInEdge++;
+
+    if (!E->CountValid)
+      continue;
+    DestInfo.UnknownCountInEdge--;
+    SrcInfo.UnknownCountOutEdge--;
+  }
+
+  bool Changes = true;
+  unsigned NumPasses = 0;
+  while (Changes) {
+    NumPasses++;
+    Changes = false;
+
+    // For efficient traversal, it's better to start from the end as most
+    // of the instrumented edges are at the end.
+    for (auto &BB : reverse(F)) {
+      UseBBInfo &Count = getBBInfo(&BB);
+      if (!Count.CountValid) {
+        if (Count.UnknownCountOutEdge == 0) {
+          Count.CountValue = sumEdgeCount(Count.OutEdges);
+          Count.CountValid = true;
+          Changes = true;
+        } else if (Count.UnknownCountInEdge == 0) {
+          Count.CountValue = sumEdgeCount(Count.InEdges);
+          Count.CountValid = true;
+          Changes = true;
+        }
+      }
+      if (Count.CountValid) {
+        if (Count.UnknownCountOutEdge == 1) {
+          uint64_t Total = Count.CountValue - sumEdgeCount(Count.OutEdges);
+          setEdgeCount(Count.OutEdges, Total);
+          Changes = true;
+        }
+        if (Count.UnknownCountInEdge == 1) {
+          uint64_t Total = Count.CountValue - sumEdgeCount(Count.InEdges);
+          setEdgeCount(Count.InEdges, Total);
+          Changes = true;
+        }
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "Populated counts in " << NumPasses << " passes.\n");
+  // Assert every BB has a valid counter.
+  uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
+  uint64_t FuncMaxCount = FuncEntryCount;
+  for (auto &BB : F) {
+    assert(getBBInfo(&BB).CountValid && "BB count is not valid");
+    uint64_t Count = getBBInfo(&BB).CountValue;
+    if (Count > FuncMaxCount)
+      FuncMaxCount = Count;
+  }
+  applyFunctionAttributes(FuncEntryCount, FuncMaxCount);
+
+  DEBUG(FuncInfo.dumpInfo("after reading profile."));
+}
+
+// Assign the scaled count values to the BBs with multiple out edges.
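+// Edge counts are 64-bit, while MD_prof branch weights are 32-bit, so the
+// counts are scaled down first. Conceptually (see calculateCountScale and
+// scaleBranchCount): Scale = ceil(MaxCount / UINT32_MAX) and
+// Weight = Count / Scale.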
+void PGOUseFunc::setBranchWeights() { + // Generate MD_prof metadata for every branch instruction. + DEBUG(dbgs() << "\nSetting branch weights.\n"); + MDBuilder MDB(M->getContext()); + for (auto &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (TI->getNumSuccessors() < 2) + continue; + if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) + continue; + if (getBBInfo(&BB).CountValue == 0) + continue; + + // We have a non-zero Branch BB. + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + unsigned Size = BBCountInfo.OutEdges.size(); + SmallVector<unsigned, 2> EdgeCounts(Size, 0); + uint64_t MaxCount = 0; + for (unsigned s = 0; s < Size; s++) { + const PGOUseEdge *E = BBCountInfo.OutEdges[s]; + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + if (DestBB == 0) + continue; + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + uint64_t EdgeCount = E->CountValue; + if (EdgeCount > MaxCount) + MaxCount = EdgeCount; + EdgeCounts[SuccNum] = EdgeCount; + } + assert(MaxCount > 0 && "Bad max count"); + uint64_t Scale = calculateCountScale(MaxCount); + SmallVector<unsigned, 4> Weights; + for (const auto &ECI : EdgeCounts) + Weights.push_back(scaleBranchCount(ECI, Scale)); + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + DEBUG(dbgs() << "Weight is: "; + for (const auto &W : Weights) { dbgs() << W << " "; } + dbgs() << "\n";); + } +} +} // end anonymous namespace + +bool PGOInstrumentationGen::runOnModule(Module &M) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); + instrumentOneFunc(F, &M, BPI, BFI); + } + return true; +} + +static void setPGOCountOnFunc(PGOUseFunc &Func, + IndexedInstrProfReader *PGOReader) { + if (Func.readCounters(PGOReader)) { + Func.populateCounters(); + Func.setBranchWeights(); + } +} + +bool PGOInstrumentationUse::runOnModule(Module &M) { + DEBUG(dbgs() << "Read in profile counters: "); + auto &Ctx = M.getContext(); + // Read the counter array from file. 
+ auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName); + if (std::error_code EC = ReaderOrErr.getError()) { + Ctx.diagnose( + DiagnosticInfoPGOProfile(ProfileFileName.data(), EC.message())); + return false; + } + + PGOReader = std::move(ReaderOrErr.get()); + if (!PGOReader) { + Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(), + "Cannot get PGOReader")); + return false; + } + + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); + PGOUseFunc Func(F, &M, BPI, BFI); + setPGOCountOnFunc(Func, PGOReader.get()); + } + return true; +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp index 6b185a2b127b..abed465f102d 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp @@ -18,8 +18,9 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -37,6 +38,8 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_os_ostream.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -44,6 +47,17 @@ using namespace llvm; #define DEBUG_TYPE "safestack" +enum UnsafeStackPtrStorageVal { ThreadLocalUSP, SingleThreadUSP }; + +static cl::opt<UnsafeStackPtrStorageVal> USPStorage("safe-stack-usp-storage", + cl::Hidden, cl::init(ThreadLocalUSP), + cl::desc("Type of storage for the unsafe stack pointer"), + cl::values(clEnumValN(ThreadLocalUSP, "thread-local", + "Thread-local storage"), + clEnumValN(SingleThreadUSP, "single-thread", + "Non-thread-local storage"), + clEnumValEnd)); + namespace llvm { STATISTIC(NumFunctions, "Total number of functions"); @@ -54,118 +68,48 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions, STATISTIC(NumAllocas, "Total number of allocas"); STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas"); STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments"); STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); } // namespace llvm namespace { -/// Check whether a given alloca instruction (AI) should be put on the safe -/// stack or not. The function analyzes all uses of AI and checks whether it is -/// only accessed in a memory safe way (as decided statically). -bool IsSafeStackAlloca(const AllocaInst *AI) { - // Go through all uses of this alloca and check whether all accesses to the - // allocated object are statically known to be memory safe and, hence, the - // object can be placed on the safe stack. - - SmallPtrSet<const Value *, 16> Visited; - SmallVector<const Instruction *, 8> WorkList; - WorkList.push_back(AI); +/// Rewrite an SCEV expression for a memory access address to an expression that +/// represents offset from the given alloca. 
+/// +/// The implementation simply replaces all mentions of the alloca with zero. +class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> { + const Value *AllocaPtr; - // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. - while (!WorkList.empty()) { - const Instruction *V = WorkList.pop_back_val(); - for (const Use &UI : V->uses()) { - auto I = cast<const Instruction>(UI.getUser()); - assert(V == UI.get()); - - switch (I->getOpcode()) { - case Instruction::Load: - // Loading from a pointer is safe. - break; - case Instruction::VAArg: - // "va-arg" from a pointer is safe. - break; - case Instruction::Store: - if (V == I->getOperand(0)) - // Stored the pointer - conservatively assume it may be unsafe. - return false; - // Storing to the pointee is safe. - break; - - case Instruction::GetElementPtr: - if (!cast<const GetElementPtrInst>(I)->hasAllConstantIndices()) - // GEP with non-constant indices can lead to memory errors. - // This also applies to inbounds GEPs, as the inbounds attribute - // represents an assumption that the address is in bounds, rather than - // an assertion that it is. - return false; - - // We assume that GEP on static alloca with constant indices is safe, - // otherwise a compiler would detect it and warn during compilation. - - if (!isa<const ConstantInt>(AI->getArraySize())) - // However, if the array size itself is not constant, the access - // might still be unsafe at runtime. - return false; - - /* fallthrough */ - - case Instruction::BitCast: - case Instruction::IntToPtr: - case Instruction::PHI: - case Instruction::PtrToInt: - case Instruction::Select: - // The object can be safe or not, depending on how the result of the - // instruction is used. - if (Visited.insert(I).second) - WorkList.push_back(cast<const Instruction>(I)); - break; - - case Instruction::Call: - case Instruction::Invoke: { - // FIXME: add support for memset and memcpy intrinsics. - ImmutableCallSite CS(I); - - // LLVM 'nocapture' attribute is only set for arguments whose address - // is not stored, passed around, or used in any other non-trivial way. - // We assume that passing a pointer to an object as a 'nocapture' - // argument is safe. - // FIXME: a more precise solution would require an interprocedural - // analysis here, which would look at all uses of an argument inside - // the function being called. - ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) - if (A->get() == V && !CS.doesNotCapture(A - B)) - // The parameter is not marked 'nocapture' - unsafe. - return false; - continue; - } +public: + AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) + : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} - default: - // The object is unsafe if it is used in any other way. - return false; - } - } + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + if (Expr->getValue() == AllocaPtr) + return SE.getZero(Expr->getType()); + return Expr; } +}; - // All uses of the alloca are safe, we can place it on the safe stack. - return true; -} - -/// The SafeStack pass splits the stack of each function into the -/// safe stack, which is only accessed through memory safe dereferences -/// (as determined statically), and the unsafe stack, which contains all -/// local variables that are accessed in unsafe ways. 
+/// The SafeStack pass splits the stack of each function into the safe +/// stack, which is only accessed through memory safe dereferences (as +/// determined statically), and the unsafe stack, which contains all +/// local variables that are accessed in ways that we can't prove to +/// be safe. class SafeStack : public FunctionPass { + const TargetMachine *TM; + const TargetLoweringBase *TL; const DataLayout *DL; + ScalarEvolution *SE; Type *StackPtrTy; Type *IntPtrTy; Type *Int32Ty; Type *Int8Ty; - Constant *UnsafeStackPtr = nullptr; + Value *UnsafeStackPtr = nullptr; /// Unsafe stack alignment. Each stack frame must ensure that the stack is /// aligned to this value. We need to re-align the unsafe stack if the @@ -175,26 +119,31 @@ class SafeStack : public FunctionPass { /// might expect to appear on the stack on most common targets. enum { StackAlignment = 16 }; - /// \brief Build a constant representing a pointer to the unsafe stack - /// pointer. - Constant *getOrCreateUnsafeStackPtr(Module &M); + /// \brief Build a value representing a pointer to the unsafe stack pointer. + Value *getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F); /// \brief Find all static allocas, dynamic allocas, return instructions and /// stack restore points (exception unwind blocks and setjmp calls) in the /// given function and append them to the respective vectors. void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas, SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<Argument *> &ByValArguments, SmallVectorImpl<ReturnInst *> &Returns, SmallVectorImpl<Instruction *> &StackRestorePoints); + /// \brief Calculate the allocation size of a given alloca. Returns 0 if the + /// size can not be statically determined. + uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI); + /// \brief Allocate space for all static allocas in \p StaticAllocas, /// replace allocas with pointers into the unsafe stack and generate code to /// restore the stack pointer before all return instructions in \p Returns. /// /// \returns A pointer to the top of the unsafe stack after all unsafe static /// allocas are allocated. - Value *moveStaticAllocasToUnsafeStack(Function &F, + Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns); /// \brief Generate code to restore the stack after all stack restore points @@ -203,7 +152,7 @@ class SafeStack : public FunctionPass { /// \returns A local variable in which to maintain the dynamic top of the /// unsafe stack if needed. AllocaInst * - createStackRestorePoints(Function &F, + createStackRestorePoints(IRBuilder<> &IRB, Function &F, ArrayRef<Instruction *> StackRestorePoints, Value *StaticTop, bool NeedDynamicTop); @@ -214,17 +163,26 @@ class SafeStack : public FunctionPass { AllocaInst *DynamicTop, ArrayRef<AllocaInst *> DynamicAllocas); + bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize); + + bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, + const Value *AllocaPtr, uint64_t AllocaSize); + bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr, + uint64_t AllocaSize); + public: static char ID; // Pass identification, replacement for typeid. 
- SafeStack() : FunctionPass(ID), DL(nullptr) { + SafeStack(const TargetMachine *TM) + : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) { initializeSafeStackPass(*PassRegistry::getPassRegistry()); } + SafeStack() : SafeStack(nullptr) {} - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); } - virtual bool doInitialization(Module &M) { + bool doInitialization(Module &M) override { DL = &M.getDataLayout(); StackPtrTy = Type::getInt8PtrTy(M.getContext()); @@ -235,51 +193,203 @@ public: return false; } - bool runOnFunction(Function &F); - + bool runOnFunction(Function &F) override; }; // class SafeStack -Constant *SafeStack::getOrCreateUnsafeStackPtr(Module &M) { - // The unsafe stack pointer is stored in a global variable with a magic name. - const char *kUnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; +uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { + uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + auto C = dyn_cast<ConstantInt>(AI->getArraySize()); + if (!C) + return 0; + Size *= C->getZExtValue(); + } + return Size; +} + +bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, + const Value *AllocaPtr, uint64_t AllocaSize) { + AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); + + uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); + ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); + ConstantRange SizeRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); + ConstantRange AccessRange = AccessStartRange.add(SizeRange); + ConstantRange AllocaRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize)); + bool Safe = AllocaRange.contains(AccessRange); + + DEBUG(dbgs() << "[SafeStack] " + << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ") + << *AllocaPtr << "\n" + << " Access " << *Addr << "\n" + << " SCEV " << *Expr + << " U: " << SE->getUnsignedRange(Expr) + << ", S: " << SE->getSignedRange(Expr) << "\n" + << " Range " << AccessRange << "\n" + << " AllocaRange " << AllocaRange << "\n" + << " " << (Safe ? "safe" : "unsafe") << "\n"); + + return Safe; +} + +bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, + const Value *AllocaPtr, + uint64_t AllocaSize) { + // All MemIntrinsics have destination address in Arg0 and size in Arg2. + if (MI->getRawDest() != U) return true; + const auto *Len = dyn_cast<ConstantInt>(MI->getLength()); + // Non-constant size => unsafe. FIXME: try SCEV getRange. + if (!Len) return false; + return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize); +} + +/// Check whether a given allocation must be put on the safe +/// stack or not. The function analyzes all uses of AI and checks whether it is +/// only accessed in a memory safe way (as decided statically). +bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { + // Go through all uses of this alloca and check whether all accesses to the + // allocated object are statically known to be memory safe and, hence, the + // object can be placed on the safe stack. + SmallPtrSet<const Value *, 16> Visited; + SmallVector<const Value *, 8> WorkList; + WorkList.push_back(AllocaPtr); + + // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. 
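The containment test in IsAccessSafe above is plain interval arithmetic on ConstantRange: the unsigned range of the rewritten offset, widened by the access size, must fit inside [0, AllocaSize). A self-contained sketch with concrete numbers, using only the ConstantRange calls the patch itself relies on:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/ConstantRange.h"
    using namespace llvm;

    // Mirrors the math above: access range = offset range + [0, Size).
    static bool rangeIsSafe(const ConstantRange &OffsetRange,
                            uint64_t AccessSize, uint64_t AllocaSize) {
      uint32_t BW = OffsetRange.getBitWidth();
      ConstantRange SizeRange(APInt(BW, 0), APInt(BW, AccessSize));
      ConstantRange AccessRange = OffsetRange.add(SizeRange);
      ConstantRange AllocaRange(APInt(BW, 0), APInt(BW, AllocaSize));
      return AllocaRange.contains(AccessRange);
    }
    // E.g. a 4-byte load whose offset is bounded to [0, 13) against a
    // 16-byte alloca: AccessRange = [0, 16), AllocaRange = [0, 16) -> safe.

The DFS worklist loop that follows feeds every load and store reached from the alloca through this check.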
+ while (!WorkList.empty()) { + const Value *V = WorkList.pop_back_val(); + for (const Use &UI : V->uses()) { + auto I = cast<const Instruction>(UI.getUser()); + assert(V == UI.get()); + + switch (I->getOpcode()) { + case Instruction::Load: { + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + AllocaSize)) + return false; + break; + } + case Instruction::VAArg: + // "va-arg" from a pointer is safe. + break; + case Instruction::Store: { + if (V == I->getOperand(0)) { + // Stored the pointer - conservatively assume it may be unsafe. + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n store of address: " << *I << "\n"); + return false; + } + + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + AllocaPtr, AllocaSize)) + return false; + break; + } + case Instruction::Ret: { + // Information leak. + return false; + } + + case Instruction::Call: + case Instruction::Invoke: { + ImmutableCallSite CS(I); + + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) + continue; + } + + if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { + if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n unsafe memintrinsic: " << *I + << "\n"); + return false; + } + continue; + } + // LLVM 'nocapture' attribute is only set for arguments whose address + // is not stored, passed around, or used in any other non-trivial way. + // We assume that passing a pointer to an object as a 'nocapture + // readnone' argument is safe. + // FIXME: a more precise solution would require an interprocedural + // analysis here, which would look at all uses of an argument inside + // the function being called. + ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); + for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) + if (A->get() == V) + if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) || + CS.doesNotAccessMemory()))) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n unsafe call: " << *I << "\n"); + return false; + } + continue; + } + + default: + if (Visited.insert(I).second) + WorkList.push_back(cast<const Instruction>(I)); + } + } + } + + // All uses of the alloca are safe, we can place it on the safe stack. + return true; +} + +Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) { + // Check if there is a target-specific location for the unsafe stack pointer. + if (TL) + if (Value *V = TL->getSafeStackPointerLocation(IRB)) + return V; + + // Otherwise, assume the target links with compiler-rt, which provides a + // thread-local variable with a magic name. + Module &M = *F.getParent(); + const char *UnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; auto UnsafeStackPtr = - dyn_cast_or_null<GlobalVariable>(M.getNamedValue(kUnsafeStackPtrVar)); + dyn_cast_or_null<GlobalVariable>(M.getNamedValue(UnsafeStackPtrVar)); + + bool UseTLS = USPStorage == ThreadLocalUSP; if (!UnsafeStackPtr) { + auto TLSModel = UseTLS ? + GlobalValue::InitialExecTLSModel : + GlobalValue::NotThreadLocal; // The global variable is not defined yet, define it ourselves. - // We use the initial-exec TLS model because we do not support the variable - // living anywhere other than in the main executable. 
+ // We use the initial-exec TLS model because we do not support the + // variable living anywhere other than in the main executable. UnsafeStackPtr = new GlobalVariable( - /*Module=*/M, /*Type=*/StackPtrTy, - /*isConstant=*/false, /*Linkage=*/GlobalValue::ExternalLinkage, - /*Initializer=*/0, /*Name=*/kUnsafeStackPtrVar, - /*InsertBefore=*/nullptr, - /*ThreadLocalMode=*/GlobalValue::InitialExecTLSModel); + M, StackPtrTy, false, GlobalValue::ExternalLinkage, nullptr, + UnsafeStackPtrVar, nullptr, TLSModel); } else { // The variable exists, check its type and attributes. - if (UnsafeStackPtr->getValueType() != StackPtrTy) { - report_fatal_error(Twine(kUnsafeStackPtrVar) + " must have void* type"); - } - - if (!UnsafeStackPtr->isThreadLocal()) { - report_fatal_error(Twine(kUnsafeStackPtrVar) + " must be thread-local"); - } + if (UnsafeStackPtr->getValueType() != StackPtrTy) + report_fatal_error(Twine(UnsafeStackPtrVar) + " must have void* type"); + if (UseTLS != UnsafeStackPtr->isThreadLocal()) + report_fatal_error(Twine(UnsafeStackPtrVar) + " must " + + (UseTLS ? "" : "not ") + "be thread-local"); } - return UnsafeStackPtr; } void SafeStack::findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas, SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<Argument *> &ByValArguments, SmallVectorImpl<ReturnInst *> &Returns, SmallVectorImpl<Instruction *> &StackRestorePoints) { - for (Instruction &I : inst_range(&F)) { + for (Instruction &I : instructions(&F)) { if (auto AI = dyn_cast<AllocaInst>(&I)) { ++NumAllocas; - if (IsSafeStackAlloca(AI)) + uint64_t Size = getStaticAllocaAllocationSize(AI); + if (IsSafeStackAlloca(AI, Size)) continue; if (AI->isStaticAlloca()) { @@ -304,19 +414,26 @@ void SafeStack::findInsts(Function &F, "gcroot intrinsic not compatible with safestack attribute"); } } + for (Argument &Arg : F.args()) { + if (!Arg.hasByValAttr()) + continue; + uint64_t Size = + DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + if (IsSafeStackAlloca(&Arg, Size)) + continue; + + ++NumUnsafeByValArguments; + ByValArguments.push_back(&Arg); + } } AllocaInst * -SafeStack::createStackRestorePoints(Function &F, +SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, ArrayRef<Instruction *> StackRestorePoints, Value *StaticTop, bool NeedDynamicTop) { if (StackRestorePoints.empty()) return nullptr; - IRBuilder<> IRB(StaticTop - ? cast<Instruction>(StaticTop)->getNextNode() - : (Instruction *)F.getEntryBlock().getFirstInsertionPt()); - // We need the current value of the shadow stack pointer to restore // after longjmp or exception catching. @@ -342,7 +459,7 @@ SafeStack::createStackRestorePoints(Function &F, for (Instruction *I : StackRestorePoints) { ++NumUnsafeStackRestorePoints; - IRB.SetInsertPoint(cast<Instruction>(I->getNextNode())); + IRB.SetInsertPoint(I->getNextNode()); Value *CurrentTop = DynamicTop ? 
IRB.CreateLoad(DynamicTop) : StaticTop; IRB.CreateStore(CurrentTop, UnsafeStackPtr); } @@ -350,14 +467,12 @@ SafeStack::createStackRestorePoints(Function &F, return DynamicTop; } -Value * -SafeStack::moveStaticAllocasToUnsafeStack(Function &F, - ArrayRef<AllocaInst *> StaticAllocas, - ArrayRef<ReturnInst *> Returns) { - if (StaticAllocas.empty()) +Value *SafeStack::moveStaticAllocasToUnsafeStack( + IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns) { + if (StaticAllocas.empty() && ByValArguments.empty()) return nullptr; - IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt()); DIBuilder DIB(*F.getParent()); // We explicitly compute and set the unsafe stack layout for all unsafe @@ -377,6 +492,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, // Compute maximum alignment among static objects on the unsafe stack. unsigned MaxAlignment = 0; + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + if (Align > MaxAlignment) + MaxAlignment = Align; + } for (AllocaInst *AI : StaticAllocas) { Type *Ty = AI->getAllocatedType(); unsigned Align = @@ -388,22 +510,51 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, if (MaxAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. assert(isPowerOf2_32(MaxAlignment)); - IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast<Instruction>(IRB.CreateIntToPtr( IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))), StackPtrTy)); } - // Allocate space for every unsafe static AllocaInst on the unsafe stack. int64_t StaticOffset = 0; // Current stack top. + IRB.SetInsertPoint(BasePointer->getNextNode()); + + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + + uint64_t Size = DL->getTypeStoreSize(Ty); + if (Size == 0) + Size = 1; // Don't create zero-sized stack objects. + + // Ensure the object is properly aligned. + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + + // Add alignment. + // NOTE: we ensure that BasePointer itself is aligned to >= Align. + StaticOffset += Size; + StaticOffset = RoundUpToAlignment(StaticOffset, Align); + + Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* + ConstantInt::get(Int32Ty, -StaticOffset)); + Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(), + Arg->getName() + ".unsafe-byval"); + + // Replace alloc with the new location. + replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, + /*Deref=*/true, -StaticOffset); + Arg->replaceAllUsesWith(NewArg); + IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode()); + IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); + } + + // Allocate space for every unsafe static AllocaInst on the unsafe stack. for (AllocaInst *AI : StaticAllocas) { IRB.SetInsertPoint(AI); - auto CArraySize = cast<ConstantInt>(AI->getArraySize()); Type *Ty = AI->getAllocatedType(); - - uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); + uint64_t Size = getStaticAllocaAllocationSize(AI); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. 
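Worth noting in the layout code above: the unsafe frame grows downward, StaticOffset accumulates object sizes, each slot is rounded up to the object's alignment, and the object ends up at BasePointer - StaticOffset. The same arithmetic, runnable standalone with hypothetical sizes:

    #include <cstdint>
    #include <cstdio>

    // Same rounding as llvm::RoundUpToAlignment (llvm/Support/MathExtras.h).
    static uint64_t roundUp(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    int main() {
      uint64_t StaticOffset = 0;
      StaticOffset = roundUp(StaticOffset + 12, 8); // 12-byte object at Base - 16
      StaticOffset = roundUp(StaticOffset + 4, 4);  //  4-byte object at Base - 20
      // The frame end is re-aligned to the 16-byte unsafe-stack alignment.
      std::printf("frame size: %llu\n",
                  (unsigned long long)roundUp(StaticOffset, 16)); // 32
      return 0;
    }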
@@ -423,7 +574,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, cast<Instruction>(NewAI)->takeName(AI); // Replace alloc with the new location. - replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true); + replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -StaticOffset); AI->replaceAllUsesWith(NewAI); AI->eraseFromParent(); } @@ -434,7 +585,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment); // Update shadow stack pointer in the function epilogue. - IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); Value *StaticTop = IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), @@ -478,7 +629,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( if (DynamicTop) IRB.CreateStore(NewTop, DynamicTop); - Value *NewAI = IRB.CreateIntToPtr(SP, AI->getType()); + Value *NewAI = IRB.CreatePointerCast(NewTop, AI->getType()); if (AI->hasName() && isa<Instruction>(NewAI)) NewAI->takeName(AI); @@ -513,8 +664,6 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( } bool SafeStack::runOnFunction(Function &F) { - auto AA = &getAnalysis<AliasAnalysis>(); - DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n"); if (!F.hasFnAttribute(Attribute::SafeStack)) { @@ -529,6 +678,9 @@ bool SafeStack::runOnFunction(Function &F) { return false; } + TL = TM ? TM->getSubtargetImpl(F)->getTargetLowering() : nullptr; + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + { // Make sure the regular stack protector won't run on this function // (safestack attribute takes precedence). @@ -541,16 +693,11 @@ bool SafeStack::runOnFunction(Function &F) { AttributeSet::get(F.getContext(), AttributeSet::FunctionIndex, B)); } - if (AA->onlyReadsMemory(&F)) { - // XXX: we don't protect against information leak attacks for now. - DEBUG(dbgs() << "[SafeStack] function only reads memory\n"); - return false; - } - ++NumFunctions; SmallVector<AllocaInst *, 16> StaticAllocas; SmallVector<AllocaInst *, 4> DynamicAllocas; + SmallVector<Argument *, 4> ByValArguments; SmallVector<ReturnInst *, 4> Returns; // Collect all points where stack gets unwound and needs to be restored @@ -562,23 +709,26 @@ bool SafeStack::runOnFunction(Function &F) { // Find all static and dynamic alloca instructions that must be moved to the // unsafe stack, all return instructions and stack restore points. - findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); + findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns, + StackRestorePoints); if (StaticAllocas.empty() && DynamicAllocas.empty() && - StackRestorePoints.empty()) + ByValArguments.empty() && StackRestorePoints.empty()) return false; // Nothing to do in this function. - if (!StaticAllocas.empty() || !DynamicAllocas.empty()) + if (!StaticAllocas.empty() || !DynamicAllocas.empty() || + !ByValArguments.empty()) ++NumUnsafeStackFunctions; // This function has the unsafe stack. if (!StackRestorePoints.empty()) ++NumUnsafeStackRestorePointsFunctions; - if (!UnsafeStackPtr) - UnsafeStackPtr = getOrCreateUnsafeStackPtr(*F.getParent()); + IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt()); + UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F); // The top of the unsafe stack after all unsafe static allocas are allocated. 
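Taken together, runOnFunction arranges for a load of the unsafe stack pointer in the prologue, a downward bump for the static frame, and a store of the saved base before each return. A simplified sketch of that IRBuilder sequence, not the patch's exact code (FrameSize stands for the rounded StaticOffset from the arithmetic above):

    // BasePointer is what the epilogue stores back; StaticTop is the value
    // createStackRestorePoints reinstalls after setjmp/landingpad.
    static llvm::Value *emitUnsafeFrame(llvm::IRBuilder<> &IRB,
                                        llvm::Value *UnsafeStackPtr,
                                        llvm::Type *Int32Ty,
                                        uint64_t FrameSize) {
      llvm::Value *BasePointer =
          IRB.CreateLoad(UnsafeStackPtr, "unsafe_stack_ptr");
      llvm::Value *StaticTop = IRB.CreateGEP(
          BasePointer,
          llvm::ConstantInt::get(Int32Ty, -(int64_t)FrameSize, /*isSigned=*/true),
          "unsafe_stack_static_top");
      IRB.CreateStore(StaticTop, UnsafeStackPtr); // new top of the unsafe stack
      return BasePointer;
    }

The StaticTop computed below plays exactly this role.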
- Value *StaticTop = moveStaticAllocasToUnsafeStack(F, StaticAllocas, Returns); + Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, + ByValArguments, Returns); // Safe stack object that stores the current unsafe stack top. It is updated // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. @@ -587,7 +737,7 @@ bool SafeStack::runOnFunction(Function &F) { // FIXME: a better alternative might be to store the unsafe stack pointer // before setjmp / invoke instructions. AllocaInst *DynamicTop = createStackRestorePoints( - F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); + IRB, F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); // Handle dynamic allocas. moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop, @@ -597,13 +747,14 @@ bool SafeStack::runOnFunction(Function &F) { return true; } -} // end anonymous namespace +} // anonymous namespace char SafeStack::ID = 0; -INITIALIZE_PASS_BEGIN(SafeStack, "safe-stack", - "Safe Stack instrumentation pass", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(SafeStack, "safe-stack", "Safe Stack instrumentation pass", - false, false) +INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack", + "Safe Stack instrumentation pass", false, false) +INITIALIZE_TM_PASS_END(SafeStack, "safe-stack", + "Safe Stack instrumentation pass", false, false) -FunctionPass *llvm::createSafeStackPass() { return new SafeStack(); } +FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) { + return new SafeStack(TM); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 7a5b4cb0178b..09de7a2cda2b 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -31,6 +31,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -59,6 +60,7 @@ static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16"; static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter"; static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block"; static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp"; +static const char *const kSanCovTraceSwitch = "__sanitizer_cov_trace_switch"; static const char *const kSanCovModuleCtorName = "sancov.module_ctor"; static const uint64_t kSanCtorAndDtorPriority = 2; @@ -148,19 +150,25 @@ class SanitizerCoverageModule : public ModulePass { void InjectCoverageForIndirectCalls(Function &F, ArrayRef<Instruction *> IndirCalls); void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); + void InjectTraceForSwitch(Function &F, + ArrayRef<Instruction *> SwitchTraceTargets); bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks); void SetNoSanitizeMetadata(Instruction *I); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); unsigned NumberOfInstrumentedBlocks() { - return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); + return SanCovFunction->getNumUses() + + SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() + + SanCovTraceEnter->getNumUses(); } Function *SanCovFunction; Function *SanCovWithCheckFunction; Function 
*SanCovIndirCallFunction; Function *SanCovTraceEnter, *SanCovTraceBB; Function *SanCovTraceCmpFunction; + Function *SanCovTraceSwitchFunction; InlineAsm *EmptyAsm; - Type *IntptrTy, *Int64Ty; + Type *IntptrTy, *Int64Ty, *Int64PtrTy; + Module *CurModule; LLVMContext *C; const DataLayout *DL; @@ -177,11 +185,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { return false; C = &(M.getContext()); DL = &M.getDataLayout(); + CurModule = &M; IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits()); Type *VoidTy = Type::getVoidTy(*C); IRBuilder<> IRB(*C); Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty()); Int64Ty = IRB.getInt64Ty(); SanCovFunction = checkSanitizerInterfaceFunction( @@ -194,18 +204,19 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { SanCovTraceCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( kSanCovTraceCmp, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr)); + SanCovTraceSwitchFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kSanCovTraceSwitch, VoidTy, Int64Ty, Int64PtrTy, nullptr)); // We insert an empty inline asm after cov callbacks to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), /*hasSideEffects=*/true); - if (Options.TraceBB) { - SanCovTraceEnter = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); - SanCovTraceBB = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); - } + SanCovTraceEnter = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); + SanCovTraceBB = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); // At this point we create a dummy array of guards because we don't // know how many elements we will need. @@ -280,11 +291,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { if (F.empty()) return false; if (F.getName().find(".module_ctor") != std::string::npos) return false; // Should not instrument sanitizer init functions. + // Don't instrument functions using SEH for now. Splitting basic blocks like + // we do for coverage breaks WinEHPrepare. + // FIXME: Remove this when SEH no longer uses landingpad pattern matching. 
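The switch-tracing callback declared above takes the live condition value plus a pointer to a per-switch constant array. A sketch of the contract the instrumentation assumes of the runtime (signature as declared here; the body is illustrative, not compiler-rt's implementation):

    #include <cstdint>

    // Layout written by InjectTraceForSwitch below:
    //   Cases[0]       = number of case values
    //   Cases[1]       = bit width of the condition type
    //   Cases[2..2+N)  = the case values, zero-extended to 64 bits
    extern "C" void __sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases) {
      uint64_t N = Cases[0];
      for (uint64_t i = 0; i < N; ++i)
        if (Val == Cases[2 + i])
          return; // a real runtime would record which case fired
    }

(The personality-function test that follows implements the SEH workaround flagged in the FIXME above.)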
+ if (F.hasPersonalityFn() && + isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) + return false; if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) SplitAllCriticalEdges(F); SmallVector<Instruction*, 8> IndirCalls; SmallVector<BasicBlock*, 16> AllBlocks; SmallVector<Instruction*, 8> CmpTraceTargets; + SmallVector<Instruction*, 8> SwitchTraceTargets; for (auto &BB : F) { AllBlocks.push_back(&BB); for (auto &Inst : BB) { @@ -293,13 +311,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { if (CS && !CS.getCalledFunction()) IndirCalls.push_back(&Inst); } - if (Options.TraceCmp && isa<ICmpInst>(&Inst)) - CmpTraceTargets.push_back(&Inst); + if (Options.TraceCmp) { + if (isa<ICmpInst>(&Inst)) + CmpTraceTargets.push_back(&Inst); + if (isa<SwitchInst>(&Inst)) + SwitchTraceTargets.push_back(&Inst); + } } } InjectCoverage(F, AllBlocks); InjectCoverageForIndirectCalls(F, IndirCalls); InjectTraceForCmp(F, CmpTraceTargets); + InjectTraceForSwitch(F, SwitchTraceTargets); return true; } @@ -348,6 +371,45 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( } } +// For every switch statement we insert a call: +// __sanitizer_cov_trace_switch(CondValue, +// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... }) + +void SanitizerCoverageModule::InjectTraceForSwitch( + Function &F, ArrayRef<Instruction *> SwitchTraceTargets) { + for (auto I : SwitchTraceTargets) { + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + IRBuilder<> IRB(I); + SmallVector<Constant *, 16> Initializers; + Value *Cond = SI->getCondition(); + if (Cond->getType()->getScalarSizeInBits() > + Int64Ty->getScalarSizeInBits()) + continue; + Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases())); + Initializers.push_back( + ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits())); + if (Cond->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + Cond = IRB.CreateIntCast(Cond, Int64Ty, false); + for (auto It: SI->cases()) { + Constant *C = It.getCaseValue(); + if (C->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty); + Initializers.push_back(C); + } + ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size()); + GlobalVariable *GV = new GlobalVariable( + *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage, + ConstantArray::get(ArrayOfInt64Ty, Initializers), + "__sancov_gen_cov_switch_values"); + IRB.CreateCall(SanCovTraceSwitchFunction, + {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)}); + } + } +} + + void SanitizerCoverageModule::InjectTraceForCmp( Function &F, ArrayRef<Instruction *> CmpTraceTargets) { for (auto I : CmpTraceTargets) { @@ -369,8 +431,7 @@ void SanitizerCoverageModule::InjectTraceForCmp( void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) { I->setMetadata( - I->getParent()->getParent()->getParent()->getMDKindID("nosanitize"), - MDNode::get(*C, None)); + I->getModule()->getMDKindID("nosanitize"), MDNode::get(*C, None)); } void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, @@ -382,34 +443,31 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, // locations. if (isa<UnreachableInst>(BB.getTerminator())) return; - BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end(); - // Skip static allocas at the top of the entry block so they don't become - // dynamic when we split the block. 
If we used our optimized stack layout, - // then there will only be one alloca and it will come first. - for (; IP != BE; ++IP) { - AllocaInst *AI = dyn_cast<AllocaInst>(IP); - if (!AI || !AI->isStaticAlloca()) - break; - } + BasicBlock::iterator IP = BB.getFirstInsertionPt(); bool IsEntryBB = &BB == &F.getEntryBlock(); DebugLoc EntryLoc; if (IsEntryBB) { if (auto SP = getDISubprogram(&F)) EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP); + // Keep static allocas and llvm.localescape calls in the entry block. Even + // if we aren't splitting the block, it's nice for allocas to be before + // calls. + IP = PrepareToSplitEntryBlock(BB, IP); } else { EntryLoc = IP->getDebugLoc(); } - IRBuilder<> IRB(IP); + IRBuilder<> IRB(&*IP); IRB.SetCurrentDebugLocation(EntryLoc); - SmallVector<Value *, 1> Indices; Value *GuardP = IRB.CreateAdd( IRB.CreatePointerCast(GuardArray, IntptrTy), ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - if (UseCalls) { + if (Options.TraceBB) { + IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); + } else if (UseCalls) { IRB.CreateCall(SanCovWithCheckFunction, GuardP); } else { LoadInst *Load = IRB.CreateLoad(GuardP); @@ -418,7 +476,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(Load); Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); Instruction *Ins = SplitBlockAndInsertIfThen( - Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); + Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); IRB.SetInsertPoint(Ins); IRB.SetCurrentDebugLocation(EntryLoc); // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. @@ -427,7 +485,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } if (Options.Use8bitCounters) { - IRB.SetInsertPoint(IP); + IRB.SetInsertPoint(&*IP); Value *P = IRB.CreateAdd( IRB.CreatePointerCast(EightBitCounterArray, IntptrTy), ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1)); @@ -438,13 +496,6 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(LI); SetNoSanitizeMetadata(SI); } - - if (Options.TraceBB) { - // Experimental support for tracing. - // Insert a callback with the same guard variable as used for coverage. - IRB.SetInsertPoint(IP); - IRB.CreateCall(IsEntryBB ? 
SanCovTraceEnter : SanCovTraceBB, GuardP); - } } char SanitizerCoverageModule::ID = 0; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 1a46bbb86122..9331e1d2b3fd 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -142,37 +142,35 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), nullptr)); OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { - const size_t ByteSize = 1 << i; - const size_t BitSize = ByteSize * 8; - SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); + const unsigned ByteSize = 1U << i; + const unsigned BitSize = ByteSize * 8; + std::string ByteSizeStr = utostr(ByteSize); + std::string BitSizeStr = utostr(BitSize); + SmallString<32> ReadName("__tsan_read" + ByteSizeStr); TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); + SmallString<32> WriteName("__tsan_write" + ByteSizeStr); TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<64> UnalignedReadName("__tsan_unaligned_read" + - itostr(ByteSize)); + SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr); TsanUnalignedRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + - itostr(ByteSize)); + SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr); TsanUnalignedWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); Type *Ty = Type::getIntNTy(M.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); - SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + - "_load"); + SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load"); TsanAtomicLoad[i] = checkSanitizerInterfaceFunction( M.getOrInsertFunction(AtomicLoadName, Ty, PtrTy, OrdTy, nullptr)); - SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + - "_store"); + SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store"); TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr)); @@ -201,7 +199,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { M.getOrInsertFunction(RMWName, Ty, PtrTy, Ty, OrdTy, nullptr)); } - SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) + + SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr + "_compare_exchange_val"); TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr)); @@ -513,8 +511,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ 
-527,8 +525,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -544,8 +542,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx]; if (!F) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -558,8 +556,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp deleted file mode 100644 index afb873a355a7..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp +++ /dev/null @@ -1,673 +0,0 @@ -//===- ARCInstKind.cpp - ObjC ARC Optimization ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file defines several utility functions used by various ARC -/// optimizations which are IMHO too big to be in a header file. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. 
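Everything in this (deleted) classifier is name-driven, which is exactly what the warning above is about: a reduced sketch of the idiom, using the same StringSwitch pattern and class names as the code below (unknown callees fall back to the conservative CallOrUser):

    #include "llvm/ADT/StringSwitch.h"
    #include "llvm/IR/Function.h"

    // Arity is checked first, then the name decides; a user function that
    // merely shares a runtime name would be misclassified, hence the warning.
    static llvm::objcarc::ARCInstKind classifyByName(const llvm::Function *F) {
      using llvm::objcarc::ARCInstKind;
      if (!F->arg_empty())
        return ARCInstKind::CallOrUser; // the full version keys on arg types
      return llvm::StringSwitch<ARCInstKind>(F->getName())
          .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush)
          .Case("clang.arc.use", ARCInstKind::IntrinsicUser)
          .Default(ARCInstKind::CallOrUser);
    }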
-/// -//===----------------------------------------------------------------------===// - -#include "ObjCARC.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace llvm::objcarc; - -raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, - const ARCInstKind Class) { - switch (Class) { - case ARCInstKind::Retain: - return OS << "ARCInstKind::Retain"; - case ARCInstKind::RetainRV: - return OS << "ARCInstKind::RetainRV"; - case ARCInstKind::RetainBlock: - return OS << "ARCInstKind::RetainBlock"; - case ARCInstKind::Release: - return OS << "ARCInstKind::Release"; - case ARCInstKind::Autorelease: - return OS << "ARCInstKind::Autorelease"; - case ARCInstKind::AutoreleaseRV: - return OS << "ARCInstKind::AutoreleaseRV"; - case ARCInstKind::AutoreleasepoolPush: - return OS << "ARCInstKind::AutoreleasepoolPush"; - case ARCInstKind::AutoreleasepoolPop: - return OS << "ARCInstKind::AutoreleasepoolPop"; - case ARCInstKind::NoopCast: - return OS << "ARCInstKind::NoopCast"; - case ARCInstKind::FusedRetainAutorelease: - return OS << "ARCInstKind::FusedRetainAutorelease"; - case ARCInstKind::FusedRetainAutoreleaseRV: - return OS << "ARCInstKind::FusedRetainAutoreleaseRV"; - case ARCInstKind::LoadWeakRetained: - return OS << "ARCInstKind::LoadWeakRetained"; - case ARCInstKind::StoreWeak: - return OS << "ARCInstKind::StoreWeak"; - case ARCInstKind::InitWeak: - return OS << "ARCInstKind::InitWeak"; - case ARCInstKind::LoadWeak: - return OS << "ARCInstKind::LoadWeak"; - case ARCInstKind::MoveWeak: - return OS << "ARCInstKind::MoveWeak"; - case ARCInstKind::CopyWeak: - return OS << "ARCInstKind::CopyWeak"; - case ARCInstKind::DestroyWeak: - return OS << "ARCInstKind::DestroyWeak"; - case ARCInstKind::StoreStrong: - return OS << "ARCInstKind::StoreStrong"; - case ARCInstKind::CallOrUser: - return OS << "ARCInstKind::CallOrUser"; - case ARCInstKind::Call: - return OS << "ARCInstKind::Call"; - case ARCInstKind::User: - return OS << "ARCInstKind::User"; - case ARCInstKind::IntrinsicUser: - return OS << "ARCInstKind::IntrinsicUser"; - case ARCInstKind::None: - return OS << "ARCInstKind::None"; - } - llvm_unreachable("Unknown instruction class!"); -} - -ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { - Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - - // No (mandatory) arguments. - if (AI == AE) - return StringSwitch<ARCInstKind>(F->getName()) - .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush) - .Case("clang.arc.use", ARCInstKind::IntrinsicUser) - .Default(ARCInstKind::CallOrUser); - - // One argument. - const Argument *A0 = AI++; - if (AI == AE) - // Argument is a pointer. - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { - Type *ETy = PTy->getElementType(); - // Argument is i8*. 
- if (ETy->isIntegerTy(8)) - return StringSwitch<ARCInstKind>(F->getName()) - .Case("objc_retain", ARCInstKind::Retain) - .Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV) - .Case("objc_retainBlock", ARCInstKind::RetainBlock) - .Case("objc_release", ARCInstKind::Release) - .Case("objc_autorelease", ARCInstKind::Autorelease) - .Case("objc_autoreleaseReturnValue", ARCInstKind::AutoreleaseRV) - .Case("objc_autoreleasePoolPop", ARCInstKind::AutoreleasepoolPop) - .Case("objc_retainedObject", ARCInstKind::NoopCast) - .Case("objc_unretainedObject", ARCInstKind::NoopCast) - .Case("objc_unretainedPointer", ARCInstKind::NoopCast) - .Case("objc_retain_autorelease", - ARCInstKind::FusedRetainAutorelease) - .Case("objc_retainAutorelease", ARCInstKind::FusedRetainAutorelease) - .Case("objc_retainAutoreleaseReturnValue", - ARCInstKind::FusedRetainAutoreleaseRV) - .Case("objc_sync_enter", ARCInstKind::User) - .Case("objc_sync_exit", ARCInstKind::User) - .Default(ARCInstKind::CallOrUser); - - // Argument is i8** - if (PointerType *Pte = dyn_cast<PointerType>(ETy)) - if (Pte->getElementType()->isIntegerTy(8)) - return StringSwitch<ARCInstKind>(F->getName()) - .Case("objc_loadWeakRetained", ARCInstKind::LoadWeakRetained) - .Case("objc_loadWeak", ARCInstKind::LoadWeak) - .Case("objc_destroyWeak", ARCInstKind::DestroyWeak) - .Default(ARCInstKind::CallOrUser); - } - - // Two arguments, first is i8**. - const Argument *A1 = AI++; - if (AI == AE) - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) - if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) - if (Pte->getElementType()->isIntegerTy(8)) - if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { - Type *ETy1 = PTy1->getElementType(); - // Second argument is i8* - if (ETy1->isIntegerTy(8)) - return StringSwitch<ARCInstKind>(F->getName()) - .Case("objc_storeWeak", ARCInstKind::StoreWeak) - .Case("objc_initWeak", ARCInstKind::InitWeak) - .Case("objc_storeStrong", ARCInstKind::StoreStrong) - .Default(ARCInstKind::CallOrUser); - // Second argument is i8**. - if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) - if (Pte1->getElementType()->isIntegerTy(8)) - return StringSwitch<ARCInstKind>(F->getName()) - .Case("objc_moveWeak", ARCInstKind::MoveWeak) - .Case("objc_copyWeak", ARCInstKind::CopyWeak) - // Ignore annotation calls. This is important to stop the - // optimizer from treating annotations as uses which would - // make the state of the pointers they are attempting to - // elucidate to be incorrect. - .Case("llvm.arc.annotation.topdown.bbstart", - ARCInstKind::None) - .Case("llvm.arc.annotation.topdown.bbend", - ARCInstKind::None) - .Case("llvm.arc.annotation.bottomup.bbstart", - ARCInstKind::None) - .Case("llvm.arc.annotation.bottomup.bbend", - ARCInstKind::None) - .Default(ARCInstKind::CallOrUser); - } - - // Anything else. - return ARCInstKind::CallOrUser; -} - -// A whitelist of intrinsics that we know do not use objc pointers or decrement -// ref counts. -static bool isInertIntrinsic(unsigned ID) { - // TODO: Make this into a covered switch. 
- switch (ID) { - case Intrinsic::returnaddress: - case Intrinsic::frameaddress: - case Intrinsic::stacksave: - case Intrinsic::stackrestore: - case Intrinsic::vastart: - case Intrinsic::vacopy: - case Intrinsic::vaend: - case Intrinsic::objectsize: - case Intrinsic::prefetch: - case Intrinsic::stackprotector: - case Intrinsic::eh_return_i32: - case Intrinsic::eh_return_i64: - case Intrinsic::eh_typeid_for: - case Intrinsic::eh_dwarf_cfa: - case Intrinsic::eh_sjlj_lsda: - case Intrinsic::eh_sjlj_functioncontext: - case Intrinsic::init_trampoline: - case Intrinsic::adjust_trampoline: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - // Don't let dbg info affect our results. - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - // Short cut: Some intrinsics obviously don't use ObjC pointers. - return true; - default: - return false; - } -} - -// A whitelist of intrinsics that we know do not use objc pointers or decrement -// ref counts. -static bool isUseOnlyIntrinsic(unsigned ID) { - // We are conservative and even though intrinsics are unlikely to touch - // reference counts, we white list them for safety. - // - // TODO: Expand this into a covered switch. There is a lot more here. - switch (ID) { - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: - return true; - default: - return false; - } -} - -/// \brief Determine what kind of construct V is. -ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) { - if (const Instruction *I = dyn_cast<Instruction>(V)) { - // Any instruction other than bitcast and gep with a pointer operand have a - // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer - // to a subsequent use, rather than using it themselves, in this sense. - // As a short cut, several other opcodes are known to have no pointer - // operands of interest. And ret is never followed by a release, so it's - // not interesting to examine. - switch (I->getOpcode()) { - case Instruction::Call: { - const CallInst *CI = cast<CallInst>(I); - // See if we have a function that we know something about. - if (const Function *F = CI->getCalledFunction()) { - ARCInstKind Class = GetFunctionClass(F); - if (Class != ARCInstKind::CallOrUser) - return Class; - Intrinsic::ID ID = F->getIntrinsicID(); - if (isInertIntrinsic(ID)) - return ARCInstKind::None; - if (isUseOnlyIntrinsic(ID)) - return ARCInstKind::User; - } - - // Otherwise, be conservative. - return GetCallSiteClass(CI); - } - case Instruction::Invoke: - // Otherwise, be conservative. 
- return GetCallSiteClass(cast<InvokeInst>(I)); - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::Select: - case Instruction::PHI: - case Instruction::Ret: - case Instruction::Br: - case Instruction::Switch: - case Instruction::IndirectBr: - case Instruction::Alloca: - case Instruction::VAArg: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::SDiv: - case Instruction::UDiv: - case Instruction::FDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::SExt: - case Instruction::ZExt: - case Instruction::Trunc: - case Instruction::IntToPtr: - case Instruction::FCmp: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::InsertElement: - case Instruction::ExtractElement: - case Instruction::ShuffleVector: - case Instruction::ExtractValue: - break; - case Instruction::ICmp: - // Comparing a pointer with null, or any other constant, isn't an - // interesting use, because we don't care what the pointer points to, or - // about the values of any other dynamic reference-counted pointers. - if (IsPotentialRetainableObjPtr(I->getOperand(1))) - return ARCInstKind::User; - break; - default: - // For anything else, check all the operands. - // Note that this includes both operands of a Store: while the first - // operand isn't actually being dereferenced, it is being stored to - // memory where we can no longer track who might read it and dereference - // it, so we have to consider it potentially used. - for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); - OI != OE; ++OI) - if (IsPotentialRetainableObjPtr(*OI)) - return ARCInstKind::User; - } - } - - // Otherwise, it's totally inert for ARC purposes. - return ARCInstKind::None; -} - -/// \brief Test if the given class is a kind of user. -bool llvm::objcarc::IsUser(ARCInstKind Class) { - switch (Class) { - case ARCInstKind::User: - case ARCInstKind::CallOrUser: - case ARCInstKind::IntrinsicUser: - return true; - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::RetainBlock: - case ARCInstKind::Release: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::NoopCast: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::Call: - case ARCInstKind::None: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class is objc_retain or equivalent. -bool llvm::objcarc::IsRetain(ARCInstKind Class) { - switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - return true; - // I believe we treat retain block as not a retain since it can copy its - // block. 
- case ARCInstKind::RetainBlock: - case ARCInstKind::Release: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::NoopCast: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::User: - case ARCInstKind::None: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class is objc_autorelease or equivalent. -bool llvm::objcarc::IsAutorelease(ARCInstKind Class) { - switch (Class) { - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - return true; - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::RetainBlock: - case ARCInstKind::Release: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::NoopCast: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::User: - case ARCInstKind::None: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. -bool llvm::objcarc::IsForwarding(ARCInstKind Class) { - switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::NoopCast: - return true; - case ARCInstKind::RetainBlock: - case ARCInstKind::Release: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::User: - case ARCInstKind::None: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. 
-bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) { - switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::Release: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::RetainBlock: - return true; - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::User: - case ARCInstKind::None: - case ARCInstKind::NoopCast: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) { - // ARCInstKind::RetainBlock may be given a stack argument. - switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::AutoreleaseRV: - return true; - case ARCInstKind::Release: - case ARCInstKind::Autorelease: - case ARCInstKind::RetainBlock: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::User: - case ARCInstKind::None: - case ARCInstKind::NoopCast: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. -bool llvm::objcarc::IsNeverTail(ARCInstKind Class) { - /// It is never safe to tail call objc_autorelease since by tail calling - /// objc_autorelease: fast autoreleasing causing our object to be potentially - /// reclaimed from the autorelease pool which violates the semantics of - /// __autoreleasing types in ARC. 
- switch (Class) { - case ARCInstKind::Autorelease: - return true; - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::Release: - case ARCInstKind::RetainBlock: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::User: - case ARCInstKind::None: - case ARCInstKind::NoopCast: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -bool llvm::objcarc::IsNoThrow(ARCInstKind Class) { - // objc_retainBlock is not nounwind because it calls user copy constructors - // which could theoretically throw. - switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::Release: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - return true; - case ARCInstKind::RetainBlock: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::User: - case ARCInstKind::None: - case ARCInstKind::NoopCast: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. -/// -/// This means that it *could* interrupt the RV optimization. 
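Concretely, the return-value (RV) optimization pairs a call with a following objc_retainAutoreleasedReturnValue; any intervening instruction whose class this predicate accepts may pop an autorelease pool or autorelease the pointer, breaking the pairing. A hypothetical scan using the predicate defined just below (helper name and driver are illustrative only):

    // Returns true if nothing between Begin and End can interrupt the RV
    // optimization; assumes End follows Begin in the same basic block.
    static bool rvPairingIsSafe(const llvm::Instruction *Begin,
                                const llvm::Instruction *End) {
      using namespace llvm::objcarc;
      for (auto It = Begin->getIterator(); &*It != End; ++It)
        if (CanInterruptRV(GetARCInstKind(&*It)))
          return false;
      return true;
    }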
-bool llvm::objcarc::CanInterruptRV(ARCInstKind Class) { - switch (Class) { - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - return true; - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::Release: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::RetainBlock: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::User: - case ARCInstKind::None: - case ARCInstKind::NoopCast: - return false; - } - llvm_unreachable("covered switch isn't covered?"); -} - -bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) { - switch (Kind) { - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::NoopCast: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - case ARCInstKind::IntrinsicUser: - case ARCInstKind::User: - case ARCInstKind::None: - return false; - - // The cases below are conservative. - - // RetainBlock can result in user defined copy constructors being called - // implying releases may occur. - case ARCInstKind::RetainBlock: - case ARCInstKind::Release: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::AutoreleasepoolPop: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::StoreWeak: - case ARCInstKind::InitWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::MoveWeak: - case ARCInstKind::CopyWeak: - case ARCInstKind::DestroyWeak: - case ARCInstKind::StoreStrong: - case ARCInstKind::CallOrUser: - case ARCInstKind::Call: - return true; - } - - llvm_unreachable("covered switch isn't covered?"); -} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h deleted file mode 100644 index 636c65c9b627..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h +++ /dev/null @@ -1,123 +0,0 @@ -//===--- ARCInstKind.h - ARC instruction equivalence classes -*- C++ -*----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H - -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Function.h" - -namespace llvm { -namespace objcarc { - -/// \enum ARCInstKind -/// -/// \brief Equivalence classes of instructions in the ARC Model. -/// -/// Since we do not have "instructions" to represent ARC concepts in LLVM IR, -/// we instead operate on equivalence classes of instructions. -/// -/// TODO: This should be split into two enums: a runtime entry point enum -/// (possibly united with the ARCRuntimeEntrypoint class) and an enum that deals -/// with effects of instructions in the ARC model (which would handle the notion -/// of a User or CallOrUser). 
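CanInterruptRV is what the return-value peephole consults when it scans between a call and the objc_retainAutoreleasedReturnValue that consumes it: any intervening instruction that may autorelease or pop a pool kills the pairing. A hedged sketch of such a scan (illustrative helper, not tree code; it assumes Call and RetainRV sit in the same block with Call first, and that the moved ObjCARCInstKind.h header provides the declarations):

    #include <iterator>
    #include "llvm/Analysis/ObjCARCInstKind.h"
    #include "llvm/IR/Instruction.h"

    static bool rvPairIntact(llvm::Instruction *Call,
                             llvm::Instruction *RetainRV) {
      using namespace llvm::objcarc;
      for (auto It = std::next(Call->getIterator()); &*It != RetainRV; ++It)
        if (CanInterruptRV(GetBasicARCInstKind(&*It)))
          return false; // something in between may autorelease or pop a pool
      return true;
    }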
-enum class ARCInstKind { - Retain, ///< objc_retain - RetainRV, ///< objc_retainAutoreleasedReturnValue - RetainBlock, ///< objc_retainBlock - Release, ///< objc_release - Autorelease, ///< objc_autorelease - AutoreleaseRV, ///< objc_autoreleaseReturnValue - AutoreleasepoolPush, ///< objc_autoreleasePoolPush - AutoreleasepoolPop, ///< objc_autoreleasePoolPop - NoopCast, ///< objc_retainedObject, etc. - FusedRetainAutorelease, ///< objc_retainAutorelease - FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue - LoadWeakRetained, ///< objc_loadWeakRetained (primitive) - StoreWeak, ///< objc_storeWeak (primitive) - InitWeak, ///< objc_initWeak (derived) - LoadWeak, ///< objc_loadWeak (derived) - MoveWeak, ///< objc_moveWeak (derived) - CopyWeak, ///< objc_copyWeak (derived) - DestroyWeak, ///< objc_destroyWeak (derived) - StoreStrong, ///< objc_storeStrong (derived) - IntrinsicUser, ///< clang.arc.use - CallOrUser, ///< could call objc_release and/or "use" pointers - Call, ///< could call objc_release - User, ///< could "use" a pointer - None ///< anything that is inert from an ARC perspective. -}; - -raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class); - -/// \brief Test if the given class is a kind of user. -bool IsUser(ARCInstKind Class); - -/// \brief Test if the given class is objc_retain or equivalent. -bool IsRetain(ARCInstKind Class); - -/// \brief Test if the given class is objc_autorelease or equivalent. -bool IsAutorelease(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. -bool IsForwarding(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. -bool IsNoopOnNull(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -bool IsAlwaysTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. -bool IsNeverTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -bool IsNoThrow(ARCInstKind Class); - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. -bool CanInterruptRV(ARCInstKind Class); - -/// \brief Determine if F is one of the special known Functions. If it isn't, -/// return ARCInstKind::CallOrUser. -ARCInstKind GetFunctionClass(const Function *F); - -/// \brief Determine which objc runtime call instruction class V belongs to. -/// -/// This is similar to GetARCInstKind except that it only detects objc -/// runtime calls. This allows it to be faster. -/// -static inline ARCInstKind GetBasicARCInstKind(const Value *V) { - if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (const Function *F = CI->getCalledFunction()) - return GetFunctionClass(F); - // Otherwise, be conservative. - return ARCInstKind::CallOrUser; - } - - // Otherwise, be conservative. - return isa<InvokeInst>(V) ? ARCInstKind::CallOrUser : ARCInstKind::User; -} - -/// Map V to its ARCInstKind equivalence class. -ARCInstKind GetARCInstKind(const Value *V); - -/// Returns false if conservatively we can prove that any instruction mapped to -/// this kind can not decrement ref counts. Returns true otherwise. 
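GetBasicARCInstKind deliberately stays cheap: direct calls dispatch through GetFunctionClass, and anything indirect is classified conservatively. Typical consumption is one classification per visited instruction, as in this illustrative (non-tree) helper, again assuming the post-move header location:

    #include "llvm/Analysis/ObjCARCInstKind.h"
    #include "llvm/IR/Function.h"

    // Count the calls in F that are objc_retain or an equivalent. Purely
    // illustrative; the real passes switch over the kind instead of counting.
    static unsigned countRetains(llvm::Function &F) {
      unsigned N = 0;
      for (llvm::BasicBlock &BB : F)
        for (llvm::Instruction &I : BB)
          if (llvm::objcarc::IsRetain(llvm::objcarc::GetBasicARCInstKind(&I)))
            ++N;
      return N;
    }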
-bool CanDecrementRefCount(ARCInstKind Kind); - -} // end namespace objcarc -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 4edd02904b22..9d78e5ae3b9b 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -49,7 +49,7 @@ bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, assert(CS && "Only calls can alter reference counts!"); // See if AliasAnalysis can help us with the call. - AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); + FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); if (AliasAnalysis::onlyReadsMemory(MRB)) return false; if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { @@ -226,7 +226,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, SmallPtrSetImpl<Instruction *> &DependingInsts, SmallPtrSetImpl<const BasicBlock *> &Visited, ProvenanceAnalysis &PA) { - BasicBlock::iterator StartPos = StartInst; + BasicBlock::iterator StartPos = StartInst->getIterator(); SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; Worklist.push_back(std::make_pair(StartBB, StartPos)); @@ -252,7 +252,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, break; } - Instruction *Inst = --LocalStartPos; + Instruction *Inst = &*--LocalStartPos; if (Depends(Flavor, Inst, Arg, PA)) { DependingInsts.insert(Inst); break; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index 6ea038b8ba8c..d860723bb460 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -26,18 +26,10 @@ namespace llvm { using namespace llvm; using namespace llvm::objcarc; -/// \brief A handy option to enable/disable all ARC Optimizations. -bool llvm::objcarc::EnableARCOpts; -static cl::opt<bool, true> -EnableARCOptimizations("enable-objc-arc-opts", - cl::desc("enable/disable all ARC Optimizations"), - cl::location(EnableARCOpts), - cl::init(true)); - /// initializeObjCARCOptsPasses - Initialize all passes linked into the /// ObjCARCOpts library. void llvm::initializeObjCARCOpts(PassRegistry &Registry) { - initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCAAWrapperPassPass(Registry); initializeObjCARCAPElimPass(Registry); initializeObjCARCExpandPass(Registry); initializeObjCARCContractPass(Registry); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 7595e2db1a7a..5fd45b00af17 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -26,6 +26,8 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Optional.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" @@ -34,7 +36,6 @@ #include "llvm/Pass.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Utils/Local.h" -#include "ARCInstKind.h" namespace llvm { class raw_ostream; @@ -43,99 +44,6 @@ class raw_ostream; namespace llvm { namespace objcarc { -/// \brief A handy option to enable/disable all ARC Optimizations. -extern bool EnableARCOpts; - -/// \brief Test if the given module looks interesting to run ARC optimization -/// on. 
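ModuleHasARC, whose body follows, is the cheap gate the ObjC ARC passes check up front so that modules that never touch the ObjC runtime pay nothing. Combined with the EnableARCOpts flag that this import relocates alongside it, the usual guard looks roughly like this sketch (header location assumed from the new llvm/Analysis includes above):

    #include "llvm/Analysis/ObjCARCAnalysisUtils.h"
    #include "llvm/IR/Module.h"

    static bool shouldRunARCOpts(const llvm::Module &M) {
      // Both the global kill switch and the per-module scan must agree.
      return llvm::objcarc::EnableARCOpts && llvm::objcarc::ModuleHasARC(M);
    }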
-static inline bool ModuleHasARC(const Module &M) { - return - M.getNamedValue("objc_retain") || - M.getNamedValue("objc_release") || - M.getNamedValue("objc_autorelease") || - M.getNamedValue("objc_retainAutoreleasedReturnValue") || - M.getNamedValue("objc_retainBlock") || - M.getNamedValue("objc_autoreleaseReturnValue") || - M.getNamedValue("objc_autoreleasePoolPush") || - M.getNamedValue("objc_loadWeakRetained") || - M.getNamedValue("objc_loadWeak") || - M.getNamedValue("objc_destroyWeak") || - M.getNamedValue("objc_storeWeak") || - M.getNamedValue("objc_initWeak") || - M.getNamedValue("objc_moveWeak") || - M.getNamedValue("objc_copyWeak") || - M.getNamedValue("objc_retainedObject") || - M.getNamedValue("objc_unretainedObject") || - M.getNamedValue("objc_unretainedPointer") || - M.getNamedValue("clang.arc.use"); -} - -/// \brief This is a wrapper around getUnderlyingObject which also knows how to -/// look through objc_retain and objc_autorelease calls, which we know to return -/// their argument verbatim. -static inline const Value *GetUnderlyingObjCPtr(const Value *V, - const DataLayout &DL) { - for (;;) { - V = GetUnderlyingObject(V, DL); - if (!IsForwarding(GetBasicARCInstKind(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - - return V; -} - -/// The RCIdentity root of a value \p V is a dominating value U for which -/// retaining or releasing U is equivalent to retaining or releasing V. In other -/// words, ARC operations on \p V are equivalent to ARC operations on \p U. -/// -/// We use this in the ARC optimizer to make it easier to match up ARC -/// operations by always mapping ARC operations to RCIdentityRoots instead of -/// pointers themselves. -/// -/// The two ways that we see RCIdentical values in ObjC are via: -/// -/// 1. PointerCasts -/// 2. Forwarding Calls that return their argument verbatim. -/// -/// Thus this function strips off pointer casts and forwarding calls. *NOTE* -/// This implies that two RCIdentical values must alias. -static inline const Value *GetRCIdentityRoot(const Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicARCInstKind(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; -} - -/// Helper which calls const Value *GetRCIdentityRoot(const Value *V) and just -/// casts away the const of the result. For documentation about what an -/// RCIdentityRoot (and by extension GetRCIdentityRoot is) look at that -/// function. -static inline Value *GetRCIdentityRoot(Value *V) { - return const_cast<Value *>(GetRCIdentityRoot((const Value *)V)); -} - -/// \brief Assuming the given instruction is one of the special calls such as -/// objc_retain or objc_release, return the RCIdentity root of the argument of -/// the call. -static inline Value *GetArgRCIdentityRoot(Value *Inst) { - return GetRCIdentityRoot(cast<CallInst>(Inst)->getArgOperand(0)); -} - -static inline bool IsNullOrUndef(const Value *V) { - return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); -} - -static inline bool IsNoopInstruction(const Instruction *I) { - return isa<BitCastInst>(I) || - (isa<GetElementPtrInst>(I) && - cast<GetElementPtrInst>(I)->hasAllZeroIndices()); -} - - /// \brief Erase the given instruction. /// /// Many ObjC calls return their argument verbatim, @@ -162,152 +70,6 @@ static inline void EraseInstruction(Instruction *CI) { RecursivelyDeleteTriviallyDeadInstructions(OldArg); } -/// \brief Test whether the given value is possible a retainable object pointer. 
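The invariant stated above is the load-bearing one: two values with the same RCIdentity root must alias, which is exactly what lets the optimizer match a retain against a release performed on a different-looking pointer. A hedged one-liner built on that invariant (illustrative helper, assuming the moved llvm/Analysis/ObjCARCAnalysisUtils.h header):

    #include "llvm/Analysis/ObjCARCAnalysisUtils.h"

    // True if ARC operations on A and B manipulate the same reference count.
    static bool sameRCIdentity(const llvm::Value *A, const llvm::Value *B) {
      return llvm::objcarc::GetRCIdentityRoot(A) ==
             llvm::objcarc::GetRCIdentityRoot(B);
    }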
-static inline bool IsPotentialRetainableObjPtr(const Value *Op) { - // Pointers to static or stack storage are not valid retainable object - // pointers. - if (isa<Constant>(Op) || isa<AllocaInst>(Op)) - return false; - // Special arguments can not be a valid retainable object pointer. - if (const Argument *Arg = dyn_cast<Argument>(Op)) - if (Arg->hasByValAttr() || - Arg->hasInAllocaAttr() || - Arg->hasNestAttr() || - Arg->hasStructRetAttr()) - return false; - // Only consider values with pointer types. - // - // It seemes intuitive to exclude function pointer types as well, since - // functions are never retainable object pointers, however clang occasionally - // bitcasts retainable object pointers to function-pointer type temporarily. - PointerType *Ty = dyn_cast<PointerType>(Op->getType()); - if (!Ty) - return false; - // Conservatively assume anything else is a potential retainable object - // pointer. - return true; -} - -static inline bool IsPotentialRetainableObjPtr(const Value *Op, - AliasAnalysis &AA) { - // First make the rudimentary check. - if (!IsPotentialRetainableObjPtr(Op)) - return false; - - // Objects in constant memory are not reference-counted. - if (AA.pointsToConstantMemory(Op)) - return false; - - // Pointers in constant memory are not pointing to reference-counted objects. - if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) - if (AA.pointsToConstantMemory(LI->getPointerOperand())) - return false; - - // Otherwise assume the worst. - return true; -} - -/// \brief Helper for GetARCInstKind. Determines what kind of construct CS -/// is. -static inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) { - for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) - if (IsPotentialRetainableObjPtr(*I)) - return CS.onlyReadsMemory() ? ARCInstKind::User : ARCInstKind::CallOrUser; - - return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call; -} - -/// \brief Return true if this value refers to a distinct and identifiable -/// object. -/// -/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses -/// special knowledge of ObjC conventions. -static inline bool IsObjCIdentifiedObject(const Value *V) { - // Assume that call results and arguments have their own "provenance". - // Constants (including GlobalVariables) and Allocas are never - // reference-counted. - if (isa<CallInst>(V) || isa<InvokeInst>(V) || - isa<Argument>(V) || isa<Constant>(V) || - isa<AllocaInst>(V)) - return true; - - if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { - const Value *Pointer = - GetRCIdentityRoot(LI->getPointerOperand()); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { - // A constant pointer can't be pointing to an object on the heap. It may - // be reference-counted, but it won't be deleted. - if (GV->isConstant()) - return true; - StringRef Name = GV->getName(); - // These special variables are known to hold values which are not - // reference-counted pointers. 
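GetCallSiteClass above reduces to a two-bit decision: does any argument look like a retainable object pointer, and does the callee only read memory. A standalone model of that mapping (enum names are illustrative):

    enum class Cls { None, User, Call, CallOrUser };

    static Cls classifyCallSite(bool AnyRetainableArg, bool OnlyReadsMemory) {
      if (AnyRetainableArg)                        // callee may "use" the pointer
        return OnlyReadsMemory ? Cls::User : Cls::CallOrUser;
      return OnlyReadsMemory ? Cls::None : Cls::Call;
    }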
- if (Name.startswith("\01l_objc_msgSend_fixup_")) - return true; - - StringRef Section = GV->getSection(); - if (Section.find("__message_refs") != StringRef::npos || - Section.find("__objc_classrefs") != StringRef::npos || - Section.find("__objc_superrefs") != StringRef::npos || - Section.find("__objc_methname") != StringRef::npos || - Section.find("__cstring") != StringRef::npos) - return true; - } - } - - return false; -} - -enum class ARCMDKindID { - ImpreciseRelease, - CopyOnEscape, - NoObjCARCExceptions, -}; - -/// A cache of MDKinds used by various ARC optimizations. -class ARCMDKindCache { - Module *M; - - /// The Metadata Kind for clang.imprecise_release metadata. - llvm::Optional<unsigned> ImpreciseReleaseMDKind; - - /// The Metadata Kind for clang.arc.copy_on_escape metadata. - llvm::Optional<unsigned> CopyOnEscapeMDKind; - - /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata. - llvm::Optional<unsigned> NoObjCARCExceptionsMDKind; - -public: - void init(Module *Mod) { - M = Mod; - ImpreciseReleaseMDKind = NoneType::None; - CopyOnEscapeMDKind = NoneType::None; - NoObjCARCExceptionsMDKind = NoneType::None; - } - - unsigned get(ARCMDKindID ID) { - switch (ID) { - case ARCMDKindID::ImpreciseRelease: - if (!ImpreciseReleaseMDKind) - ImpreciseReleaseMDKind = - M->getContext().getMDKindID("clang.imprecise_release"); - return *ImpreciseReleaseMDKind; - case ARCMDKindID::CopyOnEscape: - if (!CopyOnEscapeMDKind) - CopyOnEscapeMDKind = - M->getContext().getMDKindID("clang.arc.copy_on_escape"); - return *CopyOnEscapeMDKind; - case ARCMDKindID::NoObjCARCExceptions: - if (!NoObjCARCExceptionsMDKind) - NoObjCARCExceptionsMDKind = - M->getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); - return *NoObjCARCExceptionsMDKind; - } - llvm_unreachable("Covered switch isn't covered?!"); - } -}; - } // end namespace objcarc } // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index d318643a359a..969e77c1f888 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -72,12 +72,9 @@ bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { if (const Function *Callee = CS.getCalledFunction()) { if (Callee->isDeclaration() || Callee->mayBeOverridden()) return true; - for (Function::const_iterator I = Callee->begin(), E = Callee->end(); - I != E; ++I) { - const BasicBlock *BB = I; - for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); - J != F; ++J) - if (ImmutableCallSite JCS = ImmutableCallSite(J)) + for (const BasicBlock &BB : *Callee) { + for (const Instruction &I : BB) + if (ImmutableCallSite JCS = ImmutableCallSite(&I)) // This recursion depth limit is arbitrary. It's just great // enough to cover known interesting testcases. if (Depth < 3 && @@ -96,7 +93,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Instruction *Push = nullptr; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; switch (GetBasicARCInstKind(Inst)) { case ARCInstKind::AutoreleasepoolPush: Push = Inst; @@ -169,7 +166,7 @@ bool ObjCARCAPElim::runOnModule(Module &M) { if (std::next(F->begin()) != F->end()) continue; // Ok, a single-block constructor function definition. Try to optimize it. 
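OptimizeBB in the ObjCARCAPElim hunks above pairs an objc_autoreleasePoolPush with the next objc_autoreleasePoolPop in the block, provided nothing in between may autorelease. A standalone model of that single-pending-push scan, with block contents abstracted to events:

    #include <cstddef>
    #include <vector>

    enum class Ev { Push, Pop, MayAutorelease, Other };

    // Count removable push/pop pairs: a pop cancels the latest pending push
    // unless something that may autorelease intervened.
    static std::size_t removablePairs(const std::vector<Ev> &Events) {
      std::size_t Pairs = 0;
      bool PendingPush = false;
      for (Ev E : Events) {
        switch (E) {
        case Ev::Push:           PendingPush = true; break;
        case Ev::Pop:            if (PendingPush) ++Pairs;
                                 PendingPush = false; break;
        case Ev::MayAutorelease: PendingPush = false; break;
        case Ev::Other:          break;
        }
      }
      return Pairs;
    }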
- Changed |= OptimizeBB(F->begin()); + Changed |= OptimizeBB(&F->front()); } return Changed; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp deleted file mode 100644 index 3893aab76b2a..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ /dev/null @@ -1,168 +0,0 @@ -//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file defines a simple ARC-aware AliasAnalysis using special knowledge -/// of Objective C to enhance other optimization passes which rely on the Alias -/// Analysis infrastructure. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#include "ObjCARC.h" -#include "ObjCARCAliasAnalysis.h" -#include "llvm/IR/Instruction.h" -#include "llvm/InitializePasses.h" -#include "llvm/PassAnalysisSupport.h" -#include "llvm/PassSupport.h" - -#define DEBUG_TYPE "objc-arc-aa" - -namespace llvm { - class Function; - class Value; -} - -using namespace llvm; -using namespace llvm::objcarc; - -// Register this pass... -char ObjCARCAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", - "ObjC-ARC-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { - return new ObjCARCAliasAnalysis(); -} - -bool ObjCARCAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; -} - -void -ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); -} - -AliasResult ObjCARCAliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - if (!EnableARCOpts) - return AliasAnalysis::alias(LocA, LocB); - - // First, strip off no-ops, including ObjC-specific no-ops, and try making a - // precise alias query. - const Value *SA = GetRCIdentityRoot(LocA.Ptr); - const Value *SB = GetRCIdentityRoot(LocB.Ptr); - AliasResult Result = - AliasAnalysis::alias(MemoryLocation(SA, LocA.Size, LocA.AATags), - MemoryLocation(SB, LocB.Size, LocB.AATags)); - if (Result != MayAlias) - return Result; - - // If that failed, climb to the underlying object, including climbing through - // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *UA = GetUnderlyingObjCPtr(SA, *DL); - const Value *UB = GetUnderlyingObjCPtr(SB, *DL); - if (UA != SA || UB != SB) { - Result = AliasAnalysis::alias(MemoryLocation(UA), MemoryLocation(UB)); - // We can't use MustAlias or PartialAlias results here because - // GetUnderlyingObjCPtr may return an offsetted pointer value. - if (Result == NoAlias) - return NoAlias; - } - - // If that failed, fail. We don't need to chain here, since that's covered - // by the earlier precise query. 
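The deleted alias query above (which now lives in llvm/Analysis as ObjCARCAA) is a two-tier refinement: first a precise query on the RC-identity-stripped pointers, then an imprecise query on the underlying objects where only a NoAlias answer may be trusted, because GetUnderlyingObjCPtr can discard offsets. A standalone model of that control flow, with the strip, underlying-object, and base-AA steps passed in as callables:

    enum class AR { NoAlias, MayAlias, PartialAlias, MustAlias };

    template <typename AAFn, typename StripFn, typename UnderFn>
    static AR arcAwareAlias(const void *A, const void *B, AAFn BaseAlias,
                            StripFn Strip, UnderFn Underlying) {
      const void *SA = Strip(A), *SB = Strip(B);
      AR R = BaseAlias(SA, SB);        // precise query on RC-identity roots
      if (R != AR::MayAlias)
        return R;
      const void *UA = Underlying(SA), *UB = Underlying(SB);
      if ((UA != SA || UB != SB) && BaseAlias(UA, UB) == AR::NoAlias)
        return AR::NoAlias;            // only NoAlias survives the offset loss
      return AR::MayAlias;
    }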
- return MayAlias; -} - -bool ObjCARCAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { - if (!EnableARCOpts) - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); - - // First, strip off no-ops, including ObjC-specific no-ops, and try making - // a precise alias query. - const Value *S = GetRCIdentityRoot(Loc.Ptr); - if (AliasAnalysis::pointsToConstantMemory( - MemoryLocation(S, Loc.Size, Loc.AATags), OrLocal)) - return true; - - // If that failed, climb to the underlying object, including climbing through - // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *U = GetUnderlyingObjCPtr(S, *DL); - if (U != S) - return AliasAnalysis::pointsToConstantMemory(MemoryLocation(U), OrLocal); - - // If that failed, fail. We don't need to chain here, since that's covered - // by the earlier precise query. - return false; -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { - // We have nothing to do. Just chain to the next AliasAnalysis. - return AliasAnalysis::getModRefBehavior(CS); -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { - if (!EnableARCOpts) - return AliasAnalysis::getModRefBehavior(F); - - switch (GetFunctionClass(F)) { - case ARCInstKind::NoopCast: - return DoesNotAccessMemory; - default: - break; - } - - return AliasAnalysis::getModRefBehavior(F); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { - if (!EnableARCOpts) - return AliasAnalysis::getModRefInfo(CS, Loc); - - switch (GetBasicARCInstKind(CS.getInstruction())) { - case ARCInstKind::Retain: - case ARCInstKind::RetainRV: - case ARCInstKind::Autorelease: - case ARCInstKind::AutoreleaseRV: - case ARCInstKind::NoopCast: - case ARCInstKind::AutoreleasepoolPush: - case ARCInstKind::FusedRetainAutorelease: - case ARCInstKind::FusedRetainAutoreleaseRV: - // These functions don't access any memory visible to the compiler. - // Note that this doesn't include objc_retainBlock, because it updates - // pointers when it copies block data. - return NoModRef; - default: - break; - } - - return AliasAnalysis::getModRefInfo(CS, Loc); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { - // TODO: Theoretically we could check for dependencies between objc_* calls - // and OnlyAccessesArgumentPointees calls or other well-behaved calls. - return AliasAnalysis::getModRefInfo(CS1, CS2); -} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h deleted file mode 100644 index eecc82fe572c..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h +++ /dev/null @@ -1,74 +0,0 @@ -//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- C++ -*-----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares a simple ARC-aware AliasAnalysis using special knowledge -/// of Objective C to enhance other optimization passes which rely on the Alias -/// Analysis infrastructure. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. 
-/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H - -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Pass.h" - -namespace llvm { -namespace objcarc { - - /// \brief This is a simple alias analysis implementation that uses knowledge - /// of ARC constructs to answer queries. - /// - /// TODO: This class could be generalized to know about other ObjC-specific - /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing - /// even though their offsets are dynamic. - class ObjCARCAliasAnalysis : public ImmutablePass, - public AliasAnalysis { - public: - static char ID; // Class identification, replacement for typeinfo - ObjCARCAliasAnalysis() : ImmutablePass(ID) { - initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - private: - bool doInitialization(Module &M) override; - - /// This method is used when a pass implements an analysis interface through - /// multiple inheritance. If needed, it should override this to adjust the - /// this pointer as needed for the specified pass info. - void *getAdjustedAnalysisPointer(const void *PI) override { - if (PI == &AliasAnalysis::ID) - return static_cast<AliasAnalysis *>(this); - return this; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override; - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - ModRefBehavior getModRefBehavior(const Function *F) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; - }; - -} // namespace objcarc -} // namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index baca76ba3f2a..1cdf5689f42a 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -119,9 +119,9 @@ bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) { return false; // Check that the call is next to the retain. - BasicBlock::const_iterator I = Call; - ++I; - while (IsNoopInstruction(I)) ++I; + BasicBlock::const_iterator I = ++Call->getIterator(); + while (IsNoopInstruction(&*I)) + ++I; if (&*I != Retain) return false; @@ -247,7 +247,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load, // Ok, now we know we have not seen a store yet. See if Inst can write to // our load location, if it can not, just ignore the instruction. - if (!(AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod)) + if (!(AA->getModRefInfo(Inst, Loc) & MRI_Mod)) continue; Store = dyn_cast<StoreInst>(Inst); @@ -282,9 +282,9 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store, Instruction *Release, ProvenanceAnalysis &PA) { // Walk up from the Store to find the retain. 
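The MRI_Mod change in the ObjCARCContract hunk above is part of this import's AliasAnalysis rework: mod/ref results become free MRI_* enumerators queried through AAResults instead of members of the old AliasAnalysis class. A hedged sketch of the updated query shape, assuming the AAResults API as of this import:

    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/IR/Instruction.h"

    // True if I may write the memory named by Loc; stores that cannot touch
    // the loaded location are skipped by the contraction scan above.
    static bool mayWriteTo(llvm::AAResults &AA, llvm::Instruction *I,
                           const llvm::MemoryLocation &Loc) {
      return (AA.getModRefInfo(I, Loc) & llvm::MRI_Mod) != 0;
    }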
- BasicBlock::iterator I = Store; + BasicBlock::iterator I = Store->getIterator(); BasicBlock::iterator Begin = Store->getParent()->begin(); - while (I != Begin && GetBasicARCInstKind(I) != ARCInstKind::Retain) { + while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) { Instruction *Inst = &*I; // It is only safe to move the retain to the store if we can prove @@ -294,7 +294,7 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store, return nullptr; --I; } - Instruction *Retain = I; + Instruction *Retain = &*I; if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain) return nullptr; if (GetArgRCIdentityRoot(Retain) != New) @@ -429,7 +429,7 @@ bool ObjCARCContract::tryToPeepholeInstruction( // insert it now. if (!RetainRVMarker) return false; - BasicBlock::iterator BBI = Inst; + BasicBlock::iterator BBI = Inst->getIterator(); BasicBlock *InstParent = Inst->getParent(); // Step up to see if the call immediately precedes the RetainRV call. @@ -440,11 +440,11 @@ bool ObjCARCContract::tryToPeepholeInstruction( BasicBlock *Pred = InstParent->getSinglePredecessor(); if (!Pred) goto decline_rv_optimization; - BBI = Pred->getTerminator(); + BBI = Pred->getTerminator()->getIterator(); break; } --BBI; - } while (IsNoopInstruction(BBI)); + } while (IsNoopInstruction(&*BBI)); if (&*BBI == GetArgRCIdentityRoot(Inst)) { DEBUG(dbgs() << "Adding inline asm marker for " @@ -511,10 +511,10 @@ bool ObjCARCContract::runOnFunction(Function &F) { return false; Changed = false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n"); @@ -629,13 +629,13 @@ bool ObjCARCContract::runOnFunction(Function &F) { char ObjCARCContract::ID = 0; INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 9edbb17e8d1b..f0ee6e2be487 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -28,7 +28,6 @@ #include "ARCRuntimeEntryPoints.h" #include "BlotMapVector.h" #include "DependencyAnalysis.h" -#include "ObjCARCAliasAnalysis.h" #include "ProvenanceAnalysis.h" #include "PtrState.h" #include "llvm/ADT/DenseMap.h" @@ -36,6 +35,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" @@ -482,7 +482,7 @@ namespace { /// A flag indicating whether this optimization pass should run. bool Run; - /// Flags which determine whether each of the interesting runtine functions + /// Flags which determine whether each of the interesting runtime functions /// is in fact used in the current function. 
unsigned UsedInThisFunction; @@ -556,7 +556,7 @@ namespace { char ObjCARCOpt::ID = 0; INITIALIZE_PASS_BEGIN(ObjCARCOpt, "objc-arc", "ObjC ARC optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass) INITIALIZE_PASS_END(ObjCARCOpt, "objc-arc", "ObjC ARC optimization", false, false) @@ -565,8 +565,8 @@ Pass *llvm::createObjCARCOptPass() { } void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ObjCARCAliasAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ObjCARCAAWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); // ARC optimization doesn't currently split critical edges. AU.setPreservesCFG(); } @@ -581,16 +581,18 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { ImmutableCallSite CS(Arg); if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { - BasicBlock::const_iterator I = Call; + BasicBlock::const_iterator I(Call); ++I; - while (IsNoopInstruction(I)) ++I; + while (IsNoopInstruction(&*I)) + ++I; if (&*I == RetainRV) return false; } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock *RetainRVParent = RetainRV->getParent(); if (II->getNormalDest() == RetainRVParent) { BasicBlock::const_iterator I = RetainRVParent->begin(); - while (IsNoopInstruction(I)) ++I; + while (IsNoopInstruction(&*I)) + ++I; if (&*I == RetainRV) return false; } @@ -599,18 +601,21 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for being preceded by an objc_autoreleaseReturnValue on the same // pointer. In this case, we can delete the pair. - BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); + BasicBlock::iterator I = RetainRV->getIterator(), + Begin = RetainRV->getParent()->begin(); if (I != Begin) { - do --I; while (I != Begin && IsNoopInstruction(I)); - if (GetBasicARCInstKind(I) == ARCInstKind::AutoreleaseRV && - GetArgRCIdentityRoot(I) == Arg) { + do + --I; + while (I != Begin && IsNoopInstruction(&*I)); + if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV && + GetArgRCIdentityRoot(&*I) == Arg) { Changed = true; ++NumPeeps; DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n" << "Erasing " << *RetainRV << "\n"); - EraseInstruction(I); + EraseInstruction(&*I); EraseInstruction(RetainRV); return true; } @@ -1216,7 +1221,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { - Instruction *Inst = std::prev(I); + Instruction *Inst = &*std::prev(I); // Invoke instructions are visited as part of their successors (below). if (isa<InvokeInst>(Inst)) @@ -1264,7 +1269,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, Arg = GetArgRCIdentityRoot(Inst); TopDownPtrState &S = MyStates.getPtrTopDownState(Arg); NestingDetected |= S.InitTopDown(Class, Inst); - // A retain can be a potential use; procede to the generic checking + // A retain can be a potential use; proceed to the generic checking // code below. break; } @@ -1342,12 +1347,10 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, << "Performing Dataflow:\n"); // Visit all the instructions, top-down. 
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - Instruction *Inst = I; + for (Instruction &Inst : *BB) { + DEBUG(dbgs() << " Visiting " << Inst << "\n"); - DEBUG(dbgs() << " Visiting " << *Inst << "\n"); - - NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); + NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates); } DEBUG(llvm::dbgs() << "\nState Before Checking for CFG Hazards:\n" @@ -1413,16 +1416,15 @@ ComputePostOrders(Function &F, // Functions may have many exits, and there also blocks which we treat // as exits due to ignored edges. SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *ExitBB = I; - BBState &MyStates = BBStates[ExitBB]; + for (BasicBlock &ExitBB : F) { + BBState &MyStates = BBStates[&ExitBB]; if (!MyStates.isExit()) continue; MyStates.SetAsExit(); - PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); - Visited.insert(ExitBB); + PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin())); + Visited.insert(&ExitBB); while (!PredStack.empty()) { reverse_dfs_next_succ: BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); @@ -1830,7 +1832,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { // analysis too, but that would want caching. A better approach would be to // use the technique that EarlyCSE uses. inst_iterator Current = std::prev(I); - BasicBlock *CurrentBB = Current.getBasicBlockIterator(); + BasicBlock *CurrentBB = &*Current.getBasicBlockIterator(); for (BasicBlock::iterator B = CurrentBB->begin(), J = Current.getInstructionIterator(); J != B; --J) { @@ -2008,10 +2010,7 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain, // Check that the call is a regular call. ARCInstKind Class = GetBasicARCInstKind(Call); - if (Class != ARCInstKind::CallOrUser && Class != ARCInstKind::Call) - return false; - - return true; + return Class == ARCInstKind::CallOrUser || Class == ARCInstKind::Call; } /// Find a dependent retain that precedes the given autorelease for which there @@ -2081,9 +2080,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { SmallPtrSet<Instruction *, 4> DependingInstructions; SmallPtrSet<const BasicBlock *, 4> Visited; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - BasicBlock *BB = FI; - ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); + for (BasicBlock &BB: F) { + ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back()); DEBUG(dbgs() << "Visiting: " << *Ret << "\n"); @@ -2095,19 +2093,16 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // Look for an ``autorelease'' instruction that is a predecessor of Ret and // dependent on Arg such that there are no instructions dependent on Arg // that need a positive ref count in between the autorelease and Ret. 
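ComputePostOrders in the hunks above walks the CFG with an explicit stack of (block, next-edge) pairs instead of recursing, so deep CFGs cannot overflow the call stack. The same shape on a toy graph:

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct Node {
      std::vector<Node *> Succs;
      bool Visited = false;
    };

    // Append nodes reachable from Root to Out in post-order.
    static void postOrder(Node *Root, std::vector<Node *> &Out) {
      std::vector<std::pair<Node *, std::size_t>> Stack;
      Root->Visited = true;
      Stack.push_back({Root, 0});
      while (!Stack.empty()) {
        Node *N = Stack.back().first;
        if (Stack.back().second < N->Succs.size()) {
          Node *S = N->Succs[Stack.back().second++];
          if (!S->Visited) {
            S->Visited = true;
            Stack.push_back({S, 0});
          }
        } else {
          Out.push_back(N); // all successors emitted: N completes in post-order
          Stack.pop_back();
        }
      }
    }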
- CallInst *Autorelease = - FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret, - DependingInstructions, Visited, - PA); + CallInst *Autorelease = FindPredecessorAutoreleaseWithSafePath( + Arg, &BB, Ret, DependingInstructions, Visited, PA); DependingInstructions.clear(); Visited.clear(); if (!Autorelease) continue; - CallInst *Retain = - FindPredecessorRetainWithSafePath(Arg, BB, Autorelease, - DependingInstructions, Visited, PA); + CallInst *Retain = FindPredecessorRetainWithSafePath( + Arg, &BB, Autorelease, DependingInstructions, Visited, PA); DependingInstructions.clear(); Visited.clear(); @@ -2192,7 +2187,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) { DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>" "\n"); - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); #ifndef NDEBUG if (AreStatisticsEnabled()) { diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 0ac41d3ea326..1a12b659e5a3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -26,10 +26,10 @@ #define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/AliasAnalysis.h" namespace llvm { class Value; - class AliasAnalysis; class DataLayout; class PHINode; class SelectInst; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index 0be75af52014..c274e8182fb5 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -35,7 +35,7 @@ char PAEval::ID = 0; PAEval::PAEval() : FunctionPass(ID) {} void PAEval::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } static StringRef getName(Value *V) { @@ -65,7 +65,7 @@ bool PAEval::runOnFunction(Function &F) { } ProvenanceAnalysis PA; - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); const DataLayout &DL = F.getParent()->getDataLayout(); for (Value *V1 : Values) { @@ -89,6 +89,6 @@ FunctionPass *llvm::createPAEvalPass() { return new PAEval(); } INITIALIZE_PASS_BEGIN(PAEval, "pa-eval", "Evaluate ProvenanceAnalysis on all pairs", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(PAEval, "pa-eval", "Evaluate ProvenanceAnalysis on all pairs", false, true) diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp index ae20e7e6d347..df64fa32f3f8 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp @@ -256,9 +256,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, // one of its successor blocks, since we can't insert code after it // in its own block, and we don't want to split critical edges. 
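The getFirstInsertionPt hunks that follow encode a general rule: new code can never be placed after a terminator in its own block, so for an invoke the insertion point must move into a successor. A hedged sketch of that choice (illustrative helper; it assumes the normal destination has no other predecessors, since the pass refuses to split critical edges, and that I is only a terminator when it is an invoke):

    #include <iterator>
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"

    static llvm::Instruction *insertionPointAfter(llvm::Instruction *I) {
      if (auto *II = llvm::dyn_cast<llvm::InvokeInst>(I))
        return &*II->getNormalDest()->getFirstInsertionPt();
      return &*std::next(I->getIterator()); // I must not be a terminator here
    }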
if (isa<InvokeInst>(Inst)) - InsertReverseInsertPt(BB->getFirstInsertionPt()); + InsertReverseInsertPt(&*BB->getFirstInsertionPt()); else - InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + InsertReverseInsertPt(&*++Inst->getIterator()); SetSeq(S_Use); } else if (Seq == S_Release && IsUser(Class)) { DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; " @@ -268,9 +268,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, assert(!HasReverseInsertPts()); // As above; handle invoke specially. if (isa<InvokeInst>(Inst)) - InsertReverseInsertPt(BB->getFirstInsertionPt()); + InsertReverseInsertPt(&*BB->getFirstInsertionPt()); else - InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + InsertReverseInsertPt(&*++Inst->getIterator()); } break; case S_Stop: diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h index e45e1ea96c53..9749e44822b2 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h @@ -17,8 +17,8 @@ #ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H #define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H -#include "ARCInstKind.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Value.h" #include "llvm/Support/raw_ostream.h" @@ -96,7 +96,7 @@ struct RRInfo { }; /// \brief This class summarizes several per-pointer runtime properties which -/// are propogated through the flow graph. +/// are propagated through the flow graph. class PtrState { protected: /// True if the reference count is known to be incremented. @@ -172,7 +172,7 @@ struct BottomUpPtrState : PtrState { bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I); /// Return true if this set of releases can be paired with a release. Modifies - /// state appropriately to reflect that the matching occured if it is + /// state appropriately to reflect that the matching occurred if it is /// successful. /// /// It is assumed that one has already checked that the RCIdentity of the @@ -194,7 +194,7 @@ struct TopDownPtrState : PtrState { /// Return true if this set of retains can be paired with the given /// release. Modifies state appropriately to reflect that the matching - /// occured. + /// occurred. 
bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release); void HandlePotentialUse(Instruction *Inst, const Value *Ptr, diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index d6fc91641588..590a52da6b19 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -1,4 +1,4 @@ -//===- DCE.cpp - Code to perform dead code elimination --------------------===// +//===- ADCE.cpp - Code to perform dead code elimination -------------------===// // // The LLVM Compiler Infrastructure // @@ -14,52 +14,33 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "adce" STATISTIC(NumRemoved, "Number of instructions removed"); -namespace { -struct ADCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) { - initializeADCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; - - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - } -}; -} - -char ADCE::ID = 0; -INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) - -bool ADCE::runOnFunction(Function& F) { - if (skipOptnoneFunction(F)) - return false; - +static bool aggressiveDCE(Function& F) { SmallPtrSet<Instruction*, 128> Alive; SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { + for (Instruction &I : instructions(F)) { + if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() || + I.mayHaveSideEffects()) { Alive.insert(&I); Worklist.push_back(&I); } @@ -79,7 +60,7 @@ bool ADCE::runOnFunction(Function& F) { // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. // NOTE: We reuse the Worklist vector here for memory efficiency. 
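ADCE's structure, unchanged by the refactor below, is a classic two-phase mark and sweep: seed a worklist with the always-live roots (terminators, EH pads, side-effecting instructions), propagate liveness backwards through operands to a fixed point, then delete whatever was never marked. A standalone model of the mark phase:

    #include <unordered_set>
    #include <vector>

    struct Inst {
      std::vector<Inst *> Operands;
      bool Root = false; // terminator, EH pad, or has side effects
    };

    static std::unordered_set<Inst *> markLive(const std::vector<Inst *> &All) {
      std::unordered_set<Inst *> Alive;
      std::vector<Inst *> Worklist;
      for (Inst *I : All)
        if (I->Root && Alive.insert(I).second)
          Worklist.push_back(I);
      while (!Worklist.empty()) {
        Inst *I = Worklist.back();
        Worklist.pop_back();
        for (Inst *Op : I->Operands)
          if (Alive.insert(Op).second)  // newly discovered live instruction
            Worklist.push_back(Op);
      }
      return Alive; // everything outside this set is deletable
    }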
- for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { if (!Alive.count(&I)) { Worklist.push_back(&I); I.dropAllReferences(); @@ -94,6 +75,34 @@ bool ADCE::runOnFunction(Function& F) { return !Worklist.empty(); } -FunctionPass *llvm::createAggressiveDCEPass() { - return new ADCE(); +PreservedAnalyses ADCEPass::run(Function &F) { + if (aggressiveDCE(F)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } + +namespace { +struct ADCELegacyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCELegacyPass() : FunctionPass(ID) { + initializeADCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function& F) override { + if (skipOptnoneFunction(F)) + return false; + return aggressiveDCE(F); + } + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} + +char ADCELegacyPass::ID = 0; +INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", + false, false) + +FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 8918909f484a..4b721d38adba 100644 --- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -21,6 +21,8 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -54,13 +56,15 @@ struct AlignmentFromAssumptions : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); } // For memory transfers, we need a common alignment for both the source and @@ -84,7 +88,7 @@ INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) @@ -249,8 +253,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, // The mask must have some trailing ones (otherwise the condition is // trivial and tells us nothing about the alignment of the left operand). 
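The arithmetic behind the trailing-ones requirement: the assumption being decomposed has the shape ((ptrtoint ptr) & mask) == 0, and if mask ends in N one bits then ptr's low N bits are zero, i.e. ptr is 2^N-aligned. A mask with no trailing ones proves nothing, which is exactly the early return above. In standalone form:

    #include <cstdint>

    // Alignment in bytes provable from "(ptr & Mask) == 0": if Mask has N
    // trailing one bits, the low N bits of ptr are zero, so ptr is 2^N-aligned.
    static std::uint64_t alignmentFromMask(std::uint64_t Mask) {
      unsigned TrailingOnes = 0;
      while ((Mask & 1) && TrailingOnes < 64) {
        ++TrailingOnes;
        Mask >>= 1;
      }
      return TrailingOnes < 64 ? (std::uint64_t(1) << TrailingOnes)
                               : ~std::uint64_t(0); // all-ones mask: ptr == 0
    }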
- unsigned TrailingOnes = - MaskSCEV->getValue()->getValue().countTrailingOnes(); + unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes(); if (!TrailingOnes) return false; @@ -270,7 +273,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, OffSCEV = nullptr; if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) { AAPtr = PToI->getPointerOperand(); - OffSCEV = SE->getConstant(Int64Ty, 0); + OffSCEV = SE->getZero(Int64Ty); } else if (const SCEVAddExpr* AndLHSAddSCEV = dyn_cast<SCEVAddExpr>(AndLHSSCEV)) { // Try to find the ptrtoint; subtract it and the rest is the offset. @@ -410,7 +413,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { bool AlignmentFromAssumptions::runOnFunction(Function &F) { bool Changed = false; auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); NewDestAlignments.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp index 09c605e76737..cb9b8b6fffc8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -15,26 +15,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; #define DEBUG_TYPE "bdce" @@ -53,342 +45,42 @@ struct BDCE : public FunctionPass { void getAnalysisUsage(AnalysisUsage& AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<DemandedBits>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - - void determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2); - - AssumptionCache *AC; - DominatorTree *DT; }; } char BDCE::ID = 0; INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -static bool isAlwaysLive(Instruction *I) { - return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I->mayHaveSideEffects(); -} - -void BDCE::determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2) { - unsigned BitWidth = AB.getBitWidth(); - - // We're called once per 
operand, but for some instructions, we need to - // compute known bits of both operands in order to determine the live bits of - // either (when both operands are instructions themselves). We don't, - // however, want to do this twice, so we cache the result in APInts that live - // in the caller. For the two-relevant-operands case, both operand values are - // provided here. - auto ComputeKnownBits = - [&](unsigned BitWidth, const Value *V1, const Value *V2) { - const DataLayout &DL = I->getModule()->getDataLayout(); - KnownZero = APInt(BitWidth, 0); - KnownOne = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, - AC, UserI, DT); - - if (V2) { - KnownZero2 = APInt(BitWidth, 0); - KnownOne2 = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, - 0, AC, UserI, DT); - } - }; - - switch (UserI->getOpcode()) { - default: break; - case Instruction::Call: - case Instruction::Invoke: - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::bswap: - // The alive bits of the input are the swapped alive bits of - // the output. - AB = AOut.byteSwap(); - break; - case Intrinsic::ctlz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the left of, and including, the leftmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getHighBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countLeadingZeros()+1)); - } - break; - case Intrinsic::cttz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the right of, and including, the rightmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getLowBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countTrailingZeros()+1)); - } - break; - } - break; - case Instruction::Add: - case Instruction::Sub: - // Find the highest live output bit. We don't need any more input - // bits than that (adds, and thus subtracts, ripple only to the - // left). - AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); - break; - case Instruction::Shl: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.lshr(ShiftAmt); - - // If the shift is nuw/nsw, then the high bits are not dead - // (because we've promised that they *must* be zero). - const ShlOperator *S = cast<ShlOperator>(UserI); - if (S->hasNoSignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); - else if (S->hasNoUnsignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::LShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<LShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::AShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. 
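Two of the transfer functions above, restated on plain 32-bit masks to show the dataflow direction: liveness flows from a user's demanded output bits (AOut) back to the bits it demands of each operand. For add/sub, only bits up to the highest live output bit matter because carries ripple left only; for shl by a constant C, the operand's live bits are the output's shifted right by C (the extra nuw/nsw high bits are omitted here):

    #include <cstdint>

    // Operand bits demanded by an add/sub whose demanded result bits are AOut.
    static std::uint32_t demandedForAdd(std::uint32_t AOut) {
      std::uint32_t Mask = 0;
      while (AOut) {          // set all bits up to the highest live output bit
        Mask = (Mask << 1) | 1;
        AOut >>= 1;
      }
      return Mask;
    }

    // Operand bits demanded by "shl %x, C" with demanded result bits AOut.
    static std::uint32_t demandedForShl(std::uint32_t AOut, unsigned C) {
      return AOut >> C;       // output bit i was produced by input bit i - C
    }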
- if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) - .getBoolValue()) - AB.setBit(BitWidth-1); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<AShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::And: - AB = AOut; - - // For bits that are known zero, the corresponding bits in the - // other operand are dead (unless they're both zero, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownZero2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownZero & ~KnownZero2); - } - break; - case Instruction::Or: - AB = AOut; - - // For bits that are known one, the corresponding bits in the - // other operand are dead (unless they're both one, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownOne2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownOne & ~KnownOne2); - } - break; - case Instruction::Xor: - case Instruction::PHI: - AB = AOut; - break; - case Instruction::Trunc: - AB = AOut.zext(BitWidth); - break; - case Instruction::ZExt: - AB = AOut.trunc(BitWidth); - break; - case Instruction::SExt: - AB = AOut.trunc(BitWidth); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. - if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), - AOut.getBitWidth() - BitWidth)) - .getBoolValue()) - AB.setBit(BitWidth-1); - break; - case Instruction::Select: - if (OperandNo != 0) - AB = AOut; - break; - } -} - bool BDCE::runOnFunction(Function& F) { if (skipOptnoneFunction(F)) return false; + DemandedBits &DB = getAnalysis<DemandedBits>(); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - DenseMap<Instruction *, APInt> AliveBits; SmallVector<Instruction*, 128> Worklist; - - // The set of visited instructions (non-integer-typed only). - SmallPtrSet<Instruction*, 128> Visited; - - // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (!isAlwaysLive(&I)) - continue; - - DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); - // For integer-valued instructions, set up an initial empty set of alive - // bits and add the instruction to the work list. For other instructions - // add their operands to the work list (for integer values operands, mark - // all bits as live). - if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { - if (!AliveBits.count(&I)) { - AliveBits[&I] = APInt(IT->getBitWidth(), 0); - Worklist.push_back(&I); - } - - continue; - } - - // Non-integer-typed instructions... - for (Use &OI : I.operands()) { - if (Instruction *J = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) - AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); - Worklist.push_back(J); - } - } - // To save memory, we don't add I to the Visited set here. 
Instead, we - // check isAlwaysLive on every instruction when searching for dead - // instructions later (we need to check isAlwaysLive for the - // integer-typed instructions anyway). - } - - // Propagate liveness backwards to operands. - while (!Worklist.empty()) { - Instruction *UserI = Worklist.pop_back_val(); - - DEBUG(dbgs() << "BDCE: Visiting: " << *UserI); - APInt AOut; - if (UserI->getType()->isIntegerTy()) { - AOut = AliveBits[UserI]; - DEBUG(dbgs() << " Alive Out: " << AOut); - } - DEBUG(dbgs() << "\n"); - - if (!UserI->getType()->isIntegerTy()) - Visited.insert(UserI); - - APInt KnownZero, KnownOne, KnownZero2, KnownOne2; - // Compute the set of alive bits for each operand. These are anded into the - // existing set, if any, and if that changes the set of alive bits, the - // operand is added to the work-list. - for (Use &OI : UserI->operands()) { - if (Instruction *I = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { - unsigned BitWidth = IT->getBitWidth(); - APInt AB = APInt::getAllOnesValue(BitWidth); - if (UserI->getType()->isIntegerTy() && !AOut && - !isAlwaysLive(UserI)) { - AB = APInt(BitWidth, 0); - } else { - // If all bits of the output are dead, then all bits of the input - // Bits of each operand that are used to compute alive bits of the - // output are alive, all others are dead. - determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, - KnownZero, KnownOne, - KnownZero2, KnownOne2); - } - - // If we've added to the set of alive bits (or the operand has not - // been previously visited), then re-queue the operand to be visited - // again. - APInt ABPrev(BitWidth, 0); - auto ABI = AliveBits.find(I); - if (ABI != AliveBits.end()) - ABPrev = ABI->second; - - APInt ABNew = AB | ABPrev; - if (ABNew != ABPrev || ABI == AliveBits.end()) { - AliveBits[I] = std::move(ABNew); - Worklist.push_back(I); - } - } else if (!Visited.count(I)) { - Worklist.push_back(I); - } - } - } - } - bool Changed = false; - // The inverse of the live set is the dead set. These are those instructions - // which have no side effects and do not influence the control flow or return - // value of the function, and may therefore be deleted safely. - // NOTE: We reuse the Worklist vector here for memory efficiency. - for (Instruction &I : inst_range(F)) { - // For live instructions that have all dead bits, first make them dead by - // replacing all uses with something else. Then, if they don't need to - // remain live (because they have side effects, etc.) we can remove them. - if (I.getType()->isIntegerTy()) { - auto ABI = AliveBits.find(&I); - if (ABI != AliveBits.end()) { - if (ABI->second.getBoolValue()) - continue; - - DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); - // FIXME: In theory we could substitute undef here instead of zero. - // This should be reconsidered once we settle on the semantics of - // undef, poison, etc. - Value *Zero = ConstantInt::get(I.getType(), 0); - ++NumSimplified; - I.replaceAllUsesWith(Zero); - Changed = true; - } - } else if (Visited.count(&I)) { - continue; + for (Instruction &I : instructions(F)) { + if (I.getType()->isIntegerTy() && + !DB.getDemandedBits(&I).getBoolValue()) { + // For live instructions that have all dead bits, first make them dead by + // replacing all uses with something else. Then, if they don't need to + // remain live (because they have side effects, etc.) we can remove them. 
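For reference, the demanded-bits query that drives this trivialization reduces to a one-line test. A minimal sketch, assuming only the DemandedBits interface this patch already uses (the helper name allBitsDead is ours):

#include "llvm/Analysis/DemandedBits.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// An integer-typed instruction is trivializable when no user demands any
// bit of its result, i.e. the demanded-bits mask is all zeroes.
static bool allBitsDead(Instruction &I, DemandedBits &DB) {
  return I.getType()->isIntegerTy() && !DB.getDemandedBits(&I).getBoolValue();
}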
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); + // FIXME: In theory we could substitute undef here instead of zero. + // This should be reconsidered once we settle on the semantics of + // undef, poison, etc. + Value *Zero = ConstantInt::get(I.getType(), 0); + ++NumSimplified; + I.replaceAllUsesWith(Zero); + Changed = true; } - - if (isAlwaysLive(&I)) + if (!DB.isInstructionDead(&I)) continue; Worklist.push_back(&I); diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 4288742dd3eb..84f7f5fff5b5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -223,10 +223,10 @@ Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, } // The simple and common case. This also includes constant expressions. - if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst)) + if (!isa<PHINode>(Inst) && !Inst->isEHPad()) return Inst; - // We can't insert directly before a phi node or landing pad. Insert before + // We can't insert directly before a phi node or an eh pad. Insert before // the terminator of the incoming or dominating block. assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!"); if (Idx != ~0U && isa<PHINode>(Inst)) @@ -365,9 +365,9 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, /// into an instruction itself. void ConstantHoisting::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (Function::iterator BB : Fn) - for (BasicBlock::iterator Inst : *BB) - collectConstantCandidates(ConstCandMap, Inst); + for (BasicBlock &BB : Fn) + for (Instruction &Inst : BB) + collectConstantCandidates(ConstCandMap, &Inst); } /// \brief Find the base constant within the given range and rebase all other diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 79624b2e4c47..686bd4071104 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/IR/CFG.h" @@ -32,6 +33,7 @@ STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); STATISTIC(NumCmps, "Number of comparisons propagated"); +STATISTIC(NumReturns, "Number of return values propagated"); STATISTIC(NumDeadCases, "Number of switch cases removed"); namespace { @@ -43,6 +45,11 @@ namespace { bool processMemAccess(Instruction *I); bool processCmp(CmpInst *C); bool processSwitch(SwitchInst *SI); + bool processCallSite(CallSite CS); + + /// Return a constant value for V usable at At and everything it + /// dominates. If no such Constant can be found, return nullptr. 
+ Constant *getConstantAt(Value *V, Instruction *At); public: static char ID; @@ -54,6 +61,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } @@ -178,44 +186,33 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { return true; } -/// processCmp - If the value of this comparison could be determined locally, -/// constant propagation would already have figured it out. Instead, walk -/// the predecessors and statically evaluate the comparison based on information -/// available on that edge. If a given static evaluation is true on ALL -/// incoming edges, then it's true universally and we can simplify the compare. +/// processCmp - See if LazyValueInfo's ability to exploit edge conditions, +/// or range information is sufficient to prove this comparison. Even for +/// local conditions, this can sometimes prove conditions instcombine can't by +/// exploiting range information. bool CorrelatedValuePropagation::processCmp(CmpInst *C) { Value *Op0 = C->getOperand(0); - if (isa<Instruction>(Op0) && - cast<Instruction>(Op0)->getParent() == C->getParent()) - return false; - Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); if (!Op1) return false; - pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); - if (PI == PE) return false; + // As a policy choice, we choose not to waste compile time on anything where + // the comparison is testing local values. While LVI can sometimes reason + // about such cases, it's not its primary purpose. We do make sure to do + // the block local query for uses from terminator instructions, but that's + // handled in the code for each terminator. + auto *I = dyn_cast<Instruction>(Op0); + if (I && I->getParent() == C->getParent()) + return false; - LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C); if (Result == LazyValueInfo::Unknown) return false; - ++PI; - while (PI != PE) { - LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); - if (Res != Result) return false; - ++PI; - } - ++NumCmps; - if (Result == LazyValueInfo::True) C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); else C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); - C->eraseFromParent(); return true; @@ -307,6 +304,59 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { return Changed; } +/// processCallSite - Infer nonnull attributes for the arguments at the +/// specified callsite. 
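The per-argument query that processCallSite performs below can be sketched in isolation. A minimal sketch, assuming the LazyValueInfo::getPredicateAt interface the patch itself calls (the helper name provablyNonNull is ours):

#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// A pointer argument may be marked nonnull when LVI can prove that
// "V == null" is false at the call site.
static bool provablyNonNull(Value *V, Instruction *At, LazyValueInfo *LVI) {
  auto *PTy = dyn_cast<PointerType>(V->getType());
  if (!PTy)
    return false; // only pointer arguments are candidates
  return LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
                             ConstantPointerNull::get(PTy),
                             At) == LazyValueInfo::False;
}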
+bool CorrelatedValuePropagation::processCallSite(CallSite CS) { + SmallVector<unsigned, 4> Indices; + unsigned ArgNo = 0; + + for (Value *V : CS.args()) { + PointerType *Type = dyn_cast<PointerType>(V->getType()); + + if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && + LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, + ConstantPointerNull::get(Type), + CS.getInstruction()) == LazyValueInfo::False) + Indices.push_back(ArgNo + 1); + ArgNo++; + } + + assert(ArgNo == CS.arg_size() && "sanity check"); + + if (Indices.empty()) + return false; + + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + + return true; +} + +Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { + if (Constant *C = LVI->getConstant(V, At->getParent(), At)) + return C; + + // TODO: The following really should be sunk inside LVI's core algorithm, or + // at least the outer shims around such. + auto *C = dyn_cast<CmpInst>(V); + if (!C) return nullptr; + + Value *Op0 = C->getOperand(0); + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); + if (!Op1) return nullptr; + + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At); + if (Result == LazyValueInfo::Unknown) + return nullptr; + + return (Result == LazyValueInfo::True) ? + ConstantInt::getTrue(C->getContext()) : + ConstantInt::getFalse(C->getContext()); +} + bool CorrelatedValuePropagation::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -318,7 +368,7 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { bool BBChanged = false; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { - Instruction *II = BI++; + Instruction *II = &*BI++; switch (II->getOpcode()) { case Instruction::Select: BBChanged |= processSelect(cast<SelectInst>(II)); @@ -334,6 +384,10 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Store: BBChanged |= processMemAccess(II); break; + case Instruction::Call: + case Instruction::Invoke: + BBChanged |= processCallSite(CallSite(II)); + break; } } @@ -342,7 +396,21 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Switch: BBChanged |= processSwitch(cast<SwitchInst>(Term)); break; + case Instruction::Ret: { + auto *RI = cast<ReturnInst>(Term); + // Try to determine the return value if we can. This is mainly here to + // simplify the writing of unit tests, but also helps to enable IPO by + // constant folding the return values of callees. 
+ auto *RetVal = RI->getReturnValue(); + if (!RetVal) break; // handle "ret void" + if (isa<Constant>(RetVal)) break; // nothing to do + if (auto *C = getConstantAt(RetVal, RI)) { + ++NumReturns; + RI->replaceUsesOfWith(RetVal, C); + BBChanged = true; + } } + }; FnChanged |= BBChanged; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index 3b262a23091f..b67c3c7742fd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" @@ -46,7 +47,7 @@ namespace { TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { Inst->eraseFromParent(); Changed = true; @@ -92,6 +93,34 @@ namespace { char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) +static bool DCEInstruction(Instruction *I, + SmallSetVector<Instruction *, 16> &WorkList, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI, TLI)) + WorkList.insert(OpI); + } + + I->eraseFromParent(); + ++DCEEliminated; + return true; + } + return false; +} + bool DCE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -99,39 +128,24 @@ bool DCE::runOnFunction(Function &F) { auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - // Start out with all of the instructions in the worklist... - std::vector<Instruction*> WorkList; - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) - WorkList.push_back(&*i); - - // Loop over the worklist finding instructions that are dead. If they are - // dead make them drop all of their uses, making other instructions - // potentially dead, and work until the worklist is empty. - // bool MadeChange = false; + SmallSetVector<Instruction *, 16> WorkList; + // Iterate over the original function, only adding insts to the worklist + // if they actually need to be revisited. This avoids having to pre-init + // the worklist with the entire function's worth of instructions. + for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) { + Instruction *I = &*FI; + ++FI; + + // We're visiting this instruction now, so make sure it's not in the + // worklist from an earlier visit. + if (!WorkList.count(I)) + MadeChange |= DCEInstruction(I, WorkList, TLI); + } + while (!WorkList.empty()) { - Instruction *I = WorkList.back(); - WorkList.pop_back(); - - if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead. 
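The rewritten DCE relies on SmallSetVector for its worklist, so an instruction rediscovered while nulling out operands is queued at most once and membership tests stay cheap. A small self-contained sketch of those set-plus-stack semantics (illustrative only):

#include "llvm/ADT/SetVector.h"
#include <cassert>

int main() {
  llvm::SmallSetVector<int, 16> WorkList;
  WorkList.insert(1);
  WorkList.insert(2);
  WorkList.insert(1);                 // duplicate insert is a no-op
  assert(WorkList.size() == 2);
  int Last = WorkList.pop_back_val(); // LIFO pop: removes and returns 2
  assert(Last == 2 && WorkList.size() == 1);
  return 0;
}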
- // Loop over all of the values that the instruction uses, if there are - // instructions being used, add them to the worklist, because they might - // go dead after this one is removed. - // - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *Used = dyn_cast<Instruction>(*OI)) - WorkList.push_back(Used); - - // Remove the instruction. - I->eraseFromParent(); - - // Remove the instruction from the worklist if it still exists in it. - WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), - WorkList.end()); - - MadeChange = true; - ++DCEEliminated; - } + Instruction *I = WorkList.pop_back_val(); + MadeChange |= DCEInstruction(I, WorkList, TLI); } return MadeChange; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index c50558434da2..36ad0a5f7b91 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -40,6 +41,7 @@ using namespace llvm; #define DEBUG_TYPE "dse" +STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -59,23 +61,24 @@ namespace { if (skipOptnoneFunction(F)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = AA->getTargetLibraryInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock &I : F) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
- if (DT->isReachableFromEntry(I)) - Changed |= runOnBasicBlock(*I); + if (DT->isReachableFromEntry(&I)) + Changed |= runOnBasicBlock(I); AA = nullptr; MD = nullptr; DT = nullptr; return Changed; } bool runOnBasicBlock(BasicBlock &BB); + bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const MemoryLocation &LoadedLoc, @@ -85,10 +88,11 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } }; @@ -97,8 +101,10 @@ namespace { char DSE::ID = 0; INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } @@ -115,7 +121,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, - const TargetLibraryInfo *TLI, + const TargetLibraryInfo &TLI, SmallSetVector<Value*, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -140,7 +146,7 @@ static void DeleteDeadInstruction(Instruction *I, if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI, TLI)) + if (isInstructionTriviallyDead(OpI, &TLI)) NowDeadInsts.push_back(OpI); } @@ -153,7 +159,7 @@ static void DeleteDeadInstruction(Instruction *I, /// hasMemoryWrite - Does this instruction write some memory? This only returns /// true for things that we can analyze with other helpers below. 
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {
+static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
 if (isa<StoreInst>(I))
 return true;
 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
@@ -170,20 +176,20 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {
 }
 if (auto CS = CallSite(I)) {
 if (Function *F = CS.getCalledFunction()) {
- if (TLI && TLI->has(LibFunc::strcpy) &&
- F->getName() == TLI->getName(LibFunc::strcpy)) {
+ if (TLI.has(LibFunc::strcpy) &&
+ F->getName() == TLI.getName(LibFunc::strcpy)) {
 return true;
 }
- if (TLI && TLI->has(LibFunc::strncpy) &&
- F->getName() == TLI->getName(LibFunc::strncpy)) {
+ if (TLI.has(LibFunc::strncpy) &&
+ F->getName() == TLI.getName(LibFunc::strncpy)) {
 return true;
 }
- if (TLI && TLI->has(LibFunc::strcat) &&
- F->getName() == TLI->getName(LibFunc::strcat)) {
+ if (TLI.has(LibFunc::strcat) &&
+ F->getName() == TLI.getName(LibFunc::strcat)) {
 return true;
 }
- if (TLI && TLI->has(LibFunc::strncat) &&
- F->getName() == TLI->getName(LibFunc::strncat)) {
+ if (TLI.has(LibFunc::strncat) &&
+ F->getName() == TLI.getName(LibFunc::strncat)) {
 return true;
 }
 }
@@ -224,9 +230,9 @@ static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
 /// getLocForRead - Return the location read by the specified "hasMemoryWrite"
 /// instruction if any.
-static MemoryLocation getLocForRead(Instruction *Inst, AliasAnalysis &AA) {
- assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) &&
- "Unknown instruction case");
+static MemoryLocation getLocForRead(Instruction *Inst,
+ const TargetLibraryInfo &TLI) {
+ assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
 // The only instructions that both read and write are the mem transfer
 // instructions (memcpy/memmove).
@@ -313,9 +319,9 @@ static Value *getStoredPointerOperand(Instruction *I) {
 }
 static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo &TLI) {
 uint64_t Size;
- if (getObjectSize(V, Size, DL, TLI))
+ if (getObjectSize(V, Size, DL, &TLI))
 return Size;
 return MemoryLocation::UnknownSize;
 }
@@ -336,7 +342,7 @@ namespace {
 static OverwriteResult isOverwrite(const MemoryLocation &Later,
 const MemoryLocation &Earlier,
 const DataLayout &DL,
- const TargetLibraryInfo *TLI,
+ const TargetLibraryInfo &TLI,
 int64_t &EarlierOff, int64_t &LaterOff) {
 const Value *P1 = Earlier.Ptr->stripPointerCasts();
 const Value *P2 = Later.Ptr->stripPointerCasts();
@@ -442,10 +448,12 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
 /// because the DSE inducing instruction may be a self-read.
 static bool isPossibleSelfRead(Instruction *Inst,
 const MemoryLocation &InstStoreLoc,
- Instruction *DepWrite, AliasAnalysis &AA) {
+ Instruction *DepWrite,
+ const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA) {
 // Self reads can only happen for instructions that read memory. Get the
 // location read.
- MemoryLocation InstReadLoc = getLocForRead(Inst, AA);
+ MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
 if (!InstReadLoc.Ptr) return false; // Not a reading instruction.
 // If the read and written loc obviously don't alias, it isn't a read.
@@ -459,7 +467,7 @@ static bool isPossibleSelfRead(Instruction *Inst,
 // Here we don't know if A/B may alias, but we do know that B/B are must
 // aliases, so removing the first memcpy is safe (assuming it writes <= #
 // bytes as the second one).
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, AA); + MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI); if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; @@ -475,11 +483,12 @@ static bool isPossibleSelfRead(Instruction *Inst, //===----------------------------------------------------------------------===// bool DSE::runOnBasicBlock(BasicBlock &BB) { + const DataLayout &DL = BB.getModule()->getDataLayout(); bool MadeChange = false; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { - Instruction *Inst = BBI++; + Instruction *Inst = &*BBI++; // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst, TLI)) { @@ -488,42 +497,68 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } // If we find something that writes memory, get its memory dependence. - if (!hasMemoryWrite(Inst, TLI)) - continue; - - MemDepResult InstDep = MD->getDependency(Inst); - - // Ignore any store where we can't find a local dependence. - // FIXME: cross-block DSE would be fun. :) - if (!InstDep.isDef() && !InstDep.isClobber()) + if (!hasMemoryWrite(Inst, *TLI)) continue; // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { + + auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) { + // DeleteDeadInstruction can delete the current instruction. Save BBI + // in case we need it. + WeakVH NextInst(&*BBI); + + DeleteDeadInstruction(DeadInst, *MD, *TLI); + + if (!NextInst) // Next instruction deleted. + BBI = BB.begin(); + else if (BBI != BB.begin()) // Revisit this instruction if possible. + --BBI; + ++NumRedundantStores; + MadeChange = true; + }; + + if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && isRemovable(SI)) { + isRemovable(SI) && + MemoryIsNotModifiedBetween(DepLoad, SI)) { + DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - // DeleteDeadInstruction can delete the current instruction. Save BBI - // in case we need it. - WeakVH NextInst(BBI); + RemoveDeadInstAndUpdateBBI(SI); + continue; + } + } - DeleteDeadInstruction(SI, *MD, TLI); + // Remove null stores into the calloc'ed objects + Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand()); - if (!NextInst) // Next instruction deleted. - BBI = BB.begin(); - else if (BBI != BB.begin()) // Revisit this instruction if possible. - --BBI; - ++NumFastStores; - MadeChange = true; + if (StoredConstant && StoredConstant->isNullValue() && + isRemovable(SI)) { + Instruction *UnderlyingPointer = dyn_cast<Instruction>( + GetUnderlyingObject(SI->getPointerOperand(), DL)); + + if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && + MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) { + DEBUG(dbgs() + << "DSE: Remove null store to the calloc'ed object:\n DEAD: " + << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); + + RemoveDeadInstAndUpdateBBI(SI); continue; } } } + MemDepResult InstDep = MD->getDependency(Inst); + + // Ignore any store where we can't find a local dependence. + // FIXME: cross-block DSE would be fun. :) + if (!InstDep.isDef() && !InstDep.isClobber()) + continue; + // Figure out what location is being stored to. 
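In source-level terms, the store-of-load case handled above looks roughly like this (a hypothetical C-style example, not part of the patch):

// The store writes back exactly the value just loaded. When no instruction
// between the two may modify *p (MemoryIsNotModifiedBetween), the store is
// redundant and can be deleted.
int writeback(int *p) {
  int tmp = *p; /* load */
  /* ... code that provably never writes *p ... */
  *p = tmp;     /* store of the same value back: dead */
  return tmp;
}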
MemoryLocation Loc = getLocForWrite(Inst, *AA); @@ -549,24 +584,22 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. if (isRemovable(DepWrite) && - !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { + !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - const DataLayout &DL = BB.getModule()->getDataLayout(); OverwriteResult OR = - isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(), - DepWriteOffset, InstWriteOffset); + isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD, TLI); + DeleteDeadInstruction(DepWrite, *MD, *TLI); ++NumFastStores; MadeChange = true; // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. - BBI = Inst; + BBI = Inst->getIterator(); if (BBI != BB.begin()) --BBI; break; @@ -609,10 +642,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) + if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) break; - InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); + InstDep = MD->getPointerDependencyFrom(Loc, false, + DepWrite->getIterator(), &BB); } } @@ -624,6 +658,64 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { return MadeChange; } +/// Returns true if the memory which is accessed by the second instruction is not +/// modified between the first and the second instruction. +/// Precondition: Second instruction must be dominated by the first +/// instruction. +bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, + Instruction *SecondI) { + SmallVector<BasicBlock *, 16> WorkList; + SmallPtrSet<BasicBlock *, 8> Visited; + BasicBlock::iterator FirstBBI(FirstI); + ++FirstBBI; + BasicBlock::iterator SecondBBI(SecondI); + BasicBlock *FirstBB = FirstI->getParent(); + BasicBlock *SecondBB = SecondI->getParent(); + MemoryLocation MemLoc = MemoryLocation::get(SecondI); + + // Start checking the store-block. + WorkList.push_back(SecondBB); + bool isFirstBlock = true; + + // Check all blocks going backward until we reach the load-block. + while (!WorkList.empty()) { + BasicBlock *B = WorkList.pop_back_val(); + + // Ignore instructions before LI if this is the FirstBB. + BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin()); + + BasicBlock::iterator EI; + if (isFirstBlock) { + // Ignore instructions after SI if this is the first visit of SecondBB. + assert(B == SecondBB && "first block is not the store block"); + EI = SecondBBI; + isFirstBlock = false; + } else { + // It's not SecondBB or (in case of a loop) the second visit of SecondBB. + // In this case we also have to look at instructions after SI. 
+ EI = B->end(); + } + for (; BI != EI; ++BI) { + Instruction *I = &*BI; + if (I->mayWriteToMemory() && I != SecondI) { + auto Res = AA->getModRefInfo(I, MemLoc); + if (Res != MRI_NoModRef) + return false; + } + } + if (B != FirstBB) { + assert(B != &FirstBB->getParent()->getEntryBlock() && + "Should not hit the entry block because SI must be dominated by LI"); + for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) { + if (!Visited.insert(*PredI).second) + continue; + WorkList.push_back(*PredI); + } + } + } + return true; +} + /// Find all blocks that will unconditionally lead to the block BB and append /// them to F. static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, @@ -655,10 +747,11 @@ bool DSE::HandleFree(CallInst *F) { Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; - MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); + MemDepResult Dep = + MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) + if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = @@ -668,10 +761,10 @@ bool DSE::HandleFree(CallInst *F) { if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - Instruction *Next = std::next(BasicBlock::iterator(Dependency)); + auto Next = ++Dependency->getIterator(); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD, TLI); + DeleteDeadInstruction(Dependency, *MD, *TLI); ++NumFastStores; MadeChange = true; @@ -704,23 +797,22 @@ bool DSE::handleEndBlock(BasicBlock &BB) { SmallSetVector<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. - BasicBlock *Entry = BB.getParent()->begin(); - for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { - if (isa<AllocaInst>(I)) - DeadStackObjects.insert(I); + BasicBlock &Entry = BB.getParent()->front(); + for (Instruction &I : Entry) { + if (isa<AllocaInst>(&I)) + DeadStackObjects.insert(&I); // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) - DeadStackObjects.insert(I); + else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true)) + DeadStackObjects.insert(&I); } // Treat byval or inalloca arguments the same, stores to them are dead at the // end of the function. - for (Function::arg_iterator AI = BB.getParent()->arg_begin(), - AE = BB.getParent()->arg_end(); AI != AE; ++AI) - if (AI->hasByValOrInAllocaAttr()) - DeadStackObjects.insert(AI); + for (Argument &AI : BB.getParent()->args()) + if (AI.hasByValOrInAllocaAttr()) + DeadStackObjects.insert(&AI); const DataLayout &DL = BB.getModule()->getDataLayout(); @@ -729,10 +821,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { --BBI; // If we find a store, check to see if it points into a dead stack value. - if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { + if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; - GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL); + GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL); // Stores to stack values are valid candidates for removal. 
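The traversal shape used by MemoryIsNotModifiedBetween above can be sketched separately. A minimal sketch (the helper name walkBack is ours; the real code additionally scans each visited block's instructions with getModRefInfo and bails out on a writer):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
using namespace llvm;

// Walk predecessor edges backward from the second instruction's block,
// visiting each block once and never continuing past the first
// instruction's block.
static void walkBack(BasicBlock *SecondBB, BasicBlock *FirstBB) {
  SmallVector<BasicBlock *, 16> WorkList;
  SmallPtrSet<BasicBlock *, 8> Visited;
  WorkList.push_back(SecondBB);
  while (!WorkList.empty()) {
    BasicBlock *B = WorkList.pop_back_val();
    // (the real code checks B's instructions against the memory location here)
    if (B == FirstBB)
      continue; // reached the first instruction's block: stop this path
    for (auto PI = pred_begin(B), PE = pred_end(B); PI != PE; ++PI)
      if (Visited.insert(*PI).second)
        WorkList.push_back(*PI);
  }
}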
bool AllDead = true; @@ -744,7 +836,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } if (AllDead) { - Instruction *Dead = BBI++; + Instruction *Dead = &*BBI++; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Objects: "; @@ -757,7 +849,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -765,9 +857,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. - if (isInstructionTriviallyDead(BBI, TLI)) { - Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); + if (isInstructionTriviallyDead(&*BBI, TLI)) { + Instruction *Inst = &*BBI++; + DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -776,15 +868,15 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (isa<AllocaInst>(BBI)) { // Remove allocas from the list of dead stack objects; there can't be // any references before the definition. - DeadStackObjects.remove(BBI); + DeadStackObjects.remove(&*BBI); continue; } - if (auto CS = CallSite(BBI)) { + if (auto CS = CallSite(&*BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. - if (isAllocLikeFn(BBI, TLI)) - DeadStackObjects.remove(BBI); + if (isAllocLikeFn(&*BBI, TLI)) + DeadStackObjects.remove(&*BBI); // If this call does not access memory, it can't be loading any of our // pointers. @@ -795,10 +887,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - AliasAnalysis::ModRefResult A = AA->getModRefInfo( - CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); + ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); - return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + return A == MRI_ModRef || A == MRI_Ref; }); // If all of the allocas were clobbered by the call then we're not going @@ -864,8 +955,7 @@ void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc, // Remove objects that could alias LoadedLoc. DeadStackObjects.remove_if([&](Value *I) { // See if the loaded location could alias the stack location. - MemoryLocation StackLoc(I, - getPointerSize(I, DL, AA->getTargetLibraryInfo())); + MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI)); return !AA->isNoAlias(StackLoc, LoadedLoc); }); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 029b44c2ea80..7ef062e71ff3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -263,7 +264,6 @@ namespace { /// expected that a later pass of GVN will catch the interesting/hard cases. class EarlyCSE { public: - Function &F; const TargetLibraryInfo &TLI; const TargetTransformInfo &TTI; DominatorTree &DT; @@ -281,20 +281,37 @@ public: /// that dominated values can succeed in their lookup. 
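EarlyCSE's tables build on ScopedHashTable, whose scope objects pop everything inserted while they were alive. That matches the dominator-tree walk: facts learned in a subtree must not leak to sibling subtrees. A small illustration (illustrative only):

#include "llvm/ADT/ScopedHashTable.h"
using namespace llvm;

void scopeDemo() {
  ScopedHashTable<int, int> HT;
  ScopedHashTableScope<int, int> Outer(HT);
  HT.insert(1, 10);
  {
    ScopedHashTableScope<int, int> Inner(HT);
    HT.insert(2, 20);
    // Both keys visible here: HT.lookup(1) == 10, HT.lookup(2) == 20.
  }
  // Inner destroyed: key 2 is gone, key 1 is still visible.
}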
ScopedHTType AvailableValues;
- /// \brief A scoped hash table of the current values of loads.
+ /// A scoped hash table of the current values of previously encountered memory
+ locations.
 ///
- /// This allows us to get efficient access to dominating loads when we have
- /// a fully redundant load. In addition to the most recent load, we keep
- /// track of a generation count of the read, which is compared against the
- /// current generation count. The current generation count is incremented
+ /// This allows us to get efficient access to dominating loads or stores when
+ /// we have a fully redundant load. In addition to the most recent load, we
+ /// keep track of a generation count of the read, which is compared against
+ /// the current generation count. The current generation count is incremented
 /// after every possibly writing memory operation, which ensures that we only
- /// CSE loads with other loads that have no intervening store.
- typedef RecyclingAllocator<
- BumpPtrAllocator,
- ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>>
+ /// CSE loads with other loads that have no intervening store. Ordering
+ /// events (such as fences or atomic instructions) increment the generation
+ /// count as well; essentially, we model these as writes to all possible
+ /// locations. Note that atomic and/or volatile loads and stores can be
+ /// present in the table; it is the responsibility of the consumer to inspect
+ /// the atomicity/volatility if needed.
+ struct LoadValue {
+ Value *Data;
+ unsigned Generation;
+ int MatchingId;
+ bool IsAtomic;
+ LoadValue()
+ : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {}
+ LoadValue(Value *Data, unsigned Generation, unsigned MatchingId,
+ bool IsAtomic)
+ : Data(Data), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic) {}
+ };
+ typedef RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<Value *, LoadValue>>
 LoadMapAllocator;
- typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>,
- DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType;
+ typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+ LoadMapAllocator> LoadHTType;
 LoadHTType AvailableLoads;
 /// \brief A scoped hash table of the current values of read-only call
@@ -308,10 +325,9 @@ public:
 unsigned CurrentGeneration;
 /// \brief Set up the EarlyCSE runner for a particular function.
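Compressed into one predicate, the reuse rule that LoadValue encodes looks like this (a sketch; the struct and helper here are ours, mirroring the fields introduced above):

#include "llvm/IR/Value.h"

struct CachedLoad {
  llvm::Value *Data = nullptr;
  unsigned Generation = 0;
  int MatchingId = -1;
  bool IsAtomic = false;
};

// A recorded value is only reusable if no possibly-writing (or ordering)
// operation has happened since it was recorded, the access kinds match,
// and an atomic access is never replaced by a non-atomic one.
static bool reusable(const CachedLoad &LV, unsigned CurrentGeneration,
                     int MatchingId, bool NeedAtomic) {
  return LV.Data && LV.Generation == CurrentGeneration &&
         LV.MatchingId == MatchingId && LV.IsAtomic >= NeedAtomic;
}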
- EarlyCSE(Function &F, const TargetLibraryInfo &TLI, - const TargetTransformInfo &TTI, DominatorTree &DT, - AssumptionCache &AC) - : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} + EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, + DominatorTree &DT, AssumptionCache &AC) + : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} bool run(); @@ -382,57 +398,91 @@ private: class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), Vol(false), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { - MayReadFromMemory = Inst->mayReadFromMemory(); - MayWriteToMemory = Inst->mayWriteToMemory(); - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - MemIntrinsicInfo Info; - if (!TTI.getTgtMemIntrinsic(II, Info)) - return; - if (Info.NumMemRefs == 1) { - Store = Info.WriteMem; - Load = Info.ReadMem; - MatchingId = Info.MatchingId; - MayReadFromMemory = Info.ReadMem; - MayWriteToMemory = Info.WriteMem; - Vol = Info.Vol; - Ptr = Info.PtrVal; - } - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Load = true; - Vol = !LI->isSimple(); - Ptr = LI->getPointerOperand(); + : IsTargetMemInst(false), Inst(Inst) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) + if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) + IsTargetMemInst = true; + } + bool isLoad() const { + if (IsTargetMemInst) return Info.ReadMem; + return isa<LoadInst>(Inst); + } + bool isStore() const { + if (IsTargetMemInst) return Info.WriteMem; + return isa<StoreInst>(Inst); + } + bool isAtomic() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + return Inst->isAtomic(); + } + bool isUnordered() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return true; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isUnordered(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + return SI->isUnordered(); + } + // Conservative answer + return !Inst->isAtomic(); + } + + bool isVolatile() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isVolatile(); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - Store = true; - Vol = !SI->isSimple(); - Ptr = SI->getPointerOperand(); + return SI->isVolatile(); } + // Conservative answer + return true; } - bool isLoad() { return Load; } - bool isStore() { return Store; } - bool isVolatile() { return Vol; } - bool isMatchingMemLoc(const ParseMemoryInst &Inst) { - return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + + + bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { + return (getPointerOperand() == Inst.getPointerOperand() && + getMatchingId() == Inst.getMatchingId()); } - bool isValid() { return Ptr != nullptr; } - int getMatchingId() { return MatchingId; } - Value *getPtr() { return Ptr; } - bool mayReadFromMemory() { return MayReadFromMemory; } - bool mayWriteToMemory() { return MayWriteToMemory; } + bool isValid() const { return getPointerOperand() != nullptr; } - private: - bool Load; - bool Store; - bool Vol; - bool MayReadFromMemory; - bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. 
That field contains
+ non-negative values only.
- int MatchingId;
- Value *Ptr;
+ int getMatchingId() const {
+ if (IsTargetMemInst) return Info.MatchingId;
+ return -1;
+ }
+ Value *getPointerOperand() const {
+ if (IsTargetMemInst) return Info.PtrVal;
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->getPointerOperand();
+ }
+ return nullptr;
+ }
+ bool mayReadFromMemory() const {
+ if (IsTargetMemInst) return Info.ReadMem;
+ return Inst->mayReadFromMemory();
+ }
+ bool mayWriteToMemory() const {
+ if (IsTargetMemInst) return Info.WriteMem;
+ return Inst->mayWriteToMemory();
+ }
+
+ private:
+ bool IsTargetMemInst;
+ MemIntrinsicInfo Info;
+ Instruction *Inst;
 };
 bool processNode(DomTreeNode *Node);
@@ -497,7 +547,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 // See if any instructions in the block can be eliminated. If so, do it. If
 // not, add them to AvailableValues.
 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = I++;
+ Instruction *Inst = &*I++;
 // Dead instructions should just be removed.
 if (isInstructionTriviallyDead(Inst, &TLI)) {
@@ -548,24 +598,26 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 ParseMemoryInst MemInst(Inst, TTI);
 // If this is a non-volatile load, process it.
 if (MemInst.isValid() && MemInst.isLoad()) {
- // Ignore volatile loads.
- if (MemInst.isVolatile()) {
+ // (conservatively) we can't peek past the ordering implied by this
+ // operation, but we can add this load to our set of available values
+ if (MemInst.isVolatile() || !MemInst.isUnordered()) {
 LastStore = nullptr;
- // Don't CSE across synchronization boundaries.
- if (Inst->mayWriteToMemory())
- ++CurrentGeneration;
- continue;
+ ++CurrentGeneration;
 }
 // If we have an available version of this load, and if it is the right
 // generation, replace this instruction.
- std::pair<Value *, unsigned> InVal =
- AvailableLoads.lookup(MemInst.getPtr());
- if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
- Value *Op = getOrCreateResult(InVal.first, Inst->getType());
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing loads with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ // We can't replace an atomic load with one which isn't also atomic.
+ InVal.IsAtomic >= MemInst.isAtomic()) {
+ Value *Op = getOrCreateResult(InVal.Data, Inst->getType());
 if (Op != nullptr) {
 DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
- << " to: " << *InVal.first << '\n');
+ << " to: " << *InVal.Data << '\n');
 if (!Inst->use_empty())
 Inst->replaceAllUsesWith(Op);
 Inst->eraseFromParent();
@@ -576,8 +628,10 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 }
 // Otherwise, remember that we have this instruction.
- AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
- Inst, CurrentGeneration));
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic()));
 LastStore = nullptr;
 continue;
 }
@@ -613,6 +667,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 continue;
 }
+ // A release fence requires that all stores complete before it, but does
+ // not prevent the reordering of following loads 'before' the fence.
As a
+ result, we don't need to consider it as writing to memory and don't need
+ to advance the generation. We do need to prevent DSE across the fence,
+ but that's handled above.
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ if (FI->getOrdering() == Release) {
+ assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
+ continue;
+ }
+
+ // write back DSE - If we write back the same value we just loaded from
+ // the same location and haven't passed any intervening writes or ordering
+ // operations, we can remove the write. The primary benefit is in allowing
+ // the available load table to remain valid and value forward past where
+ // the store originally was.
+ if (MemInst.isValid() && MemInst.isStore()) {
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.Data &&
+ InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) &&
+ InVal.Generation == CurrentGeneration &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing stores with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered()) {
+ assert((!LastStore ||
+ ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
+ MemInst.getPointerOperand()) &&
+ "can't have an intervening store!");
+ DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ // We can avoid incrementing the generation count since we were able
+ // to eliminate this store.
+ continue;
+ }
+ }
+
 // Okay, this isn't something we can CSE at all. Check to see if it is
 // something that could modify memory. If so, our available memory values
 // cannot be used so bump the generation count.
@@ -622,8 +714,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 if (MemInst.isValid() && MemInst.isStore()) {
 // We do a trivial form of DSE if there are two stores to the same
 // location with no intervening loads. Delete the earlier store.
+ // At the moment, we don't remove ordered stores, but do remove
+ // unordered atomic stores. There's no special requirement (for
+ // unordered atomics) about removing atomic stores only in favor of
+ // other atomic stores since we're going to execute the non-atomic
+ // one anyway and the atomic one might never have become visible.
 if (LastStore) {
 ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+ assert(LastStoreMemInst.isUnordered() &&
+ !LastStoreMemInst.isVolatile() &&
+ "Violated invariant");
 if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
 DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
 << " due to: " << *Inst << '\n');
@@ -640,12 +740,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 // version of the pointer. It is safe to forward from volatile stores
 // to non-volatile loads, so we don't have to check for volatility of
 // the store.
- AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
- Inst, CurrentGeneration));
-
- // Remember that this was the last store we saw for DSE.
- if (!MemInst.isVolatile())
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic()));
+
+ // Remember that this was the last unordered store we saw for DSE. We
+ // don't yet handle DSE on ordered or volatile stores since we don't
+ // have a good way to model the ordering requirement for following
+ // passes once the store is removed.
We could insert a fence, but + // since fences are slightly stronger than stores in their ordering, + // it's not clear this is a profitable transform. Another option would + // be to merge the ordering with that of the post dominating store. + if (MemInst.isUnordered() && !MemInst.isVolatile()) LastStore = Inst; + else + LastStore = nullptr; } } } @@ -714,7 +824,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, auto &DT = AM->getResult<DominatorTreeAnalysis>(F); auto &AC = AM->getResult<AssumptionAnalysis>(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); if (!CSE.run()) return PreservedAnalyses::all(); @@ -751,7 +861,7 @@ public: auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); return CSE.run(); } @@ -761,6 +871,7 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } }; diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 0430c1898c8d..185cdbdda378 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -30,7 +30,7 @@ public: bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } private: @@ -41,7 +41,7 @@ private: char FlattenCFGPass::ID = 0; INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) @@ -59,7 +59,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(BBIt++, AA)) { + if (FlattenCFG(&*BBIt++, AA)) { LocalChange = true; } } @@ -69,7 +69,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } bool FlattenCFGPass::runOnFunction(Function &F) { - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverChanged = false; // iterativelyFlattenCFG can make some blocks dead. while (iterativelyFlattenCFG(F, AA)) { diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp index c9314229c38b..7f5d78656b50 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" @@ -41,7 +43,7 @@ using namespace llvm; // integer domain inputs, produce an integer output; fadd, for example. // // If a non-mappable instruction is seen, this entire def-use graph is marked -// as non-transformable. If we see an instruction that converts from the +// as non-transformable. 
If we see an instruction that converts from the // integer domain to FP domain (uitofp,sitofp), we terminate our walk. /// The largest integer type worth dealing with. @@ -60,6 +62,7 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); @@ -82,7 +85,9 @@ namespace { } char Float2Int::ID = 0; -INITIALIZE_PASS(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false) // Given a FCmp predicate, return a matching ICmp predicate if one // exists, otherwise return BAD_ICMP_PREDICATE. @@ -125,7 +130,9 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { + if (isa<VectorType>(I.getType())) + continue; switch (I.getOpcode()) { default: break; case Instruction::FPToUI: @@ -133,7 +140,7 @@ void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { Roots.insert(&I); break; case Instruction::FCmp: - if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != CmpInst::BAD_ICMP_PREDICATE) Roots.insert(&I); break; @@ -176,7 +183,7 @@ ConstantRange Float2Int::validateRange(ConstantRange R) { // - walkForwards: Iterate over SeenInsts in reverse order, so we visit // defs before their uses. Calculate the real range info. -// Breadth-first walk of the use-def graph; determine the set of nodes +// Breadth-first walk of the use-def graph; determine the set of nodes // we care about and eagerly determine if some of them are poisonous. void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { std::deque<Instruction*> Worklist(Roots.begin(), Roots.end()); @@ -222,14 +229,14 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { seen(I, unknownRange()); break; } - + for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { // Unify def-use chains if they interfere. ECs.unionSets(I, OI); - if (SeenInsts.find(I)->second != badRange()) + if (SeenInsts.find(I)->second != badRange()) Worklist.push_back(OI); - } else if (!isa<ConstantFP>(O)) { + } else if (!isa<ConstantFP>(O)) { // Not an instruction or ConstantFP? we can't do anything. seen(I, badRange()); } @@ -240,11 +247,11 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { // Walk forwards down the list of seen instructions, so we visit defs before // uses. void Float2Int::walkForwards() { - for (auto It = SeenInsts.rbegin(), E = SeenInsts.rend(); It != E; ++It) { - if (It->second != unknownRange()) + for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) { + if (It.second != unknownRange()) continue; - Instruction *I = It->first; + Instruction *I = It.first; std::function<ConstantRange(ArrayRef<ConstantRange>)> Op; switch (I->getOpcode()) { // FIXME: Handle select and phi nodes. 
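The two walks above boil down to interval arithmetic: compute a conservative [lo, hi] range for every node between the FP-to-int roots and the int-to-FP leaves, then map the chain to integers only if the final range fits a native integer type. A standalone sketch of that bookkeeping, with a toy Range type standing in for LLVM's ConstantRange (names are illustrative, not the pass's API):

#include <cstdint>
#include <iostream>

// Inclusive signed range [Lo, Hi].
struct Range { int64_t Lo, Hi; };

// Range of an fadd whose operands have the given ranges. Assumes the math
// does not overflow int64_t itself, which the real pass guards separately.
Range rangeAdd(Range A, Range B) { return {A.Lo + B.Lo, A.Hi + B.Hi}; }

// Could the result be carried in a Bits-wide signed integer?
bool fitsIn(Range R, unsigned Bits) {
  int64_t Min = -(int64_t(1) << (Bits - 1));
  int64_t Max = (int64_t(1) << (Bits - 1)) - 1;
  return R.Lo >= Min && R.Hi <= Max;
}

int main() {
  // Models: %a = sitofp i8 %x to double ; %b = sitofp i8 %y to double
  //         %c = fadd double %a, %b     ; %r = fptosi double %c to i32
  Range A{-128, 127}, B{-128, 127};
  Range C = rangeAdd(A, B); // [-256, 254]
  std::cout << "needs more than 8 bits: " << !fitsIn(C, 8) << '\n'; // 1
  std::cout << "fits in 16 bits:        " << fitsIn(C, 16) << '\n'; // 1
  return 0;
}

The width test plays the role that the MinBW computation in validateAndTransform plays in the real pass; note also that the new isa<VectorType> check in findRoots simply keeps vector values out of this analysis.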
@@ -299,7 +306,7 @@ void Float2Int::walkForwards() { for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { assert(SeenInsts.find(OI) != SeenInsts.end() && - "def not seen before use!"); + "def not seen before use!"); OpRanges.push_back(SeenInsts.find(OI)->second); } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) { // Work out if the floating point number can be losslessly represented @@ -314,11 +321,11 @@ void Float2Int::walkForwards() { APFloat F = CF->getValueAPF(); // First, weed out obviously incorrect values. Non-finite numbers - // can't be represented and neither can negative zero, unless + // can't be represented and neither can negative zero, unless // we're in fast math mode. if (!F.isFinite() || (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && - !I->hasNoSignedZeros())) { + !I->hasNoSignedZeros())) { seen(I, badRange()); Abort = true; break; @@ -345,7 +352,7 @@ void Float2Int::walkForwards() { // Reduce the operands' ranges to a single range and return. if (!Abort) - seen(I, Op(OpRanges)); + seen(I, Op(OpRanges)); } } @@ -395,7 +402,7 @@ bool Float2Int::validateAndTransform() { R.isFullSet() || R.isSignWrappedSet()) continue; assert(ConvertedToTy && "Must have set the convertedtoty by this point!"); - + // The number of bits required is the maximum of the upper and // lower limits, plus one so it can be signed. unsigned MinBW = std::max(R.getLower().getMinSignedBits(), @@ -505,9 +512,8 @@ Value *Float2Int::convert(Instruction *I, Type *ToTy) { // Perform dead code elimination on the instructions we just modified. void Float2Int::cleanup() { - for (auto I = ConvertedInsts.rbegin(), E = ConvertedInsts.rend(); - I != E; ++I) - I->first->eraseFromParent(); + for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend())) + I.first->eraseFromParent(); } bool Float2Int::runOnFunction(Function &F) { @@ -534,7 +540,4 @@ bool Float2Int::runOnFunction(Function &F) { return Modified; } -FunctionPass *llvm::createFloat2IntPass() { - return new Float2Int(); -} - +FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 89a0d0af93be..a028b8c444ba 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -128,6 +129,7 @@ namespace { uint32_t lookup(Value *V) const; uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); + bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); void erase(Value *v); @@ -388,6 +390,9 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { } } +/// Returns true if a value number exists for the specified value. +bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } + /// lookup_or_add - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t ValueTable::lookup_or_add(Value *V) { @@ -608,6 +613,10 @@ namespace { DenseMap<uint32_t, LeaderTableEntry> LeaderTable; BumpPtrAllocator TableAllocator; + // Block-local map of equivalent values to their leader, does not + // propagate to any successors. 
Entries added mid-block are applied + // to the remaining instructions in the block. + SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap; SmallVector<Instruction*, 8> InstrsToErase; typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; @@ -689,16 +698,17 @@ namespace { AU.addRequired<TargetLibraryInfoWrapperPass>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - // Helper fuctions of redundant load elimination + // Helper functions of redundant load elimination bool processLoad(LoadInst *L); bool processNonLocalLoad(LoadInst *L); + bool processAssumeIntrinsic(IntrinsicInst *II); void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); @@ -719,7 +729,9 @@ namespace { void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); - bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool replaceOperandsWithConsts(Instruction *I) const; + bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge); bool processFoldableCondBr(BranchInst *BI); void addDeadBlock(BasicBlock *BB); void assignValNumForDeadCode(); @@ -738,7 +750,8 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1290,8 +1303,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { - const AvailableValueInBlock &AV = ValuesPerBlock[i]; + for (const AvailableValueInBlock &AV : ValuesPerBlock) { BasicBlock *BB = AV.BB; if (SSAUpdate.HasValueForBlock(BB)) @@ -1301,24 +1313,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, } // Perform PHI construction. - Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - - // If new PHI nodes were created, notify alias analysis. - if (V->getType()->getScalarType()->isPointerTy()) { - AliasAnalysis *AA = gvn.getAliasAnalysis(); - - // Scan the new PHIs and inform alias analysis that we've added potentially - // escaping uses to any values that are operands to these PHIs. - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { - PHINode *P = NewPHIs[i]; - for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - AA->addEscapingUse(P->getOperandUse(jj)); - } - } - } - - return V; + return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); } Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, @@ -1518,9 +1513,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // that we only have to insert *one* load (which means we're basically moving // the load, not inserting a new one). 
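The PRE decision described in the comment above rests on a full-availability computation over predecessors, set up by the FullyAvailableBlocks map in the next hunk. A standalone toy version (the real IsValueFullyAvailableInBlock also has to undo wrong optimistic guesses on cycles; that repair step is omitted here, so treat this purely as a sketch of the idea):

#include <map>
#include <string>
#include <vector>

// Toy CFG: block -> its predecessors. A value is fully available in a block
// if it is available on every path into it, i.e. in all predecessors.
// Cycles are handled by speculating "available" first.
using Preds = std::map<std::string, std::vector<std::string>>;

bool fullyAvailable(const Preds &P, std::map<std::string, bool> &Avail,
                    const std::string &BB) {
  auto It = Avail.find(BB);
  if (It != Avail.end())
    return It->second;
  Avail[BB] = true; // optimistic speculation for back edges
  for (const std::string &Pred : P.at(BB))
    if (!fullyAvailable(P, Avail, Pred))
      return Avail[BB] = false;
  return true;
}

int main() {
  // entry -> {left, right} -> merge; the load is known only in "left".
  Preds P = {{"entry", {}},
             {"left", {"entry"}},
             {"right", {"entry"}},
             {"merge", {"left", "right"}}};
  std::map<std::string, bool> Avail = {{"left", true}, {"right", false}};
  // Not fully available in "merge", but available along one path: PRE can
  // insert a single load in "right" and then build a phi in "merge".
  return fullyAvailable(P, Avail, "merge") ? 1 : 0;
}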
- SmallPtrSet<BasicBlock *, 4> Blockers; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - Blockers.insert(UnavailableBlocks[i]); + SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(), + UnavailableBlocks.end()); // Let's find the first basic block with more than one predecessor. Walk // backwards through predecessors if needed. @@ -1550,15 +1544,22 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // available. MapVector<BasicBlock *, Value *> PredLoads; DenseMap<BasicBlock*, char> FullyAvailableBlocks; - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) - FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - FullyAvailableBlocks[UnavailableBlocks[i]] = false; + for (const AvailableValueInBlock &AV : ValuesPerBlock) + FullyAvailableBlocks[AV.BB] = true; + for (BasicBlock *UnavailableBB : UnavailableBlocks) + FullyAvailableBlocks[UnavailableBB] = false; SmallVector<BasicBlock *, 4> CriticalEdgePred; - for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); - PI != E; ++PI) { - BasicBlock *Pred = *PI; + for (BasicBlock *Pred : predecessors(LoadBB)) { + // If any predecessor block is an EH pad that does not allow non-PHI + // instructions before the terminator, we can't PRE the load. + if (Pred->getTerminator()->isEHPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } @@ -1570,9 +1571,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; } - if (LoadBB->isLandingPad()) { + if (LoadBB->isEHPad()) { DEBUG(dbgs() - << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" << Pred->getName() << "': " << *LI << '\n'); return false; } @@ -1655,12 +1656,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, << *NewInsts.back() << '\n'); // Assign value numbers to the new instructions. - for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { + for (Instruction *I : NewInsts) { // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. - VN.lookup_or_add(NewInsts[i]); + VN.lookup_or_add(I); } for (const auto &PredLoad : PredLoads) { @@ -1677,6 +1678,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (Tags) NewLoad->setAAMetadata(Tags); + if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load)) + NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD); + if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group)) + NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD); + // Transfer DebugLoc. NewLoad->setDebugLoc(LI->getDebugLoc()); @@ -1704,6 +1710,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, /// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { + // non-local speculations are not allowed under asan. + if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress)) + return false; + // Step 1: Find the non-local dependencies of the load. 
LoadDepVect Deps; MD->getNonLocalPointerDependency(LI, Deps); @@ -1777,6 +1787,63 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { + assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && + "This function can only be called with llvm.assume intrinsic"); + Value *V = IntrinsicI->getArgOperand(0); + + if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { + if (Cond->isZero()) { + Type *Int8Ty = Type::getInt8Ty(V->getContext()); + // Insert a store to null before the assume to indicate that this code + // is not reachable. FIXME: We could insert an unreachable + // instruction directly because we can modify the CFG. + new StoreInst(UndefValue::get(Int8Ty), + Constant::getNullValue(Int8Ty->getPointerTo()), + IntrinsicI); + } + markInstructionForDeletion(IntrinsicI); + return false; + } + + Constant *True = ConstantInt::getTrue(V->getContext()); + bool Changed = false; + + for (BasicBlock *Successor : successors(IntrinsicI->getParent())) { + BasicBlockEdge Edge(IntrinsicI->getParent(), Successor); + + // This property holds only in dominated successors; propagateEquality + // will check dominance for us. + Changed |= propagateEquality(V, True, Edge, false); + } + + // We can replace the assume's condition with true, which covers cases like + // this: + // call void @llvm.assume(i1 %cmp) + // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true + ReplaceWithConstMap[V] = True; + + // If one operand of an equality compare is a constant, adding it to the map + // will cover cases like this: + // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen + // call void @llvm.assume(i1 %cmp) + // ret float %0 ; will change it to ret float 3.000000e+00 + if (auto *CmpI = dyn_cast<CmpInst>(V)) { + if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || + CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + CmpI->getFastMathFlags().noNaNs())) { + Value *CmpLHS = CmpI->getOperand(0); + Value *CmpRHS = CmpI->getOperand(1); + if (isa<Constant>(CmpLHS)) + std::swap(CmpLHS, CmpRHS); + auto *RHSConst = dyn_cast<Constant>(CmpRHS); + + // If only one operand is constant. + if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) + ReplaceWithConstMap[CmpLHS] = RHSConst; + } + } + return Changed; +} static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value @@ -1789,7 +1856,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { // FIXME: If both the original and replacement value are part of the // same control-flow region (meaning that the execution of one - // guarentees the executation of the other), then we can combine the + // guarantees the execution of the other), then we can combine the // noalias scopes here and do better than the general conservative // answer used in combineMetadata(). @@ -1797,13 +1864,10 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { // regions, and so we need a conservative combination of the noalias // scopes.
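Stepping back to the assume handling added above: the block-local flow it feeds is easy to model standalone. A toy sketch of the ReplaceWithConstMap idea using string-based pseudo-IR (purely illustrative; the oeq predicate, or ueq under nnan, is what makes pinning the value to the constant sound, since NaN is excluded):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// A pseudo-instruction: an opcode and operand names.
struct Inst {
  std::string Op;
  std::vector<std::string> Args;
};

int main() {
  std::map<std::string, std::string> ReplaceWithConst;
  std::vector<Inst> Block = {
      {"assume_eq", {"%x", "3.0"}}, // %cmp = fcmp oeq float %x, 3.0
                                    // call void @llvm.assume(i1 %cmp)
      {"ret", {"%x"}},
  };

  for (Inst &I : Block) {
    // Stand-in for replaceOperandsWithConsts: rewrite known operands first.
    for (std::string &A : I.Args) {
      auto It = ReplaceWithConst.find(A);
      if (It != ReplaceWithConst.end())
        A = It->second;
    }
    // Stand-in for processAssumeIntrinsic: record the equality for the
    // remainder of this block only (the map is cleared per block).
    if (I.Op == "assume_eq")
      ReplaceWithConst[I.Args[0]] = I.Args[1];
  }

  std::cout << Block[1].Op << ' ' << Block[1].Args[0] << '\n'; // ret 3.0
  return 0;
}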
static const unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_invariant_group}; combineMetadata(ReplInst, I, KnownIDs); } } @@ -1890,10 +1954,8 @@ bool GVN::processLoad(LoadInst *L) { ++NumGVNLoad; return true; } - } - // If the value isn't available, don't do anything! - if (Dep.isClobber()) { + // If the value isn't available, don't do anything! DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -2049,11 +2111,31 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } +// Tries to replace an instruction's operands with constants, using +// information from ReplaceWithConstMap. +bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { + bool Changed = false; + for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceWithConstMap.find(Operand); + if (it != ReplaceWithConstMap.end()) { + assert(!isa<Constant>(Operand) && + "Replacing constants with constants is invalid"); + DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second + << " in instruction " << *Instr << '\n'); + Instr->setOperand(OpNum, it->second); + Changed = true; + } + } + return Changed; +} + /// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. -bool GVN::propagateEquality(Value *LHS, Value *RHS, - const BasicBlockEdge &Root) { +/// If DominatesByEdge is false, uses dominated by Root.getEnd() are replaced +/// instead of uses dominated by the edge itself. +bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; @@ -2065,11 +2147,13 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, std::pair<Value*, Value*> Item = Worklist.pop_back_val(); LHS = Item.first; RHS = Item.second; - if (LHS == RHS) continue; + if (LHS == RHS) + continue; assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); // Don't try to propagate equalities between constants. - if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + continue; // Prefer a constant on the right-hand side, or an Argument if no constants. if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) @@ -2108,7 +2192,11 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // LHS always has at least one use that is not dominated by Root, this will // never do anything if LHS has only one use. if (!LHS->hasOneUse()) { - unsigned NumReplacements = replaceDominatedUsesWith(LHS, RHS, *DT, Root); + unsigned NumReplacements = + DominatesByEdge + ? 
replaceDominatedUsesWith(LHS, RHS, *DT, Root) + : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd()); + Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2180,7 +2268,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, Value *NotCmp = findLeader(Root.getEnd(), Num); if (NotCmp && isa<Instruction>(NotCmp)) { unsigned NumReplacements = - replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root); + DominatesByEdge + ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) + : replaceDominatedUsesWith(NotCmp, NotVal, *DT, + Root.getEnd()); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2220,6 +2311,10 @@ bool GVN::processInstruction(Instruction *I) { return true; } + if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I)) + if (IntrinsicI->getIntrinsicID() == Intrinsic::assume) + return processAssumeIntrinsic(IntrinsicI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (processLoad(LI)) return true; @@ -2250,11 +2345,11 @@ bool GVN::processInstruction(Instruction *I) { Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext()); BasicBlockEdge TrueE(Parent, TrueSucc); - Changed |= propagateEquality(BranchCond, TrueVal, TrueE); + Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true); Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext()); BasicBlockEdge FalseE(Parent, FalseSucc); - Changed |= propagateEquality(BranchCond, FalseVal, FalseE); + Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true); return Changed; } @@ -2276,7 +2371,7 @@ bool GVN::processInstruction(Instruction *I) { // If there is only a single edge, propagate the case value into it. if (SwitchEdges.lookup(Dst) == 1) { BasicBlockEdge E(Parent, Dst); - Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); + Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true); } } return Changed; @@ -2284,7 +2379,8 @@ bool GVN::processInstruction(Instruction *I) { // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. - if (I->getType()->isVoidTy()) return false; + if (I->getType()->isVoidTy()) + return false; uint32_t NextNum = VN.getNextUnusedValueNumber(); unsigned Num = VN.lookup_or_add(I); @@ -2306,17 +2402,21 @@ bool GVN::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. - Value *repl = findLeader(I->getParent(), Num); - if (!repl) { + Value *Repl = findLeader(I->getParent(), Num); + if (!Repl) { // Failure, just remember this instance for future use. addToLeaderTable(Num, I, I->getParent()); return false; + } else if (Repl == I) { + // If I was the result of a shortcut PRE, it might already be in the table + // and the best replacement for itself. Nothing to do. + return false; } // Remove it! 
- patchAndReplaceAllUsesWith(I, repl); - if (MD && repl->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(repl); + patchAndReplaceAllUsesWith(I, Repl); + if (MD && Repl->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Repl); markInstructionForDeletion(I); return true; } @@ -2331,7 +2431,7 @@ bool GVN::runOnFunction(Function& F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); + VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2341,10 +2441,10 @@ bool GVN::runOnFunction(Function& F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; - bool removedBlock = MergeBlockIntoPredecessor( - BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); + bool removedBlock = + MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD); if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; @@ -2382,7 +2482,6 @@ bool GVN::runOnFunction(Function& F) { return Changed; } - bool GVN::processBlock(BasicBlock *BB) { // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function // (and incrementing BI before processing an instruction). @@ -2391,11 +2490,16 @@ bool GVN::processBlock(BasicBlock *BB) { if (DeadBlocks.count(BB)) return false; + // Clear the map before every BB because its entries are valid for a single + // BB only. + ReplaceWithConstMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - ChangedFunction |= processInstruction(BI); + if (!ReplaceWithConstMap.empty()) + ChangedFunction |= replaceOperandsWithConsts(&*BI); + ChangedFunction |= processInstruction(&*BI); + if (InstrsToErase.empty()) { ++BI; continue; } @@ -2439,7 +2543,14 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Value *Op = Instr->getOperand(i); if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) continue; - + // This could be a newly inserted instruction, in which case we won't + // find a value number, and should give up before we hurt ourselves. + // FIXME: Rewrite the infrastructure to make it easier to value number + // and process newly inserted instructions. + if (!VN.exists(Op)) { + success = false; + break; + } if (Value *V = findLeader(Pred, VN.lookup(Op))) { Instr->setOperand(i, V); } else { @@ -2499,9 +2610,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { BasicBlock *CurrentBlock = CurInst->getParent(); predMap.clear(); - for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); - PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(CurrentBlock)) { // We're not interested in PRE where the block is its // own predecessor, or in blocks with predecessors // that are not reachable. @@ -2570,7 +2679,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // Create a PHI to make the value available in this block.
PHINode *Phi = PHINode::Create(CurInst->getType(), predMap.size(), - CurInst->getName() + ".pre-phi", CurrentBlock->begin()); + CurInst->getName() + ".pre-phi", &CurrentBlock->front()); for (unsigned i = 0, e = predMap.size(); i != e; ++i) { if (Value *V = predMap[i].first) Phi->addIncoming(V, predMap[i].second); @@ -2582,18 +2691,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->getScalarType()->isPointerTy()) { - // Because we have added a PHI-use of the pointer value, it has now - // "escaped" from alias analysis' perspective. We need to inform - // AA of this. - for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); - } - - if (MD) - MD->invalidateCachedPointerInfo(Phi); - } + if (MD && Phi->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Phi); VN.erase(CurInst); removeFromLeaderTable(ValNo, CurInst, CurrentBlock); @@ -2616,15 +2715,15 @@ bool GVN::performPRE(Function &F) { if (CurrentBlock == &F.getEntryBlock()) continue; - // Don't perform PRE on a landing pad. - if (CurrentBlock->isLandingPad()) + // Don't perform PRE on an EH pad. + if (CurrentBlock->isEHPad()) continue; for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE;) { - Instruction *CurInst = BI++; - Changed = performScalarPRE(CurInst); + Instruction *CurInst = &*BI++; + Changed |= performScalarPRE(CurInst); } } @@ -2637,8 +2736,8 @@ bool GVN::performPRE(Function &F) { /// Split the critical edge connecting the given two blocks, and return /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { - BasicBlock *BB = SplitCriticalEdge( - Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + BasicBlock *BB = + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); if (MD) MD->invalidateCachedPredecessors(); return BB; @@ -2652,7 +2751,7 @@ bool GVN::splitCriticalEdges() { do { std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + CriticalEdgeSplittingOptions(DT)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); return true; @@ -2728,17 +2827,14 @@ void GVN::addDeadBlock(BasicBlock *BB) { DeadBlocks.insert(Dom.begin(), Dom.end()); // Figure out the dominance-frontier(D). 
- for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(), - E = Dom.end(); I != E; I++) { - BasicBlock *B = *I; - for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) { - BasicBlock *S = *SI; + for (BasicBlock *B : Dom) { + for (BasicBlock *S : successors(B)) { if (DeadBlocks.count(S)) continue; bool AllPredDead = true; - for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++) - if (!DeadBlocks.count(*PI)) { + for (BasicBlock *P : predecessors(S)) + if (!DeadBlocks.count(P)) { AllPredDead = false; break; } @@ -2766,10 +2862,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { continue; SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); - for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(), - PE = Preds.end(); PI != PE; PI++) { - BasicBlock *P = *PI; - + for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; @@ -2794,7 +2887,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { // R be the target of the dead outgoing edge. // 1) Identify the set of dead blocks implied by the branch's dead outgoing // edge. The result of this step will be {X| X is dominated by R} -// 2) Identify those blocks which haves at least one dead prodecessor. The +// 2) Identify those blocks which have at least one dead predecessor. The // result of this step will be dominance-frontier(R). // 3) Update the PHIs in DF(R) by replacing the operands corresponding to // dead blocks with "UndefVal" in the hope that these PHIs will be optimized away. @@ -2829,14 +2922,10 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { // instructions, it makes more sense just to "fabricate" a val-number for the // dead code than checking if the instruction involved is dead or not. void GVN::assignValNumForDeadCode() { - for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), - E = DeadBlocks.end(); I != E; I++) { - BasicBlock *BB = *I; - for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); - II != EE; II++) { - Instruction *Inst = &*II; - unsigned ValNum = VN.lookup_or_add(Inst); - addToLeaderTable(ValNum, Inst, BB); + for (BasicBlock *BB : DeadBlocks) { + for (Instruction &Inst : *BB) { + unsigned ValNum = VN.lookup_or_add(&Inst); + addToLeaderTable(ValNum, &Inst, BB); } } } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 2a954d9961f2..ec5e15f0b8f8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -28,9 +28,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" @@ -48,6 +50,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; @@ -83,64 +86,62 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue( namespace { struct RewritePhi; -} -namespace { - class IndVarSimplify : public LoopPass { - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - - SmallVector<WeakVH, 16>
DeadInsts; - bool Changed; - public: - - static char ID; // Pass identification, replacement for typeid - IndVarSimplify() - : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { - initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); - } +class IndVarSimplify : public LoopPass { + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addPreservedID(LoopSimplifyID); - AU.addPreservedID(LCSSAID); - AU.setPreservesCFG(); - } + SmallVector<WeakVH, 16> DeadInsts; + bool Changed; +public: - private: - void releaseMemory() override { - DeadInsts.clear(); - } + static char ID; // Pass identification, replacement for typeid + IndVarSimplify() + : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { + initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool isValidRewrite(Value *FromVal, Value *ToVal); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.setPreservesCFG(); + } - void HandleFloatingPointIV(Loop *L, PHINode *PH); - void RewriteNonIntegerIVs(Loop *L); +private: + void releaseMemory() override { + DeadInsts.clear(); + } - void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); + bool isValidRewrite(Value *FromVal, Value *ToVal); - bool CanLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); + void handleFloatingPointIV(Loop *L, PHINode *PH); + void rewriteNonIntegerIVs(Loop *L); - Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - PHINode *IndVar, SCEVExpander &Rewriter); + void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - void SinkUnusedInvariants(Loop *L); + bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); + void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, - Instruction *InsertPt, Type *Ty, - bool &IsHighCostExpansion); - }; + Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, + PHINode *IndVar, SCEVExpander &Rewriter); + + void sinkUnusedInvariants(Loop *L); + + Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, + Instruction *InsertPt, Type *Ty); +}; } char IndVarSimplify::ID = 0; @@ -148,7 +149,7 @@ INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) 
INITIALIZE_PASS_END(IndVarSimplify, "indvars", @@ -158,10 +159,10 @@ Pass *llvm::createIndVarSimplifyPass() { return new IndVarSimplify(); } -/// isValidRewrite - Return true if the SCEV expansion generated by the -/// rewriter can replace the original value. SCEV guarantees that it -/// produces the same value, but the way it is produced may be illegal IR. -/// Ideally, this function will only be called for verification. +/// Return true if the SCEV expansion generated by the rewriter can replace the +/// original value. SCEV guarantees that it produces the same value, but the way +/// it is produced may be illegal IR. Ideally, this function will only be +/// called for verification. bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // If an SCEV expression subsumed multiple pointers, its expansion could // reassociate the GEP changing the base pointer. This is illegal because the @@ -175,10 +176,10 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // because it understands lcssa phis while SCEV does not. Value *FromPtr = FromVal; Value *ToPtr = ToVal; - if (GEPOperator *GEP = dyn_cast<GEPOperator>(FromVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) { FromPtr = GEP->getPointerOperand(); } - if (GEPOperator *GEP = dyn_cast<GEPOperator>(ToVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) { ToPtr = GEP->getPointerOperand(); } if (FromPtr != FromVal || ToPtr != ToVal) { @@ -215,7 +216,7 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { /// loop. For PHI nodes, there may be multiple uses, so compute the nearest /// common dominator for the incoming blocks. static Instruction *getInsertPointForUses(Instruction *User, Value *Def, - DominatorTree *DT) { + DominatorTree *DT, LoopInfo *LI) { PHINode *PHI = dyn_cast<PHINode>(User); if (!PHI) return User; @@ -234,17 +235,28 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert((!isa<Instruction>(Def) || - DT->dominates(cast<Instruction>(Def), InsertPt)) && - "def does not dominate all uses"); - return InsertPt; + + auto *DefI = dyn_cast<Instruction>(Def); + if (!DefI) + return InsertPt; + + assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses"); + + auto *L = LI->getLoopFor(DefI->getParent()); + assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent()))); + + for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom()) + if (LI->getLoopFor(DTN->getBlock()) == L) + return DTN->getBlock()->getTerminator(); + + llvm_unreachable("DefI dominates InsertPt!"); } //===----------------------------------------------------------------------===// -// RewriteNonIntegerIVs and helpers. Prefer integer IVs. +// rewriteNonIntegerIVs and helpers. Prefer integer IVs. //===----------------------------------------------------------------------===// -/// ConvertToSInt - Convert APF to an integer, if possible. +/// Convert APF to an integer, if possible. static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; // See if we can convert this to an int64_t @@ -256,8 +268,8 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { return true; } -/// HandleFloatingPointIV - If the loop has floating induction variable -/// then insert corresponding integer induction variable if possible. +/// If the loop has a floating-point induction variable, insert a corresponding +/// integer induction variable if possible.
/// For example, /// for(double i = 0; i < 10000; ++i) /// bar(i) @@ -265,13 +277,12 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { /// for(int i = 0; i < 10000; ++i) /// bar((double)i); /// -void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { +void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; // Check incoming value. - ConstantFP *InitValueVal = - dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); + auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) @@ -279,8 +290,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Check IV increment. Reject this PN if increment operation is not // an add or increment value can not be represented by an integer. - BinaryOperator *Incr = - dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); + auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp @@ -456,14 +466,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstInsertionPt()); + &*PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); } Changed = true; } -void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { +void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { // First step. Check to see if there are any floating-point recurrences. // If there are, change them into integer recurrences, permitting analysis by // the SCEV routines. @@ -477,7 +487,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { for (unsigned i = 0, e = PHIs.size(); i != e; ++i) if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) - HandleFloatingPointIV(L, PN); + handleFloatingPointIV(L, PN); // If the loop previously had floating-point IV, ScalarEvolution // may not have been able to compute a trip count. Now that we've done some @@ -488,7 +498,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { namespace { // Collect information about PHI nodes which can be transformed in -// RewriteLoopExitValues. +// rewriteLoopExitValues. struct RewritePhi { PHINode *PN; unsigned Ith; // Ith incoming value. @@ -501,70 +511,37 @@ struct RewritePhi { }; } -Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, +Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, Instruction *InsertPt, - Type *ResultTy, - bool &IsHighCostExpansion) { - using namespace llvm::PatternMatch; - - if (!Rewriter.isHighCostExpansion(S, L)) { - IsHighCostExpansion = false; - return Rewriter.expandCodeFor(S, ResultTy, InsertPt); - } - + Type *ResultTy) { // Before expanding S into an expensive LLVM expression, see if we can use an - // already existing value as the expansion for S. There is potential to make - // this significantly smarter, but this simple heuristic already gets some - // interesting cases. 
- - SmallVector<BasicBlock *, 4> Latches; - L->getLoopLatches(Latches); - - for (BasicBlock *BB : Latches) { - ICmpInst::Predicate Pred; - Instruction *LHS, *RHS; - BasicBlock *TrueBB, *FalseBB; - - if (!match(BB->getTerminator(), - m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), - TrueBB, FalseBB))) - continue; - - if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) { - IsHighCostExpansion = false; - return LHS; - } - - if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) { - IsHighCostExpansion = false; - return RHS; - } - } + // already existing value as the expansion for S. + if (Value *ExistingValue = Rewriter.findExistingExpansion(S, InsertPt, L)) + if (ExistingValue->getType() == ResultTy) + return ExistingValue; // We didn't find anything, fall back to using SCEVExpander. - assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!"); - IsHighCostExpansion = true; return Rewriter.expandCodeFor(S, ResultTy, InsertPt); } //===----------------------------------------------------------------------===// -// RewriteLoopExitValues - Optimize IV users outside the loop. +// rewriteLoopExitValues - Optimize IV users outside the loop. // As a side effect, reduces the amount of IV processing within the loop. //===----------------------------------------------------------------------===// -/// RewriteLoopExitValues - Check to see if this loop has a computable -/// loop-invariant execution count. If so, this means that we can compute the -/// final value of any expressions that are recurrent in the loop, and -/// substitute the exit values from the loop into any instructions outside of -/// the loop that use the final values of the current expressions. +/// Check to see if this loop has a computable loop-invariant execution count. +/// If so, this means that we can compute the final value of any expressions +/// that are recurrent in the loop, and substitute the exit values from the loop +/// into any instructions outside of the loop that use the final values of the +/// current expressions. /// /// This is mostly redundant with the regular IndVarSimplify activities that /// happen later, except that it's more powerful in some cases, because it's /// able to brute-force evaluate arbitrary instructions as long as they have /// constant operands at the beginning of the loop. -void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { - // Verify the input to the pass in already in LCSSA form. - assert(L->isLCSSAForm(*DT)); +void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { + // Check a pre-condition. + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -679,9 +656,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { continue; } - bool HighCost = false; - Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, - PN->getType(), HighCost); + bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst); + Value *ExitVal = + expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType()); DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' << " LoopVal = " << *Inst << "\n"); @@ -698,7 +675,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } } - bool LoopCanBeDel = CanLoopBeDeleted(L, RewritePhiSet); + bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet); // Transformation. 
for (const RewritePhi &Phi : RewritePhiSet) { @@ -735,10 +712,10 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { Rewriter.clearInsertPoint(); } -/// CanLoopBeDeleted - Check whether it is possible to delete the loop after -/// rewriting exit value. If it is possible, ignore ReplaceExitValue and -/// do rewriting aggressively. -bool IndVarSimplify::CanLoopBeDeleted( +/// Check whether it is possible to delete the loop after rewriting exit +/// value. If it is possible, ignore ReplaceExitValue and do rewriting +/// aggressively. +bool IndVarSimplify::canLoopBeDeleted( Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { BasicBlock *Preheader = L->getLoopPreheader(); @@ -782,14 +759,9 @@ bool IndVarSimplify::CanLoopBeDeleted( ++BI; } - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); BI != BE; - ++BI) { - if (BI->mayHaveSideEffects()) - return false; - } - } + for (auto *BB : L->blocks()) + if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) + return false; return true; } @@ -799,22 +771,19 @@ bool IndVarSimplify::CanLoopBeDeleted( //===----------------------------------------------------------------------===// namespace { - // Collect information about induction variables that are used by sign/zero - // extend operations. This information is recorded by CollectExtend and - // provides the input to WidenIV. - struct WideIVInfo { - PHINode *NarrowIV; - Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was a sext user seen before a zext? - - WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), - IsSigned(false) {} - }; +// Collect information about induction variables that are used by sign/zero +// extend operations. This information is recorded by CollectExtend and provides +// the input to WidenIV. +struct WideIVInfo { + PHINode *NarrowIV = nullptr; + Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext + bool IsSigned = false; // Was a sext user seen before a zext? +}; } -/// visitCast - Update information about the induction variable that is -/// extended by this sign or zero extend operation. This is used to determine -/// the final width of the IV before actually widening it. +/// Update information about the induction variable that is extended by this +/// sign or zero extend operation. This is used to determine the final width of +/// the IV before actually widening it. static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, const TargetTransformInfo *TTI) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; @@ -855,24 +824,29 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, namespace { -/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the -/// WideIV that computes the same value as the Narrow IV def. This avoids -/// caching Use* pointers. +/// Record a link in the Narrow IV def-use chain along with the WideIV that +/// computes the same value as the Narrow IV def. This avoids caching Use* +/// pointers. 
struct NarrowIVDefUse { - Instruction *NarrowDef; - Instruction *NarrowUse; - Instruction *WideDef; - - NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} - - NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): - NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} + Instruction *NarrowDef = nullptr; + Instruction *NarrowUse = nullptr; + Instruction *WideDef = nullptr; + + // True if the narrow def is never negative. Tracking this information lets + // us use a sign extension instead of a zero extension or vice versa, when + // profitable and legal. + bool NeverNegative = false; + + NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD, + bool NeverNegative) + : NarrowDef(ND), NarrowUse(NU), WideDef(WD), + NeverNegative(NeverNegative) {} }; -/// WidenIV - The goal of this transform is to remove sign and zero extends -/// without creating any new induction variables. To do this, it creates a new -/// phi of the wider type and redirects all users, either removing extends or -/// inserting truncs whenever we stop propagating the type. +/// The goal of this transform is to remove sign and zero extends without +/// creating any new induction variables. To do this, it creates a new phi of +/// the wider type and redirects all users, either removing extends or inserting +/// truncs whenever we stop propagating the type. /// class WidenIV { // Parameters @@ -913,32 +887,35 @@ public: assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); } - PHINode *CreateWideIV(SCEVExpander &Rewriter); + PHINode *createWideIV(SCEVExpander &Rewriter); protected: - Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use); + Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); - Instruction *CloneIVUser(NarrowIVDefUse DU); + Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR); + Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR); + Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU); - const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse); - const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU); - const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, + const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const; - Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); + Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); - bool WidenLoopCompare(NarrowIVDefUse DU); + bool widenLoopCompare(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -/// isLoopInvariant - Perform a quick domtree based check for loop invariance -/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems -/// gratuitous for this purpose. +/// Perform a quick domtree based check for loop invariance assuming that V is +/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this +/// purpose. 
static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { Instruction *Inst = dyn_cast<Instruction>(V); if (!Inst) @@ -947,8 +924,8 @@ static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { return DT->properlyDominates(Inst->getParent(), L->getHeader()); } -Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use) { +Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType, + bool IsSigned, Instruction *Use) { // Set the debug location and conservative insertion point. IRBuilder<> Builder(Use); // Hoist the insertion point into loop preheaders as far as possible. @@ -961,10 +938,11 @@ Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, Builder.CreateZExt(NarrowOper, WideType); } -/// CloneIVUser - Instantiate a wide operation to replace a narrow -/// operation. This only needs to handle operations that can evaluation to -/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. -Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { +/// Instantiate a wide operation to replace a narrow operation. This only needs +/// to handle operations that can evaluate to SCEVAddRec. It can safely return +/// 0 for any operation we decide not to clone. +Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: @@ -973,40 +951,140 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { case Instruction::Mul: case Instruction::UDiv: case Instruction::Sub: + return cloneArithmeticIVUser(DU, WideAR); + case Instruction::And: case Instruction::Or: case Instruction::Xor: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); - - // Replace NarrowDef operands with WideDef. Otherwise, we don't know - // anything about the narrow operand yet so must insert a [sz]ext. It is - // probably loop invariant and will be folded or hoisted. If it actually - // comes from a widened IV, it should be removed during a future call to - // WidenIVUse. - Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); - Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); - - BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); - BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), - LHS, RHS, - NarrowBO->getName()); - IRBuilder<> Builder(DU.NarrowUse); - Builder.Insert(WideBO); - if (const OverflowingBinaryOperator *OBO = - dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { - if (OBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap(); - if (OBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap(); + return cloneBitwiseIVUser(DU); + } +} + +Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n"); + + // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything + // about the narrow operand yet so must insert a [sz]ext. It is probably loop + // invariant and will be folded or hoisted. If it actually comes from a + // widened IV, it should be removed during a future call to widenIVUse.
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + IsSigned, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + IsSigned, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; +} + +Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); + + unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1; + + // We're trying to find X such that + // + // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X + // + // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef), + // and check using SCEV if any of them are correct. + + // Returns true if extending NonIVNarrowDef according to `SignExt` is a + // correct solution to X. + auto GuessNonIVOperand = [&](bool SignExt) { + const SCEV *WideLHS; + const SCEV *WideRHS; + + auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) { + if (SignExt) + return SE->getSignExtendExpr(S, Ty); + return SE->getZeroExtendExpr(S, Ty); + }; + + if (IVOpIdx == 0) { + WideLHS = SE->getSCEV(WideDef); + const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1)); + WideRHS = GetExtend(NarrowRHS, WideType); + } else { + const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0)); + WideLHS = GetExtend(NarrowLHS, WideType); + WideRHS = SE->getSCEV(WideDef); + } + + // WideUse is "WideDef `op.wide` X" as described in the comment. + const SCEV *WideUse = nullptr; + + switch (NarrowUse->getOpcode()) { + default: + llvm_unreachable("No other possibility!"); + + case Instruction::Add: + WideUse = SE->getAddExpr(WideLHS, WideRHS); + break; + + case Instruction::Mul: + WideUse = SE->getMulExpr(WideLHS, WideRHS); + break; + + case Instruction::UDiv: + WideUse = SE->getUDivExpr(WideLHS, WideRHS); + break; + + case Instruction::Sub: + WideUse = SE->getMinusSCEV(WideLHS, WideRHS); + break; } - return WideBO; + + return WideUse == WideAR; + }; + + bool SignExtend = IsSigned; + if (!GuessNonIVOperand(SignExtend)) { + SignExtend = !SignExtend; + if (!GuessNonIVOperand(SignExtend)) + return nullptr; } + + Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + SignExtend, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + SignExtend, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; } -const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, +const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const { if (OpCode == Instruction::Add) return SE->getAddExpr(LHS, RHS); @@ -1022,7 +1100,7 @@ const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, /// operands. 
Generate the SCEV value for the widened operation without /// actually modifying the IR yet. If the expression after extending the /// operands is an AddRec for this loop, return it. -const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { +const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) { // Handle the common case of add<nsw/nuw> const unsigned OpCode = DU.NarrowUse->getOpcode(); @@ -1062,19 +1140,18 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { if (ExtendOperIdx == 0) std::swap(lhs, rhs); const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode)); + dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode)); if (!AddRec || AddRec->getLoop() != L) return nullptr; return AddRec; } -/// GetWideRecurrence - Is this instruction potentially interesting for further -/// simplification after widening it's type? In other words, can the -/// extend be safely hoisted out of the loop with SCEV reducing the value to a -/// recurrence on the same loop. If so, return the sign or zero extended -/// recurrence. Otherwise return NULL. -const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { +/// Is this instruction potentially interesting for further simplification after +/// widening its type? In other words, can the extend be safely hoisted out of +/// the loop with SCEV reducing the value to a recurrence on the same loop? If +/// so, return the sign or zero extended recurrence. Otherwise return NULL. +const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return nullptr; @@ -1097,10 +1174,11 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { /// This IV user cannot be widen. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. -static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) { DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1108,13 +1186,27 @@ static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { /// If the narrow use is a compare instruction, then widen the compare // (and possibly the other operand). The extend operation is hoisted into the // loop preheader as far as possible. -bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { +bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse); if (!Cmp) return false; - // Sign of IV user and compare must match. - if (IsSigned != CmpInst::isSigned(Cmp->getPredicate())) + // We can legally widen the comparison in the following two cases: + // + // - The signedness of the IV extension and comparison match + // + // - The narrow IV is always positive (and thus its sign extension is equal + // to its zero extension). For instance, let's say we're zero extending + // %narrow for the following use + // + // icmp slt i32 %narrow, %val ... (A) + // + // and %narrow is always positive.
Then + // + // (A) == icmp slt i32 sext(%narrow), sext(%val) + // == icmp slt i32 zext(%narrow), sext(%val) + + if (!(DU.NeverNegative || IsSigned == Cmp->isSigned())) return false; Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); @@ -1123,20 +1215,21 @@ bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); // Widen the other operand of the compare, if necessary. if (CastWidth < IVWidth) { - Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp); + Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp); DU.NarrowUse->replaceUsesOfWith(Op, ExtOp); } return true; } -/// WidenIVUse - Determine whether an individual user of the narrow IV can be -/// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { +/// Determine whether an individual user of the narrow IV can be widened. If so, +/// return the wide clone of the user. +Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) { @@ -1145,13 +1238,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); else { PHINode *WidePhi = PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", UsePhi); WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); - IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt()); + IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt()); Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); UsePhi->replaceAllUsesWith(Trunc); DeadInsts.emplace_back(UsePhi); @@ -1200,20 +1293,20 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { } // Does this user itself evaluate to a recurrence after widening? - const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); + const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse); if (!WideAddRec) - WideAddRec = GetExtendedOperandRecurrence(DU); + WideAddRec = getExtendedOperandRecurrence(DU); if (!WideAddRec) { // If use is a loop condition, try to promote the condition instead of // truncating the IV first. - if (WidenLoopCompare(DU)) + if (widenLoopCompare(DU)) return nullptr; // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); return nullptr; } // Assume block terminators cannot evaluate to a recurrence. 
We can't to @@ -1228,7 +1321,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { - WideUse = CloneIVUser(DU); + WideUse = cloneIVUser(DU, WideAddRec); if (!WideUse) return nullptr; } @@ -1248,9 +1341,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return WideUse; } -/// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. +/// Add eligible users of NarrowDef to NarrowIVUsers. /// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { + const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef); + bool NeverNegative = + SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV, + SE->getConstant(NarrowSCEV->getType(), 0)); for (User *U : NarrowDef->users()) { Instruction *NarrowUser = cast<Instruction>(U); @@ -1258,21 +1355,21 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { if (!Widened.insert(NarrowUser).second) continue; - NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); + NarrowIVUsers.push_back( + NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative)); } } -/// CreateWideIV - Process a single induction variable. First use the -/// SCEVExpander to create a wide induction variable that evaluates to the same -/// recurrence as the original narrow IV. Then use a worklist to forward -/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all -/// interesting IV users, the narrow IV will be isolated for removal by -/// DeleteDeadPHIs. +/// Process a single induction variable. First use the SCEVExpander to create a +/// wide induction variable that evaluates to the same recurrence as the +/// original narrow IV. Then use a worklist to forward traverse the narrow IV's +/// def-use chain. After widenIVUse has processed all interesting IV users, the +/// narrow IV will be isolated for removal by DeleteDeadPHIs. /// /// It would be simpler to delete uses as they are processed, but we must avoid /// invalidating SCEV expressions. /// -PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { +PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); if (!AddRec) @@ -1302,11 +1399,11 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // either find an existing phi or materialize a new one. Either way, we // expect a well-formed cyclic phi-with-increments. i.e. any operand not part // of the phi-SCC dominates the loop entry. - Instruction *InsertPt = L->getHeader()->begin(); + Instruction *InsertPt = &L->getHeader()->front(); WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt)); // Remembering the WideIV increment generated by SCEVExpander allows - // WidenIVUse to reuse it when widening the narrow IV's increment. We don't + // widenIVUse to reuse it when widening the narrow IV's increment. We don't // employ a general reuse mechanism because the call above is the only call to // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses. if (BasicBlock *LatchBlock = L->getLoopLatch()) { @@ -1329,13 +1426,13 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Process a def-use edge. This may replace the use, so don't hold a // use_iterator across it. 
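For the arithmetic cases, cloneIVUser defers to cloneArithmeticIVUser (above), which cannot tell locally whether the non-IV operand wants sign- or zero-extension; it guesses both and keeps the one whose SCEV matches the wide recurrence. The same arbitration reduced to concrete numbers, as a minimal sketch where plain C++ stands in for SCEV (values illustrative):

#include <cassert>
#include <cstdint>

int main() {
  int8_t iv = 100, c = -6;       // narrow IV and the non-IV operand
  int8_t narrow = iv + c;        // the narrow add; no i8 overflow here
  int32_t wideIV = (int32_t)iv;  // sext-widened IV (the IsSigned case)

  int32_t guessSext = wideIV + (int32_t)c;           // widen c with sext
  int32_t guessZext = wideIV + (int32_t)(uint8_t)c;  // widen c with zext

  // The correct guess reproduces the extended narrow result; in the pass,
  // comparing SCEV expressions against WideAR performs this arbitration.
  int32_t expected = (int32_t)narrow;
  assert(guessSext == expected);  // sext(c) is the right answer here
  assert(guessZext != expected);  // zext(c) would compute a wrong value
  return 0;
}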
- Instruction *WideUse = WidenIVUse(DU, Rewriter); + Instruction *WideUse = widenIVUse(DU, Rewriter); // Follow all def-use edges from the previous narrow use. if (WideUse) pushNarrowIVUsers(DU.NarrowUse, WideUse); - // WidenIVUse may have removed the def-use edge. + // widenIVUse may have removed the def-use edge. if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } @@ -1352,38 +1449,38 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// namespace { - class IndVarSimplifyVisitor : public IVVisitor { - ScalarEvolution *SE; - const TargetTransformInfo *TTI; - PHINode *IVPhi; - - public: - WideIVInfo WI; - - IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, - const TargetTransformInfo *TTI, - const DominatorTree *DTree) - : SE(SCEV), TTI(TTI), IVPhi(IV) { - DT = DTree; - WI.NarrowIV = IVPhi; - if (ReduceLiveIVs) - setSplitOverflowIntrinsics(); - } +class IndVarSimplifyVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetTransformInfo *TTI; + PHINode *IVPhi; - // Implement the interface used by simplifyUsersOfIV. - void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } - }; +public: + WideIVInfo WI; + + IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, + const TargetTransformInfo *TTI, + const DominatorTree *DTree) + : SE(SCEV), TTI(TTI), IVPhi(IV) { + DT = DTree; + WI.NarrowIV = IVPhi; + if (ReduceLiveIVs) + setSplitOverflowIntrinsics(); + } + + // Implement the interface used by simplifyUsersOfIV. + void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } +}; } -/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV -/// users. Each successive simplification may push more users which may -/// themselves be candidates for simplification. +/// Iteratively perform simplification on a worklist of IV users. Each +/// successive simplification may push more users which may themselves be +/// candidates for simplification. /// /// Sign/Zero extend elimination is interleaved with IV simplification. /// -void IndVarSimplify::SimplifyAndExtend(Loop *L, +void IndVarSimplify::simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, - LPPassManager &LPM) { + LoopInfo *LI) { SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; @@ -1400,14 +1497,14 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyAndExtend. + // other SCEV based analysis prior to simplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. 
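The NeverNegative flag recorded by pushNarrowIVUsers above rests on a small identity: for a provably non-negative narrow value, sign- and zero-extension agree, which is what lets widenLoopCompare feed a zext-widened IV into a signed comparison. A one-loop check of the identity, exhaustive over i8 (plain C++):

#include <cassert>
#include <cstdint>

int main() {
  // For every non-negative narrow value, sext and zext coincide, so a
  // zext-widened IV may still participate in a signed compare.
  for (int v = 0; v <= 127; ++v) {
    int8_t narrow = (int8_t)v;
    assert((int32_t)narrow == (int32_t)(uint8_t)narrow);
  }
  return 0;
}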
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); + Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor); if (Visitor.WI.WidestNativeType) { WideIVs.push_back(Visitor.WI); @@ -1416,7 +1513,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, for (; !WideIVs.empty(); WideIVs.pop_back()) { WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); - if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { + if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } @@ -1425,12 +1522,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, } //===----------------------------------------------------------------------===// -// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +// linearFunctionTestReplace and its kin. Rewrite the loop exit condition. //===----------------------------------------------------------------------===// -/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken -/// count expression can be safely and cheaply expanded into an instruction -/// sequence that can be used by LinearFunctionTestReplace. +/// Return true if this loop's backedge taken count expression can be safely and +/// cheaply expanded into an instruction sequence that can be used by +/// linearFunctionTestReplace. /// /// TODO: This fails for pointer-type loop counters with greater than one byte /// strides, consequently preventing LFTR from running. For the purpose of LFTR @@ -1461,8 +1558,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE, return true; } -/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop -/// invariant value to the phi. +/// Return the loop header phi IFF IncV adds a loop invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { Instruction *IncI = dyn_cast<Instruction>(IncV); if (!IncI) @@ -1513,8 +1609,8 @@ static ICmpInst *getLoopTest(Loop *L) { return dyn_cast<ICmpInst>(BI->getCondition()); } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. +/// linearFunctionTestReplace policy. Return true unless we can show that the +/// current exit test is already sufficiently canonical. static bool needsLFTR(Loop *L, DominatorTree *DT) { // Do LFTR to simplify the exit condition to an ICMP. ICmpInst *Cond = getLoopTest(L); @@ -1574,10 +1670,10 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, return false; // Optimistically handle other instructions. - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { - if (!Visited.insert(*OI).second) + for (Value *Op : I->operands()) { + if (!Visited.insert(Op).second) continue; - if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) + if (!hasConcreteDefImpl(Op, Visited, Depth+1)) return false; } return true; @@ -1594,8 +1690,8 @@ static bool hasConcreteDef(Value *V) { return hasConcreteDefImpl(V, Visited, 0); } -/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to -/// be rewritten) loop exit test. +/// Return true if this IV has any uses other than the (soon to be rewritten) +/// loop exit test. 
static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); Value *IncV = Phi->getIncomingValue(LatchIdx); @@ -1608,7 +1704,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { return true; } -/// FindLoopCounter - Find an affine IV in canonical form. +/// Find an affine IV in canonical form. /// /// BECount may be an i8* pointer type. The pointer difference is already /// valid count without scaling the address stride, so it remains a pointer @@ -1702,8 +1798,8 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, return BestPhi; } -/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that -/// holds the RHS of the new loop test. +/// Help linearFunctionTestReplace by generating a value that holds the RHS of +/// the new loop test. static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, SCEVExpander &Rewriter, ScalarEvolution *SE) { const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); @@ -1785,13 +1881,13 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, } } -/// LinearFunctionTestReplace - This method rewrites the exit condition of the -/// loop to be a canonical != comparison against the incremented loop induction -/// variable. This pass is able to rewrite the exit tests of any loop where the -/// SCEV analysis can determine a loop-invariant trip count of the loop, which -/// is actually a much broader range than just linear tests. +/// This method rewrites the exit condition of the loop to be a canonical != +/// comparison against the incremented loop induction variable. This pass is +/// able to rewrite the exit tests of any loop where the SCEV analysis can +/// determine a loop-invariant trip count of the loop, which is actually a much +/// broader range than just linear tests. Value *IndVarSimplify:: -LinearFunctionTestReplace(Loop *L, +linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter) { @@ -1809,7 +1905,7 @@ LinearFunctionTestReplace(Loop *L, // This addition may overflow, which is valid as long as the comparison is // truncated to BackedgeTakenCount->getType(). IVCount = SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); + SE->getOne(BackedgeTakenCount->getType())); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. @@ -1847,8 +1943,8 @@ LinearFunctionTestReplace(Loop *L, const SCEV *ARStep = AR->getStepRecurrence(*SE); // For constant IVCount, avoid truncation. if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { - const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); - APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); + const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt(); + APInt Count = cast<SCEVConstant>(IVCount)->getAPInt(); // Note that the post-inc value of BackedgeTakenCount may have overflowed // above such that IVCount is now zero. if (IVCount != BackedgeTakenCount && Count == 0) { @@ -1886,21 +1982,21 @@ LinearFunctionTestReplace(Loop *L, } //===----------------------------------------------------------------------===// -// SinkUnusedInvariants. A late subpass to cleanup loop preheaders. +// sinkUnusedInvariants. A late subpass to cleanup loop preheaders. 
//===----------------------------------------------------------------------===// /// If there's a single exit block, sink any loop-invariant values that /// were defined in the preheader but not used inside the loop into the /// exit block to reduce register pressure in the loop. -void IndVarSimplify::SinkUnusedInvariants(Loop *L) { +void IndVarSimplify::sinkUnusedInvariants(Loop *L) { BasicBlock *ExitBlock = L->getExitBlock(); if (!ExitBlock) return; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); - BasicBlock::iterator I = Preheader->getTerminator(); + Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt(); + BasicBlock::iterator I(Preheader->getTerminator()); while (I != Preheader->begin()) { --I; // New instructions were inserted at the end of the preheader. @@ -1920,8 +2016,8 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; - // Skip landingpad instructions. - if (isa<LandingPadInst>(I)) + // Skip eh pad instructions. + if (I->isEHPad()) continue; // Don't sink alloca: we never want to sink static alloca's out of the @@ -1953,7 +2049,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { continue; // Otherwise, sink it to the exit block. - Instruction *ToMove = I; + Instruction *ToMove = &*I; bool Done = false; if (I != Preheader->begin()) { @@ -1994,7 +2090,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return false; LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; @@ -2007,7 +2103,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. - RewriteNonIntegerIVs(L); + rewriteNonIntegerIVs(L); const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); @@ -2024,7 +2120,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. Rewriter.disableCanonicalMode(); - SimplifyAndExtend(L, Rewriter, LPM); + simplifyAndExtend(L, Rewriter, LI); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -2034,7 +2130,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // if (ReplaceExitValue != NeverRepl && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) - RewriteLoopExitValues(L, Rewriter); + rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); @@ -2054,7 +2150,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // explicitly check any assumptions made by SCEV. Brittle. 
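The canonical form linearFunctionTestReplace (invoked just below) produces, modelled in plain C++ with made-up numbers: the original bounded exit test becomes a != comparison of the incremented IV against a limit expanded from BackedgeTakenCount + 1.

#include <cassert>

int main() {
  const unsigned n = 10;                 // original bound: run while i < n
  const unsigned BECount = n - 1;        // backedge taken 9 times
  const unsigned IVCount = BECount + 1;  // limit for the post-inc IV

  unsigned a = 0, b = 0;
  for (unsigned i = 0; i < n; ++i)       // original exit test
    ++a;

  unsigned i = 0;
  do {                                   // canonical latch-style form
    ++b;
    ++i;                                 // i.next
  } while (i != IVCount);                // rewritten test: i.next != IVCount
  assert(a == b && a == n);
  return 0;
}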
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); if (!AR || AR->getLoop()->getLoopPreheader()) - (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } } @@ -2074,13 +2170,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop-invariant instructions in the preheader that aren't used in the // loop may be sunk below the loop to reduce register pressure. - SinkUnusedInvariants(L); + sinkUnusedInvariants(L); // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader(), TLI); + // Check a post-condition. - assert(L->isLCSSAForm(*DT) && - "Indvars did not leave the loop in lcssa form!"); + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index cbdacad8f28b..dea61f6ff3d7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -214,8 +214,8 @@ public: AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addRequired<BranchProbabilityInfo>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<BranchProbabilityInfoWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -224,8 +224,15 @@ public: char InductiveRangeCheckElimination::ID = 0; } -INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", - "Inductive range check elimination", false, false) +INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) const char *InductiveRangeCheck::rangeCheckKindToStr( InductiveRangeCheck::RangeCheckKind RCK) { @@ -1044,9 +1051,9 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( auto BBInsertLocation = std::next(Function::iterator(LS.Latch)); RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", - &F, BBInsertLocation); + &F, &*BBInsertLocation); RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, - BBInsertLocation); + &*BBInsertLocation); BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); bool Increasing = LS.IndVarIncreasing; @@ -1399,8 +1406,9 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { LLVMContext &Context = Preheader->getContext(); InductiveRangeCheck::AllocatorTy IRCAlloc; SmallVector<InductiveRangeCheck *, 16> RangeChecks; - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); - BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + BranchProbabilityInfo &BPI = + getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); for (auto BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) diff --git 
a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 1130d228acb8..087ce8ac50d4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -18,15 +18,22 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -36,6 +43,8 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <memory> using namespace llvm; #define DEBUG_TYPE "jump-threading" @@ -49,6 +58,13 @@ BBDuplicateThreshold("jump-threading-threshold", cl::desc("Max block size to duplicate for jump threading"), cl::init(6), cl::Hidden); +static cl::opt<unsigned> +ImplicationSearchThreshold( + "jump-threading-implication-search-threshold", + cl::desc("The number of predecessors to search for a stronger " + "condition to use to thread over a weaker condition"), + cl::init(3), cl::Hidden); + namespace { // These are at global scope so static functions can use them too. 
typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; @@ -80,6 +96,9 @@ namespace { class JumpThreading : public FunctionPass { TargetLibraryInfo *TLI; LazyValueInfo *LVI; + std::unique_ptr<BlockFrequencyInfo> BFI; + std::unique_ptr<BranchProbabilityInfo> BPI; + bool HasProfileData; #ifdef NDEBUG SmallPtrSet<BasicBlock*, 16> LoopHeaders; #else @@ -114,9 +133,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } + void releaseMemory() override { + BFI.reset(); + BPI.reset(); + } + void FindLoopHeaders(Function &F); bool ProcessBlock(BasicBlock *BB); bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, @@ -134,9 +159,16 @@ namespace { bool ProcessBranchOnPHI(PHINode *PN); bool ProcessBranchOnXOR(BinaryOperator *BO); + bool ProcessImpliedCondition(BasicBlock *BB); bool SimplifyPartiallyRedundantLoad(LoadInst *LI); bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); + + private: + BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, + const char *Suffix); + void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, + BasicBlock *NewBB, BasicBlock *SuccBB); }; } @@ -160,11 +192,21 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); + BFI.reset(); + BPI.reset(); + // When profile data is available, we need to update edge weights after + // successful jump threading, which requires both BPI and BFI being available. + HasProfileData = F.getEntryCount().hasValue(); + if (HasProfileData) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } // Remove unreachable blocks from function as they may result in infinite // loop. We do threading if we found something profitable. Jump threading a // branch can create other opportunities. If these opportunities form a cycle - // i.e. if any jump treading is undoing previous threading in the path, then + // i.e. if any jump threading is undoing previous threading in the path, then // we will loop forever. We take care of this issue by not jump threading for // back edges. This works for normal cases but not for unreachable blocks as // they may have cycle with no back edge. @@ -176,7 +218,7 @@ bool JumpThreading::runOnFunction(Function &F) { do { Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E;) { - BasicBlock *BB = I; + BasicBlock *BB = &*I; // Thread all of the branches we can over this block. while (ProcessBlock(BB)) Changed = true; @@ -239,11 +281,26 @@ bool JumpThreading::runOnFunction(Function &F) { static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, unsigned Threshold) { /// Ignore PHI nodes, these will be flattened when duplication happens. - BasicBlock::const_iterator I = BB->getFirstNonPHI(); + BasicBlock::const_iterator I(BB->getFirstNonPHI()); // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. + unsigned Bonus = 0; + const TerminatorInst *BBTerm = BB->getTerminator(); + // Threading through a switch statement is particularly profitable. 
If this + // block ends in a switch, decrease its cost to make it more likely to happen. + if (isa<SwitchInst>(BBTerm)) + Bonus = 6; + + // The same holds for indirect branches, but slightly more so. + if (isa<IndirectBrInst>(BBTerm)) + Bonus = 8; + + // Bump the threshold up so the early exit from the loop doesn't skip the + // terminator-based Size adjustment at the end. + Threshold += Bonus; + // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; @@ -260,6 +317,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, if (isa<BitCastInst>(I) && I->getType()->isPointerTy()) continue; + // Bail out if this instruction gives back a token type, it is not possible + // to duplicate it if it is used outside this BB. + if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) + return ~0U; + // All other instructions count for at least one unit. ++Size; @@ -268,7 +330,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->cannotDuplicate()) + if (CI->cannotDuplicate() || CI->isConvergent()) // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated. return ~0U; @@ -279,16 +341,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, } } - // Threading through a switch statement is particularly profitable. If this - // block ends in a switch, decrease its cost to make it more likely to happen. - if (isa<SwitchInst>(I)) - Size = Size > 6 ? Size-6 : 0; - - // The same holds for indirect branches, but slightly more so. - if (isa<IndirectBrInst>(I)) - Size = Size > 8 ? Size-8 : 0; - - return Size; + return Size > Bonus ? Size - Bonus : 0; } /// FindLoopHeaders - We do not want jump threading to turn proper loop @@ -669,7 +722,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - if (SinglePred->getTerminator()->getNumSuccessors() == 1 && + const TerminatorInst *TI = SinglePred->getTerminator(); + if (!TI->isExceptional() && TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) @@ -761,7 +815,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. We only handle comparisons // against a constant at this time. - // TODO: This should be extended to handle switches as well. + // TODO: This should be extended to handle switches as well. BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); if (CondBr && CondConst && CondBr->isConditional()) { @@ -829,9 +883,40 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst)); + // Search for a stronger dominating condition that can be used to simplify a + // conditional branch leaving BB. 
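The new ProcessImpliedCondition below folds exactly the situation the long-standing TODO it replaces described: when the block's only path in already established a stronger condition, the weaker branch can be rewritten to its taken successor. A hand-folded sketch (plain C++, values illustrative):

#include <cassert>

static int original(int x) {
  if (x == 4) {       // predecessor's (stronger) condition
    if (x > 0)        // implied: always true on this path
      return 1;
    return 2;         // unreachable when x == 4
  }
  return 0;
}

static int folded(int x) {
  if (x == 4)
    return 1;         // inner branch replaced by its true successor
  return 0;
}

int main() {
  for (int x = -2; x <= 6; ++x)
    assert(original(x) == folded(x));
  return 0;
}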
+ if (ProcessImpliedCondition(BB)) + return true; + + return false; +} + +bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { + auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + Value *Cond = BI->getCondition(); + BasicBlock *CurrentBB = BB; + BasicBlock *CurrentPred = BB->getSinglePredecessor(); + unsigned Iter = 0; + + auto &DL = BB->getModule()->getDataLayout(); + + while (CurrentPred && Iter++ < ImplicationSearchThreshold) { + auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); + if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB) + return false; - // TODO: If we have: "br (X > 0)" and we have a predecessor where we know - // "(X == 4)", thread through this block. + if (isImpliedCondition(PBI->getCondition(), Cond, DL)) { + BI->getSuccessor(1)->removePredecessor(BB); + BranchInst::Create(BI->getSuccessor(0), BI); + BI->eraseFromParent(); + return true; + } + CurrentBB = CurrentPred; + CurrentPred = CurrentBB->getSinglePredecessor(); + } return false; } @@ -850,10 +935,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; - // If the load is defined in a landing pad, it can't be partially redundant, - // because the edges between the invoke and the landing pad cannot have other + // If the load is defined in an EH pad, it can't be partially redundant, + // because the edges between the invoke and the EH pad cannot have other // instructions between them. - if (LoadBB->isLandingPad()) + if (LoadBB->isEHPad()) return false; Value *LoadedPtr = LI->getOperand(0); @@ -866,11 +951,11 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. - BasicBlock::iterator BBIt = LI; + BasicBlock::iterator BBIt(LI); if (Value *AvailableVal = - FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) { - // If the value if the load is locally available within the block, just use + FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) { + // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; @@ -914,7 +999,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. BBIt = PredBB->end(); AAMDNodes ThisAATags; - Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, + Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, + DefMaxInstsToScan, nullptr, &ThisAATags); if (!PredAvailable) { OneUnavailablePred = PredBB; @@ -968,8 +1054,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { } // Split them out to their own block. - UnavailablePred = - SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); + UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be @@ -995,7 +1080,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Create a PHI node at the start of the block for the PRE'd load value. 
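The transformation SimplifyPartiallyRedundantLoad is building toward here, in source-level terms: once the loaded value is available on every incoming edge, the PHI created below replaces the load entirely. A before/after sketch in plain C++:

#include <cassert>

static int before(int c, int *p) {
  if (c)
    *p = 1;
  return *p;                       // partially redundant load
}

static int after(int c, int *p) {
  int avail;                       // stands in for the PRE'd PHI
  if (c) {
    *p = 1;
    avail = 1;                     // value already known on this edge
  } else {
    avail = *p;                    // load duplicated into the other pred
  }
  return avail;                    // the PHI replaces the original load
}

int main() {
  int x = 5, y = 5;
  assert(before(1, &x) == after(1, &y));
  x = y = 5;
  assert(before(0, &x) == after(0, &y));
  return 0;
}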
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", - LoadBB->begin()); + &LoadBB->front()); PN->takeName(LI); PN->setDebugLoc(LI->getDebugLoc()); @@ -1262,7 +1347,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Into: // BB': // %Y = icmp ne i32 %A, %B - // br i1 %Z, ... + // br i1 %Y, ... PredValueInfoTy XorOpValues; bool isLHS = true; @@ -1387,14 +1472,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // And finally, do it! @@ -1415,6 +1500,13 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BB->getParent(), BB); NewBB->moveAfter(PredBB); + // Set the block frequency of NewBB. + if (HasProfileData) { + auto NewBBFreq = + BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); @@ -1425,7 +1517,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, Instruction *New = BI->clone(); New->setName(BI->getName()); NewBB->getInstList().push_back(New); - ValueMapping[BI] = New; + ValueMapping[&*BI] = New; // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) @@ -1438,7 +1530,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. - BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB); + BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the @@ -1475,8 +1567,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); + SSAUpdate.AddAvailableValue(BB, &*I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&*I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); @@ -1499,11 +1591,98 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // frequently happens because of phi translation. SimplifyInstructionsInBlock(NewBB, TLI); + // Update the edge weight from BB to SuccBB, which should be less than before. + UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); + // Threaded an edge! ++NumThreads; return true; } +/// Create a new basic block that will be the predecessor of BB and successor of +/// all blocks in Preds. When profile data is available, update the frequency of +/// this new block. +BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB, + ArrayRef<BasicBlock *> Preds, + const char *Suffix) { + // Collect the frequencies of all predecessors of BB, which will be used to + // update the edge weight on BB->SuccBB.
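The profile bookkeeping that SplitBlockPreds and UpdateBlockFreqAndEdgeWeight implement, walked through with made-up numbers: the frequency now routed through NewBB is deducted from BB, and BB's outgoing probabilities are renormalized from the residual edge frequencies. A numeric sketch (plain C++, all figures illustrative):

#include <cstdio>

int main() {
  double BBOrigFreq = 100.0;   // freq(BB) before threading
  double NewBBFreq = 60.0;     // freq(PredBB) * prob(PredBB->BB)
  double ProbToSucc = 0.8;     // prob(BB->SuccBB); the other edge gets 0.2

  double BB2SuccFreq = BBOrigFreq * ProbToSucc;   // 80
  double BBNewFreq = BBOrigFreq - NewBBFreq;      // 40: BB after threading
  double SuccResidual = BB2SuccFreq - NewBBFreq;  // 20: BB->SuccBB remainder
  double OtherResidual = BBOrigFreq * 0.2;        // 20: unaffected edge

  // After normalization both outgoing probabilities become 0.5.
  double Total = SuccResidual + OtherResidual;
  printf("freq(BB)=%g prob(BB->SuccBB)=%g prob(BB->Other)=%g\n",
         BBNewFreq, SuccResidual / Total, OtherResidual / Total);
  return 0;
}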
+ BlockFrequency PredBBFreq(0); + if (HasProfileData) + for (auto Pred : Preds) + PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB); + + BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix); + + // Set the block frequency of the newly created PredBB, which is the sum of + // frequencies of Preds. + if (HasProfileData) + BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency()); + return PredBB; +} + +/// Update the block frequency of BB and branch weight and the metadata on the +/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - +/// Freq(PredBB->BB) / Freq(BB->SuccBB). +void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, + BasicBlock *BB, + BasicBlock *NewBB, + BasicBlock *SuccBB) { + if (!HasProfileData) + return; + + assert(BFI && BPI && "BFI & BPI should have been created here"); + + // As the edge from PredBB to BB is deleted, we have to update the block + // frequency of BB. + auto BBOrigFreq = BFI->getBlockFreq(BB); + auto NewBBFreq = BFI->getBlockFreq(NewBB); + auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); + auto BBNewFreq = BBOrigFreq - NewBBFreq; + BFI->setBlockFreq(BB, BBNewFreq.getFrequency()); + + // Collect updated outgoing edges' frequencies from BB and use them to update + // edge probabilities. + SmallVector<uint64_t, 4> BBSuccFreq; + for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { + auto SuccFreq = (*I == SuccBB) + ? BB2SuccBBFreq - NewBBFreq + : BBOrigFreq * BPI->getEdgeProbability(BB, *I); + BBSuccFreq.push_back(SuccFreq.getFrequency()); + } + + uint64_t MaxBBSuccFreq = + *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); + + SmallVector<BranchProbability, 4> BBSuccProbs; + if (MaxBBSuccFreq == 0) + BBSuccProbs.assign(BBSuccFreq.size(), + {1, static_cast<unsigned>(BBSuccFreq.size())}); + else { + for (uint64_t Freq : BBSuccFreq) + BBSuccProbs.push_back( + BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq)); + // Normalize edge probabilities so that they sum up to one. + BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), + BBSuccProbs.end()); + } + + // Update edge probabilities in BPI. + for (int I = 0, E = BBSuccProbs.size(); I < E; I++) + BPI->setEdgeProbability(BB, I, BBSuccProbs[I]); + + if (BBSuccProbs.size() >= 2) { + SmallVector<uint32_t, 4> Weights; + for (auto Prob : BBSuccProbs) + Weights.push_back(Prob.getNumerator()); + + auto TI = BB->getTerminator(); + TI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); + } +} + /// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch /// to BB which contains an i1 PHI node and a conditional branch on that PHI. /// If we can duplicate the contents of BB up into PredBB do so now, this @@ -1530,14 +1709,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // Okay, we decided to do this! 
Clone all the instructions in BB onto the end @@ -1581,12 +1760,12 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, if (Value *IV = SimplifyInstruction(New, BB->getModule()->getDataLayout())) { delete New; - ValueMapping[BI] = IV; + ValueMapping[&*BI] = IV; } else { // Otherwise, insert the new instruction into the block. New->setName(BI->getName()); - PredBB->getInstList().insert(OldPredBranch, New); - ValueMapping[BI] = New; + PredBB->getInstList().insert(OldPredBranch->getIterator(), New); + ValueMapping[&*BI] = New; } } @@ -1628,8 +1807,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); + SSAUpdate.AddAvailableValue(BB, &*I); + SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&*I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 43fc50e588f8..6d70cdc3ade2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -34,10 +34,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -118,9 +121,12 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -164,9 +170,12 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) Pass *llvm::createLICMPass() { return new LICM(); } @@ -183,7 +192,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Get our Loop and Alias Analysis information... 
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -264,9 +273,10 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // FIXME: This is really heavy handed. It would be a bit better to use an // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. - if (Changed) - formLCSSARecursively(*L, *DT, LI, - getAnalysisIfAvailable<ScalarEvolution>()); + if (Changed) { + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr); + } } // Check that neither this loop nor its parent have had LCSSA broken. LICM is @@ -402,7 +412,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, } /// Computes loop safety information, checks loop body & header -/// for the possiblity of may throw exception. +/// for the possibility of may throw exception. /// void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { assert(CurLoop != nullptr && "CurLoop cant be null"); @@ -410,7 +420,7 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { // Setting default safety values. SafetyInfo->MayThrow = false; SafetyInfo->HeaderMayThrow = false; - // Iterate over header and compute dafety info. + // Iterate over header and compute safety info. for (BasicBlock::iterator I = Header->begin(), E = Header->end(); (I != E) && !SafetyInfo->HeaderMayThrow; ++I) SafetyInfo->HeaderMayThrow |= I->mayThrow(); @@ -445,7 +455,7 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; if (LI->getType()->isSized()) - Size = AA->getTypeStoreSize(LI->getType()); + Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType()); AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); @@ -457,10 +467,21 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, return false; // Handle simple cases by querying alias analysis. - AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); - if (Behavior == AliasAnalysis::DoesNotAccessMemory) + FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); + if (Behavior == FMRB_DoesNotAccessMemory) return true; if (AliasAnalysis::onlyReadsMemory(Behavior)) { + // A readonly argmemonly function only reads from memory pointed to by + // its arguments with arbitrary offsets. If we can prove there are no + // writes to this memory in the loop, we can hoist or sink. + if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) { + for (Value *Op : CI->arg_operands()) + if (Op->getType()->isPointerTy() && + pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize, + AAMDNodes(), CurAST)) + return false; + return true; + } // If this call only reads from memory and there are no writes to memory // in the loop, we can hoist or sink the call as appropriate.
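What the new argmemonly case buys, in source-level terms: a call that only reads memory reachable from its pointer arguments is loop-invariant whenever no store in the loop may alias those arguments, so LICM may evaluate it once in the preheader. A hand-hoisted sketch (plain C++, names illustrative):

#include <cassert>

static int readOnly(const int *p) { return *p; }  // reads only arg memory

int main() {
  int data = 7;
  int acc = 0;

  // After LICM: the call runs once in the "preheader", because nothing
  // in the loop below writes to &data.
  int hoisted = readOnly(&data);
  for (int i = 0; i < 4; ++i)
    acc += hoisted;              // was: acc += readOnly(&data);

  assert(acc == 28);
  return 0;
}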
bool FoundMod = false; @@ -566,7 +587,7 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I, if (!OLoop->contains(&PN)) { PHINode *OpPN = PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), - OInst->getName() + ".lcssa", ExitBlock.begin()); + OInst->getName() + ".lcssa", &ExitBlock.front()); for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); *OI = OpPN; @@ -651,6 +672,10 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) { // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); + // Metadata can be dependent on the condition we are hoisting above. + // Conservatively strip all metadata on the instruction. + I.dropUnknownNonDebugMetadata(); + if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumHoisted; @@ -730,9 +755,9 @@ namespace { if (!L->contains(BB)) { // We need to create an LCSSA PHI node for the incoming value and // store that. - PHINode *PN = PHINode::Create( - I->getType(), PredCache.size(BB), - I->getName() + ".lcssa", BB->begin()); + PHINode *PN = + PHINode::Create(I->getType(), PredCache.size(BB), + I->getName() + ".lcssa", &BB->front()); for (BasicBlock *Pred : PredCache.get(BB)) PN->addIncoming(I, Pred); return PN; @@ -942,7 +967,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, CurLoop->getUniqueExitBlocks(ExitBlocks); InsertPts.resize(ExitBlocks.size()); for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt(); + InsertPts[i] = &*ExitBlocks[i]->getFirstInsertionPt(); } // We use the SSAUpdater interface to insert phi nodes as required. @@ -973,7 +998,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, return Changed; } -/// Simple Analysis hook. Clone alias set info. +/// Simple analysis hook. Clone alias set info. 
/// void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp index c19cd19059b2..1648878b0628 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -56,7 +57,7 @@ class LoadCombine : public BasicBlockPass { public: LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); + initializeLoadCombinePass(*PassRegistry::getPassRegistry()); } using llvm::Pass::doInitialization; @@ -223,7 +224,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { if (skipOptnoneFunction(BB)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); IRBuilder<true, TargetFolder> TheBuilder( BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); @@ -262,8 +263,8 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } char LoadCombine::ID = 0; @@ -274,7 +275,8 @@ BasicBlockPass *llvm::createLoadCombinePass() { INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 98b068edf582..bc00ff3f3a42 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" @@ -35,18 +36,19 @@ namespace { } // Possibly eliminate loop L if it is dead. 
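The safety condition LoopDeletion enforces, in miniature: a loop whose results are unused may only be deleted when its trip count is provably finite, because removing a potentially infinite loop changes observable behavior. A sketch with illustrative functions:

#include <cassert>

static int deadButFinite(unsigned n) {
  int x = 0;
  for (unsigned i = 0; i < n; ++i)
    x = 1;            // no value escapes: deletable once proven finite
  (void)x;
  return 7;
}

static int maybeInfinite(unsigned step) {
  unsigned i = 0;
  while (i != 4)
    i += step;        // step == 0 never terminates: must NOT be deleted
  return 7;
}

int main() {
  assert(deadButFinite(3) == maybeInfinite(2));
  return 0;
}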
- bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } @@ -64,7 +66,7 @@ INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", @@ -130,7 +132,7 @@ bool LoopDeletion::isLoopDead(Loop *L, /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. -bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; @@ -169,7 +171,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; @@ -242,9 +244,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { for (BasicBlock *BB : blocks) loopInfo.removeBlock(BB); - // The last step is to inform the loop pass manager that we've - // eliminated this loop. - LPM.deleteLoopFromQueue(L); + // The last step is to update LoopInfo now that we've eliminated this loop. + loopInfo.updateUnloop(L); Changed = true; ++NumDeleted; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1b9859b57790..3d3cf3e2890b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <list> @@ -54,6 +55,11 @@ static cl::opt<bool> DistributeNonIfConvertible( "if-convertible by the loop vectorizer"), cl::init(false)); +static cl::opt<unsigned> DistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Distribution")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -164,9 +170,7 @@ public: // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. 
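Why the reverse walk below is enough, in miniature: Unused is collected in def-before-use order, so visiting it backwards erases every user before the value it consumes, and each erase already sees an empty use list. A toy model with an explicit use count (plain C++, names illustrative):

#include <cassert>
#include <map>
#include <vector>

int main() {
  // A tiny def-use chain: value 0 is used by 1, which is used by 2.
  std::map<int, int> operandOf = {{1, 0}, {2, 1}};
  std::map<int, int> useCount = {{0, 1}, {1, 1}, {2, 0}};
  std::vector<int> unused = {0, 1, 2};  // def-before-use order

  for (auto it = unused.rbegin(); it != unused.rend(); ++it) {
    assert(useCount[*it] == 0);         // users already erased: safe
    auto op = operandOf.find(*it);
    if (op != operandOf.end())
      --useCount[op->second];           // release the operand's use
  }
  return 0;
}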
- for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) { - auto *Inst = *I; - + for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); Inst->eraseFromParent(); @@ -373,7 +377,7 @@ public: /// \brief This performs the main chunk of the work of cloning the loops for /// the partitions. - void cloneLoops(Pass *P) { + void cloneLoops() { BasicBlock *OrigPH = L->getLoopPreheader(); // At this point the predecessor of the preheader is either the memcheck // block or the top part of the original preheader. @@ -547,11 +551,11 @@ public: MemoryInstructionDependences( const SmallVectorImpl<Instruction *> &Instructions, - const SmallVectorImpl<Dependence> &InterestingDependences) { + const SmallVectorImpl<Dependence> &Dependences) { Accesses.append(Instructions.begin(), Instructions.end()); DEBUG(dbgs() << "Backward dependences:\n"); - for (auto &Dep : InterestingDependences) + for (auto &Dep : Dependences) if (Dep.isPossiblyBackward()) { // Note that the designations source and destination follow the program // order, i.e. source is always first. (The direction is given by the @@ -567,25 +571,6 @@ private: AccessesType Accesses; }; -/// \brief Returns the instructions that use values defined in the loop. -static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { - SmallVector<Instruction *, 8> UsedOutside; - - for (auto *Block : L->getBlocks()) - // FIXME: I believe that this could use copy_if if the Inst reference could - // be adapted into a pointer. - for (auto &Inst : *Block) { - auto Users = Inst.users(); - if (std::any_of(Users.begin(), Users.end(), [&](User *U) { - auto *Use = cast<Instruction>(U); - return !L->contains(Use->getParent()); - })) - UsedOutside.push_back(&Inst); - } - - return UsedOutside; -} - /// \brief The pass class. class LoopDistribute : public FunctionPass { public: @@ -597,6 +582,7 @@ public: LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); LAA = &getAnalysis<LoopAccessAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Build up a worklist of inner-loops to vectorize. This is necessary as the // act of distributing a loop creates new loops and can invalidate iterators @@ -619,6 +605,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<LoopAccessAnalysis>(); @@ -629,6 +616,45 @@ public: static char ID; private: + /// \brief Filter out checks between pointers from the same partition. + /// + /// \p PtrToPartition contains the partition number for pointers. Partition + /// number -1 means that the pointer is used in multiple partitions. In this + /// case we can't safely omit the check. 
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> + includeOnlyCrossPartitionChecks( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, + const SmallVectorImpl<int> &PtrToPartition, + const RuntimePointerChecking *RtPtrChecking) { + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (unsigned PtrIdx1 : Check.first->Members) + for (unsigned PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) + if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1, PtrIdx2)) + return true; + return false; + }); + + return Checks; + } + /// \brief Try to distribute an inner-most loop. bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -655,9 +681,8 @@ private: DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); return false; } - auto *InterestingDependences = - LAI.getDepChecker().getInterestingDependences(); - if (!InterestingDependences || InterestingDependences->empty()) { + auto *Dependences = LAI.getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) { DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); return false; } @@ -685,7 +710,7 @@ private: // NumUnsafeDependencesActive reaches 0. const MemoryDepChecker &DepChecker = LAI.getDepChecker(); MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), - *InterestingDependences); + *Dependences); int NumUnsafeDependencesActive = 0; for (auto &InstDep : MID) { @@ -735,6 +760,13 @@ private: return false; } + // Don't distribute the loop if we need too many SCEV run-time checks. + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); + if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { + DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + return false; + } + DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from // instructions to partitions. @@ -746,20 +778,25 @@ private: if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) SplitBlock(PH, PH->getTerminator(), DT, LI); - // If we need run-time checks to disambiguate pointers are run-time, version - // the loop now. + // If we need run-time checks, version the loop now. 
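// Aside (illustrative): the partition test that the filter above relies on.
// Two pointer indices can share a partition only when both map to the same
// non-negative partition number; -1 marks a pointer used by more than one
// partition, whose checks can never be dropped. A sketch of those semantics
// (hypothetical helper mirroring arePointersInSamePartition as used above):
static bool inSamePartition(const SmallVectorImpl<int> &PtrToPartition,
                            unsigned PtrIdx1, unsigned PtrIdx2) {
  return PtrToPartition[PtrIdx1] != -1 &&
         PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2];
}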
auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); - LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition); - if (LVer.needsRuntimeChecks()) { + const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); + const auto &AllChecks = RtPtrChecking->getChecks(); + auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, + RtPtrChecking); + + if (!Pred.isAlwaysTrue() || !Checks.empty()) { DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition)); - LVer.versionLoop(this); - LVer.addPHINodes(DefsUsedOutside); + DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LoopVersioning LVer(LAI, L, LI, DT, SE, false); + LVer.setAliasChecks(std::move(Checks)); + LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LVer.versionLoop(DefsUsedOutside); } // Create identical copies of the original loop for each partition and hook // them up sequentially. - Partitions.cloneLoops(this); + Partitions.cloneLoops(); // Now, we remove the instruction from each loop that don't belong to that // partition. @@ -780,6 +817,7 @@ private: LoopInfo *LI; LoopAccessAnalysis *LAA; DominatorTree *DT; + ScalarEvolution *SE; }; } // anonymous namespace @@ -790,6 +828,7 @@ INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a21ca2417ca1..2d577de7c2b8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -31,11 +31,6 @@ // void foo(_Complex float *P) // for (i) { __real__(*P) = 0; __imag__(*P) = 0; } // -// We should enhance this to handle negative strides through memory. -// Alternatively (and perhaps better) we could rely on an earlier pass to force -// forward iteration through memory, which is generally better for cache -// behavior. Negative strides *do* happen for memset/memcpy loops. -// // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). // @@ -44,7 +39,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -67,149 +65,85 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { - class LoopIdiomRecognize; +class LoopIdiomRecognize : public LoopPass { + Loop *CurLoop; + AliasAnalysis *AA; + DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + static char ID; + explicit LoopIdiomRecognize() : LoopPass(ID) { + initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + } - /// This class defines some utility functions for loop idiom recognization. 
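// Aside (illustrative): the loop shapes this pass rewrites, as a C-level
// sketch (hypothetical function; byte counts scale with the element size):
void loopIdiomExamples(int *A, const int *B, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    A[i] = 0;    // strided store of a splattable value -> memset
  for (unsigned i = 0; i != N; ++i)
    A[i] = B[i]; // store fed by a same-strided load -> memcpy
}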
- class LIRUtil { - public: - /// Return true iff the block contains nothing but an uncondition branch - /// (aka goto instruction). - static bool isAlmostEmpty(BasicBlock *); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } - static BranchInst *getBranch(BasicBlock *BB) { - return dyn_cast<BranchInst>(BB->getTerminator()); - } +private: + typedef SmallVector<StoreInst *, 8> StoreList; + StoreList StoreRefs; - /// Derive the precondition block (i.e the block that guards the loop - /// preheader) from the given preheader. - static BasicBlock *getPrecondBb(BasicBlock *PreHead); - }; - - /// This class is to recoginize idioms of population-count conducted in - /// a noncountable loop. Currently it only recognizes this pattern: - /// \code - /// while(x) {cnt++; ...; x &= x - 1; ...} - /// \endcode - class NclPopcountRecognize { - LoopIdiomRecognize &LIR; - Loop *CurLoop; - BasicBlock *PreCondBB; - - typedef IRBuilder<> IRBuilderTy; - - public: - explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); - bool recognize(); - - private: - /// Take a glimpse of the loop to see if we need to go ahead recoginizing - /// the idiom. - bool preliminaryScreen(); - - /// Check if the given conditional branch is based on the comparison - /// between a variable and zero, and if the variable is non-zero, the - /// control yields to the loop entry. If the branch matches the behavior, - /// the variable involved in the comparion is returned. This function will - /// be called to see if the precondition and postcondition of the loop - /// are in desirable form. - Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; - - /// Return true iff the idiom is detected in the loop. and 1) \p CntInst - /// is set to the instruction counting the population bit. 2) \p CntPhi - /// is set to the corresponding phi node. 3) \p Var is set to the value - /// whose population bits are being counted. - bool detectIdiom - (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; - - /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); - - /// Create llvm.ctpop.* intrinsic function. 
- CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); - }; - - class LoopIdiomRecognize : public LoopPass { - Loop *CurLoop; - DominatorTree *DT; - ScalarEvolution *SE; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - public: - static char ID; - explicit LoopIdiomRecognize() : LoopPass(ID) { - initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - DT = nullptr; - SE = nullptr; - TLI = nullptr; - TTI = nullptr; - } + /// \name Countable Loop Idiom Handling + /// @{ - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks); - - bool processLoopStore(StoreInst *SI, const SCEV *BECount); - bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - - bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, - Value *SplatValue, Instruction *TheStore, - const SCEVAddRecExpr *Ev, - const SCEV *BECount); - bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount); - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks); - DominatorTree *getDominatorTree() { - return DT ? DT - : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); - } + void collectStores(BasicBlock *BB); + bool isLegalStore(StoreInst *SI); + bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - ScalarEvolution *getScalarEvolution() { - return SE ? SE : (SE = &getAnalysis<ScalarEvolution>()); - } + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *SplatValue, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride); + bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, + const SCEVAddRecExpr *StoreEv, + const SCEV *BECount, bool NegStride); - TargetLibraryInfo *getTargetLibraryInfo() { - if (!TLI) - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + /// @} + /// \name Noncountable Loop Idiom Handling + /// @{ - return TLI; - } + bool runOnNoncountableLoop(); - const TargetTransformInfo *getTargetTransformInfo() { - return TTI ? TTI - : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *CurLoop->getHeader()->getParent())); - } + bool recognizePopcount(); + void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, + PHINode *CntPhi, Value *Var); - Loop *getLoop() const { return CurLoop; } + /// @} +}; - private: - bool runOnNoncountableLoop(); - bool runOnCountableLoop(); - }; -} +} // End anonymous namespace. 
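// Aside (illustrative): the noncountable idiom targeted by the popcount
// recognizer declared above, as a sketch (hypothetical function name):
unsigned countBits(unsigned X) {
  unsigned Cnt = 0;
  while (X) {   // trip count == number of set bits, not computable by SCEV
    ++Cnt;
    X &= X - 1; // clear the lowest set bit
  }
  return Cnt;   // rewritten to use llvm.ctpop (i.e. __builtin_popcount)
}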
char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", @@ -218,9 +152,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -242,406 +179,64 @@ static void deleteDeadInstruction(Instruction *I, //===----------------------------------------------------------------------===// // -// Implementation of LIRUtil -// -//===----------------------------------------------------------------------===// - -// This function will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader function is -// "almost" empty such that generated intrinsic functions can be moved across -// the preheader and be placed at the end of the precondition block without -// the concern of breaking data dependence. -bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { - if (BranchInst *Br = getBranch(BB)) { - return Br->isUnconditional() && Br == BB->begin(); - } - return false; -} - -BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { - if (BasicBlock *BB = PreHead->getSinglePredecessor()) { - BranchInst *Br = getBranch(BB); - return Br && Br->isConditional() ? BB : nullptr; - } - return nullptr; -} - -//===----------------------------------------------------------------------===// -// -// Implementation of NclPopcountRecognize +// Implementation of LoopIdiomRecognize // //===----------------------------------------------------------------------===// -NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): - LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { -} - -bool NclPopcountRecognize::preliminaryScreen() { - const TargetTransformInfo *TTI = LIR.getTargetTransformInfo(); - if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) - return false; - - // Counting population are usually conducted by few arithmetic instructions. - // Such instructions can be easilly "absorbed" by vacant slots in a - // non-compact loop. Therefore, recognizing popcount idiom only makes sense - // in a compact loop. - - // Give up if the loop has multiple blocks or multiple backedges. - if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) - return false; - - BasicBlock *LoopBody = *(CurLoop->block_begin()); - if (LoopBody->size() >= 20) { - // The loop is too big, bail out. - return false; - } - - // It should have a preheader containing nothing but a goto instruction. - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead)) +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) return false; - // It should have a precondition block where the generated popcount instrinsic - // function will be inserted. 
- PreCondBB = LIRUtil::getPrecondBb(PreHead); - if (!PreCondBB) + CurLoop = L; + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) return false; - return true; -} - -Value *NclPopcountRecognize::matchCondition(BranchInst *Br, - BasicBlock *LoopEntry) const { - if (!Br || !Br->isConditional()) - return nullptr; - - ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); - if (!Cond) - return nullptr; - - ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); - if (!CmpZero || !CmpZero->isZero()) - return nullptr; - - ICmpInst::Predicate Pred = Cond->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || - (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) - return Cond->getOperand(0); - - return nullptr; -} - -bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, - PHINode *&CntPhi, - Value *&Var) const { - // Following code tries to detect this idiom: - // - // if (x0 != 0) - // goto loop-exit // the precondition of the loop - // cnt0 = init-val; - // do { - // x1 = phi (x0, x2); - // cnt1 = phi(cnt0, cnt2); - // - // cnt2 = cnt1 + 1; - // ... - // x2 = x1 & (x1 - 1); - // ... - // } while(x != 0); - // - // loop-exit: - // - - // step 1: Check to see if the look-back branch match this pattern: - // "if (a!=0) goto loop-entry". - BasicBlock *LoopEntry; - Instruction *DefX2, *CountInst; - Value *VarX1, *VarX0; - PHINode *PhiX, *CountPhi; - - DefX2 = CountInst = nullptr; - VarX1 = VarX0 = nullptr; - PhiX = CountPhi = nullptr; - LoopEntry = *(CurLoop->block_begin()); - - // step 1: Check if the loop-back branch is in desirable form. - { - if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry)) - DefX2 = dyn_cast<Instruction>(T); - else - return false; - } - - // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" - { - if (!DefX2 || DefX2->getOpcode() != Instruction::And) - return false; - - BinaryOperator *SubOneOp; - - if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) - VarX1 = DefX2->getOperand(1); - else { - VarX1 = DefX2->getOperand(0); - SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); - } - if (!SubOneOp) - return false; - - Instruction *SubInst = cast<Instruction>(SubOneOp); - ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); - if (!Dec || - !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || - (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) { - return false; - } - } - - // step 3: Check the recurrence of variable X - { - PhiX = dyn_cast<PHINode>(VarX1); - if (!PhiX || - (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { - return false; - } - } - - // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 - { - CountInst = nullptr; - for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), - IterE = LoopEntry->end(); Iter != IterE; Iter++) { - Instruction *Inst = Iter; - if (Inst->getOpcode() != Instruction::Add) - continue; - - ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); - if (!Inc || !Inc->isOne()) - continue; - - PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) - continue; - - // Check if the result of the instruction is live of the loop. 
- bool LiveOutLoop = false; - for (User *U : Inst->users()) { - if ((cast<Instruction>(U))->getParent() != LoopEntry) { - LiveOutLoop = true; break; - } - } - - if (LiveOutLoop) { - CountInst = Inst; - CountPhi = Phi; - break; - } - } - - if (!CountInst) - return false; - } - - // step 5: check if the precondition is in this form: - // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" - { - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader()); - if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) - return false; - - CntInst = CountInst; - CntPhi = CountPhi; - Var = T; - } - - return true; -} - -void NclPopcountRecognize::transform(Instruction *CntInst, - PHINode *CntPhi, Value *Var) { - - ScalarEvolution *SE = LIR.getScalarEvolution(); - TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo(); - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - const DebugLoc DL = CntInst->getDebugLoc(); - - // Assuming before transformation, the loop is following: - // if (x) // the precondition - // do { cnt++; x &= x - 1; } while(x); - - // Step 1: Insert the ctpop instruction at the end of the precondition block - IRBuilderTy Builder(PreCondBr); - Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; - { - PopCnt = createPopcntIntrinsic(Builder, Var, DL); - NewCount = PopCntZext = - Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); - - if (NewCount != PopCnt) - (cast<Instruction>(NewCount))->setDebugLoc(DL); - - // TripCnt is exactly the number of iterations the loop has - TripCnt = NewCount; - - // If the population counter's initial value is not zero, insert Add Inst. - Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); - ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); - if (!InitConst || !InitConst->isZero()) { - NewCount = Builder.CreateAdd(NewCount, CntInitVal); - (cast<Instruction>(NewCount))->setDebugLoc(DL); - } - } - - // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to - // "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic - // function would be partial dead code, and downstream passes will drag - // it back from the precondition block to the preheader. - { - ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); - - Value *Opnd0 = PopCntZext; - Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); - if (PreCond->getOperand(0) != Var) - std::swap(Opnd0, Opnd1); - - ICmpInst *NewPreCond = - cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); - PreCondBr->setCondition(NewPreCond); - - RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); - } - - // Step 3: Note that the population count is exactly the trip count of the - // loop in question, which enble us to to convert the loop from noncountable - // loop into a countable one. The benefit is twofold: - // - // - If the loop only counts population, the entire loop become dead after - // the transformation. It is lots easier to prove a countable loop dead - // than to prove a noncountable one. (In some C dialects, a infite loop - // isn't dead even if it computes nothing useful. In general, DCE needs - // to prove a noncountable loop finite before safely delete it.) - // - // - If the loop also performs something else, it remains alive. 
- // Since it is transformed to countable form, it can be aggressively - // optimized by some optimizations which are in general not applicable - // to a noncountable loop. - // - // After this step, this loop (conceptually) would look like following: - // newcnt = __builtin_ctpop(x); - // t = newcnt; - // if (x) - // do { cnt++; x &= x-1; t--) } while (t > 0); - BasicBlock *Body = *(CurLoop->block_begin()); - { - BranchInst *LbBr = LIRUtil::getBranch(Body); - ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); - Type *Ty = TripCnt->getType(); - - PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); - - Builder.SetInsertPoint(LbCond); - Value *Opnd1 = cast<Value>(TcPhi); - Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); - Instruction *TcDec = - cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); - - TcPhi->addIncoming(TripCnt, PreHead); - TcPhi->addIncoming(TcDec, Body); - - CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? - CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; - LbCond->setPredicate(Pred); - LbCond->setOperand(0, TcDec); - LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); - } - - // Step 4: All the references to the original population counter outside - // the loop are replaced with the NewCount -- the value returned from - // __builtin_ctpop(). - CntInst->replaceUsesOutsideBlock(NewCount, Body); - - // step 5: Forget the "non-computable" trip-count SCEV associated with the - // loop. The loop would otherwise not be deleted even if it becomes empty. - SE->forgetLoop(CurLoop); -} - -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, - Value *Val, DebugLoc DL) { - Value *Ops[] = { Val }; - Type *Tys[] = { Val->getType() }; - - Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); - Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); - CallInst *CI = IRBuilder.CreateCall(Func, Ops); - CI->setDebugLoc(DL); - - return CI; -} - -/// recognize - detect population count idiom in a non-countable loop. If -/// detected, transform the relevant code to popcount intrinsic function -/// call, and return true; otherwise, return false. -bool NclPopcountRecognize::recognize() { - - if (!LIR.getTargetTransformInfo()) + // Disable loop idiom recognition if the function's name is a common idiom. 
+  StringRef Name = L->getHeader()->getParent()->getName();
+  if (Name == "memset" || Name == "memcpy")
     return false;
 
-  LIR.getScalarEvolution();
-
-  if (!preliminaryScreen())
-    return false;
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+      *CurLoop->getHeader()->getParent());
+  DL = &CurLoop->getHeader()->getModule()->getDataLayout();
 
-  Instruction *CntInst;
-  PHINode *CntPhi;
-  Value *Val;
-  if (!detectIdiom(CntInst, CntPhi, Val))
-    return false;
+  if (SE->hasLoopInvariantBackedgeTakenCount(L))
+    return runOnCountableLoop();
 
-  transform(CntInst, CntPhi, Val);
-  return true;
+  return runOnNoncountableLoop();
 }
 
-//===----------------------------------------------------------------------===//
-//
-// Implementation of LoopIdiomRecognize
-//
-//===----------------------------------------------------------------------===//
-
 bool LoopIdiomRecognize::runOnCountableLoop() {
   const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
   assert(!isa<SCEVCouldNotCompute>(BECount) &&
-        "runOnCountableLoop() called on a loop without a predictable"
-        "backedge-taken count");
+         "runOnCountableLoop() called on a loop without a predictable "
+         "backedge-taken count");
 
   // If this loop executes exactly one time, then it should be peeled, not
   // optimized by this pass.
   if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
-    if (BECst->getValue()->getValue() == 0)
+    if (BECst->getAPInt() == 0)
       return false;
 
-  // set DT
-  (void)getDominatorTree();
-
-  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-
-  // set TLI
-  (void)getTargetLibraryInfo();
-
-  SmallVector<BasicBlock*, 8> ExitBlocks;
+  SmallVector<BasicBlock *, 8> ExitBlocks;
   CurLoop->getUniqueExitBlocks(ExitBlocks);
 
   DEBUG(dbgs() << "loop-idiom Scanning: F["
-               << CurLoop->getHeader()->getParent()->getName()
-               << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
+               << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
+               << CurLoop->getHeader()->getName() << "\n");
 
   bool MadeChange = false;
   // Scan all the blocks in the loop that are not in subloops.
   for (auto *BB : CurLoop->getBlocks()) {
     // Ignore blocks in subloops.
- if (LI.getLoopFor(BB) != CurLoop) + if (LI->getLoopFor(BB) != CurLoop) continue; MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); @@ -649,41 +244,109 @@ bool LoopIdiomRecognize::runOnCountableLoop() { return MadeChange; } -bool LoopIdiomRecognize::runOnNoncountableLoop() { - NclPopcountRecognize Popcount(*this); - if (Popcount.recognize()) - return true; +static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { + uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); + assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && + "Don't overflow unsigned."); + return (unsigned)SizeInBits >> 3; +} - return false; +static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { + const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); + return ConstStride->getAPInt().getZExtValue(); } -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in. Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { + // If the value isn't a constant, we can't promote it to being in a constant + // array. We could theoretically do a store to an alloca or something, but + // that doesn't seem worthwhile. + Constant *C = dyn_cast<Constant>(V); + if (!C) + return nullptr; + + // Only handle simple values that are a power of two bytes in size. + uint64_t Size = DL->getTypeSizeInBits(V->getType()); + if (Size == 0 || (Size & 7) || (Size & (Size - 1))) + return nullptr; + + // Don't care enough about darwin/ppc to implement this. + if (DL->isBigEndian()) + return nullptr; + + // Convert to size in bytes. + Size /= 8; + + // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see + // if the top and bottom are the same (e.g. for vectors and large integers). + if (Size > 16) + return nullptr; + + // If the constant is exactly 16 bytes, just use it. + if (Size == 16) + return C; + + // Otherwise, we'll use an array of the constants. + unsigned ArraySize = 16 / Size; + ArrayType *AT = ArrayType::get(V->getType(), ArraySize); + return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); +} + +bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) { + // Don't touch volatile stores. + if (!SI->isSimple()) return false; - CurLoop = L; + Value *StoredVal = SI->getValueOperand(); + Value *StorePtr = SI->getPointerOperand(); - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!L->getLoopPreheader()) + // Reject stores that are so large that they overflow an unsigned. + uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); + if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) return false; - // Disable loop idiom recognition if the function's name is a common idiom. - StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. 
+ const SCEVAddRecExpr *StoreEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) return false; - SE = &getAnalysis<ScalarEvolution>(); - if (SE->hasLoopInvariantBackedgeTakenCount(L)) - return runOnCountableLoop(); - return runOnNoncountableLoop(); + // Check to see if we have a constant stride. + if (!isa<SCEVConstant>(StoreEv->getOperand(1))) + return false; + + return true; +} + +void LoopIdiomRecognize::collectStores(BasicBlock *BB) { + StoreRefs.clear(); + for (Instruction &I : *BB) { + StoreInst *SI = dyn_cast<StoreInst>(&I); + if (!SI) + continue; + + // Make sure this is a strided store with a constant stride. + if (!isLegalStore(SI)) + continue; + + // Save the store locations. + StoreRefs.push_back(SI); + } } /// runOnLoopBlock - Process the specified block, which lives in a counted loop /// with the specified backedge count. This block is known to be in the current /// loop and not in any subloops. -bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks) { +bool LoopIdiomRecognize::runOnLoopBlock( + BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks) { // We can only promote stores in this block if they are unconditionally // executed in the loop. For a block to be unconditionally executed, it has // to dominate all the exit blocks of the loop. Verify this now. @@ -692,25 +355,18 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return false; bool MadeChange = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; - // Look for store instructions, which may be optimized to memset/memcpy. - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopStore(SI, BECount)) continue; - MadeChange = true; - - // If processing the store invalidated our iterator, start over from the - // top of the block. - if (!InstPtr) - I = BB->begin(); - continue; - } + // Look for store instructions, which may be optimized to memset/memcpy. + collectStores(BB); + for (auto &SI : StoreRefs) + MadeChange |= processLoopStore(SI, BECount); + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *Inst = &*I++; // Look for memset instructions, which may be optimized to a larger memset. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopMemSet(MSI, BECount)) continue; + if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { + WeakVH InstPtr(&*I); + if (!processLoopMemSet(MSI, BECount)) + continue; MadeChange = true; // If processing the memset invalidated our iterator, start over from the @@ -724,71 +380,38 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return MadeChange; } - /// processLoopStore - See if this store can be promoted to a memset or memcpy. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (!SI->isSimple()) return false; + assert(SI->isSimple() && "Expected only non-volatile stores."); Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); - // Reject stores that are so large that they overflow an unsigned. 
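// Aside (illustrative): the stride test introduced below. For an i32 store,
// getStoreSizeInBytes() yields 4; a {base,+,4} AddRec walks forward while
// {base,+,-4} walks backward. In unsigned arithmetic the backward case
// satisfies StoreSize == -Stride, which is what sets NegStride:
static bool strideMatchesStore(unsigned StoreSize, unsigned Stride) {
  // e.g. StoreSize == 4: Stride == 4 -> forward, Stride == (unsigned)-4 -> backward
  return StoreSize == Stride || StoreSize == -Stride;
}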
-  auto &DL = CurLoop->getHeader()->getModule()->getDataLayout();
-  uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType());
-  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
-    return false;
-
-  // See if the pointer expression is an AddRec like {base,+,1} on the current
-  // loop, which indicates a strided store. If we have something else, it's a
-  // random store we can't handle.
-  const SCEVAddRecExpr *StoreEv =
-      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
-  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
-    return false;
-
   // Check to see if the stride matches the size of the store. If so, then we
   // know that every byte is touched in the loop.
-  unsigned StoreSize = (unsigned)SizeInBits >> 3;
-  const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
-
-  if (!Stride || StoreSize != Stride->getValue()->getValue()) {
-    // TODO: Could also handle negative stride here someday, that will require
-    // the validity check in mayLoopAccessLocation to be updated though.
-    // Enable this to print exact negative strides.
-    if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) {
-      dbgs() << "NEGATIVE STRIDE: " << *SI << "\n";
-      dbgs() << "BB: " << *SI->getParent();
-    }
-
+  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  unsigned Stride = getStoreStride(StoreEv);
+  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  if (StoreSize != Stride && StoreSize != -Stride)
     return false;
-  }
+
+  bool NegStride = StoreSize == -Stride;
 
   // See if we can optimize just this store in isolation.
   if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
-                              StoredVal, SI, StoreEv, BECount))
+                              StoredVal, SI, StoreEv, BECount, NegStride))
     return true;
 
-  // If the stored value is a strided load in the same loop with the same stride
-  // this this may be transformable into a memcpy. This kicks in for stuff like
-  // for (i) A[i] = B[i];
-  if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
-    const SCEVAddRecExpr *LoadEv =
-        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0)));
-    if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() &&
-        StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple())
-      if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount))
-        return true;
-  }
-  //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n";
-
-  return false;
+  // Optimize the store into a memcpy, if it feeds a similarly strided load.
+  return processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, BECount, NegStride);
 }
 
 /// processLoopMemSet - See if this memset can be promoted to a large memset.
-bool LoopIdiomRecognize::
-processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) {
+bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
+                                           const SCEV *BECount) {
   // We can only handle non-volatile memsets with a constant size.
-  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false;
+  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+    return false;
 
   // If we're not allowed to hack on memset, we fail.
if (!TLI->has(LibFunc::memset)) @@ -818,17 +441,16 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { return false; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), MSI->getValue(), - MSI, Ev, BECount); + MSI->getAlignment(), MSI->getValue(), MSI, Ev, + BECount, /*NegStride=*/false); } - /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, - Loop *L, const SCEV *BECount, - unsigned StoreSize, AliasAnalysis &AA, +static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, Instruction *IgnoredStore) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts @@ -838,7 +460,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, // If the loop iterates a fixed number of times, we can refine the access size // to be exactly the size of the memset, which is (BECount+1)*StoreSize if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) - AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; + AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize; // TODO: For this to be really effective, we have to dive into the pointer // operand in the store. Store to &A[i] of 100 will always return may alias @@ -849,59 +471,31 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && - (AA.getModRefInfo(I, StoreLoc) & Access)) + if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) return true; return false; } -/// getMemSetPatternValue - If a strided store of the specified value is safe to -/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should -/// be passed in. Otherwise, return null. -/// -/// Note that we don't ever attempt to use memset_pattern8 or 4, because these -/// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { - // If the value isn't a constant, we can't promote it to being in a constant - // array. We could theoretically do a store to an alloca or something, but - // that doesn't seem worthwhile. - Constant *C = dyn_cast<Constant>(V); - if (!C) return nullptr; - - // Only handle simple values that are a power of two bytes in size. - uint64_t Size = DL.getTypeSizeInBits(V->getType()); - if (Size == 0 || (Size & 7) || (Size & (Size-1))) - return nullptr; - - // Don't care enough about darwin/ppc to implement this. - if (DL.isBigEndian()) - return nullptr; - - // Convert to size in bytes. - Size /= 8; - - // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see - // if the top and bottom are the same (e.g. for vectors and large integers). - if (Size > 16) return nullptr; - - // If the constant is exactly 16 bytes, just use it. - if (Size == 16) return C; - - // Otherwise, we'll use an array of the constants. 
- unsigned ArraySize = 16/Size; - ArrayType *AT = ArrayType::get(V->getType(), ArraySize); - return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); +// If we have a negative stride, Start refers to the end of the memory location +// we're trying to memset. Therefore, we need to recompute the base pointer, +// which is just Start - BECount*Size. +static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, + Type *IntPtr, unsigned StoreSize, + ScalarEvolution *SE) { + const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); + if (StoreSize != 1) + Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), + SCEV::FlagNUW); + return SE->getMinusSCEV(Start, Index); } - /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. -bool LoopIdiomRecognize:: -processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount) { +bool LoopIdiomRecognize::processLoopStridedStore( + Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride) { // If the stored value is a byte-wise value (like i32 -1), then it may be // turned into a memset of i8 -1, assuming that all the consecutive bytes @@ -909,7 +503,6 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // but it can be turned into memset_pattern if the target supports it. Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; - auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); // If we're allowed to form a memset, and the stored value would be acceptable @@ -936,9 +529,15 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // header. This allows us to insert code for it in the preheader. BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + + const SCEV *Start = Ev->getStart(); + // Handle negative strided loops. + if (NegStride) + Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this @@ -946,12 +545,9 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, - Preheader->getTerminator()); - - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { + Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); + if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, + *AA, TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -962,36 +558,30 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. 
Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW); if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); } Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = Builder.CreateMemSet(BasePtr, - SplatValue, - NumBytes, - StoreAlignment); + NewCall = + Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getParent()->getParent()->getParent(); - Value *MSP = M->getOrInsertFunction("memset_pattern16", - Builder.getVoidTy(), - Int8PtrTy, - Int8PtrTy, - IntPtr, - (void*)nullptr); + Module *M = TheStore->getModule(); + Value *MSP = + M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), + Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. @@ -1015,26 +605,47 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, return true; } -/// processLoopStoreOfLoopLoad - We see a strided store whose value is a -/// same-strided load. -bool LoopIdiomRecognize:: -processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount) { +/// If the stored value is a strided load in the same loop with the same stride +/// this may be transformable into a memcpy. This kicks in for stuff like +/// for (i) A[i] = B[i]; +bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( + StoreInst *SI, unsigned StoreSize, const SCEVAddRecExpr *StoreEv, + const SCEV *BECount, bool NegStride) { // If we're not allowed to form memcpy, we fail. if (!TLI->has(LibFunc::memcpy)) return false; - LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); + // The store must be feeding a non-volatile load. + LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); + if (!LI || !LI->isSimple()) + return false; + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided load. If we have something else, it's a + // random load we can't handle. + const SCEVAddRecExpr *LoadEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); + if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) + return false; + + // The store and load must share the same stride. + if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) + return false; // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. 
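// Aside (illustrative): the base-pointer adjustment that getStartForNegStride
// performs. A loop storing StoreSize-byte elements downward from Start writes
// its lowest address at Start - BECount * StoreSize; e.g. with BECount = 99
// and 4-byte stores, 396 bytes below Start. A scalar sketch:
static uint64_t lowestAddressForNegStride(uint64_t Start, uint64_t BECount,
                                          unsigned StoreSize) {
  return Start - BECount * StoreSize; // cf. SE->getMinusSCEV(Start, Index)
}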
BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - const DataLayout &DL = Preheader->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); + + const SCEV *StrStart = StoreEv->getStart(); + unsigned StrAS = SI->getPointerAddressSpace(); + Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + + // Handle negative strided loops. + if (NegStride) + StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1042,29 +653,31 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // or write the memory region we're storing to. This includes the load that // feeds the stores. Check for an alias by generating the base address and // checking everything. - Value *StoreBasePtr = - Expander.expandCodeFor(StoreEv->getStart(), - Builder.getInt8PtrTy(SI->getPointerAddressSpace()), - Preheader->getTerminator()); - - if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, StoreSize, - getAnalysis<AliasAnalysis>(), SI)) { + Value *StoreBasePtr = Expander.expandCodeFor( + StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + + if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, + StoreSize, *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } + const SCEV *LdStart = LoadEv->getStart(); + unsigned LdAS = LI->getPointerAddressSpace(); + + // Handle negative strided loops. + if (NegStride) + LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. - Value *LoadBasePtr = - Expander.expandCodeFor(LoadEv->getStart(), - Builder.getInt8PtrTy(LI->getPointerAddressSpace()), - Preheader->getTerminator()); + Value *LoadBasePtr = Expander.expandCodeFor( + LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); - if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), SI)) { + if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, + *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -1074,34 +687,368 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, everything is safe, we can transform this! - // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
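// Aside (illustrative): the size expression built below for both memset and
// memcpy. A backedge-taken count of BECount means BECount + 1 iterations, so
// a loop of 4-byte stores with BECount = 99 touches (99 + 1) * 4 = 400 bytes.
// A scalar sketch of the SCEV expression:
static uint64_t bytesStored(uint64_t BECount, unsigned StoreSize) {
  return (BECount + 1) * StoreSize; // expanded with NUW flags via SCEV
}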
- Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); if (StoreSize != 1) NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = - Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, - std::min(SI->getAlignment(), LI->getAlignment())); + Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, + std::min(SI->getAlignment(), LI->getAlignment())); NewCall->setDebugLoc(SI->getDebugLoc()); DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - - // Okay, the memset has been formed. Zap the original store and anything that + // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. deleteDeadInstruction(SI, TLI); ++NumMemCpy; return true; } + +bool LoopIdiomRecognize::runOnNoncountableLoop() { + return recognizePopcount(); +} + +/// Check if the given conditional branch is based on the comparison between +/// a variable and zero, and if the variable is non-zero, the control yields to +/// the loop entry. If the branch matches the behavior, the variable involved +/// in the comparion is returned. This function will be called to see if the +/// precondition and postcondition of the loop are in desirable form. +static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { + if (!BI || !BI->isConditional()) + return nullptr; + + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return nullptr; + + ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); + if (!CmpZero || !CmpZero->isZero()) + return nullptr; + + ICmpInst::Predicate Pred = Cond->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) || + (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry)) + return Cond->getOperand(0); + + return nullptr; +} + +/// Return true iff the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction counting the population bit. +/// 2) \p CntPhi is set to the corresponding phi node. +/// 3) \p Var is set to the value whose population bits are being counted. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 != 0) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val; +/// do { +/// x1 = phi (x0, x2); +/// cnt1 = phi(cnt0, cnt2); +/// +/// cnt2 = cnt1 + 1; +/// ... +/// x2 = x1 & (x1 - 1); +/// ... +/// } while(x != 0); +/// +/// loop-exit: +/// \endcode +static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, + Instruction *&CntInst, PHINode *&CntPhi, + Value *&Var) { + // step 1: Check to see if the look-back branch match this pattern: + // "if (a!=0) goto loop-entry". + BasicBlock *LoopEntry; + Instruction *DefX2, *CountInst; + Value *VarX1, *VarX0; + PHINode *PhiX, *CountPhi; + + DefX2 = CountInst = nullptr; + VarX1 = VarX0 = nullptr; + PhiX = CountPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form. 
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  {
+    if (Value *T = matchCondition(
+            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+      DefX2 = dyn_cast<Instruction>(T);
+    else
+      return false;
+  }
+
+  // step 2: Detect instructions corresponding to "x2 = x1 & (x1 - 1)".
+  {
+    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+      return false;
+
+    BinaryOperator *SubOneOp;
+
+    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+      VarX1 = DefX2->getOperand(1);
+    else {
+      VarX1 = DefX2->getOperand(0);
+      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+    }
+    if (!SubOneOp)
+      return false;
+
+    Instruction *SubInst = cast<Instruction>(SubOneOp);
+    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+    if (!Dec ||
+        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+          (SubInst->getOpcode() == Instruction::Add &&
+           Dec->isAllOnesValue()))) {
+      return false;
+    }
+  }
+
+  // step 3: Check the recurrence of variable X.
+  {
+    PhiX = dyn_cast<PHINode>(VarX1);
+    if (!PhiX ||
+        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+      return false;
+    }
+  }
+
+  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1.
+  {
+    CountInst = nullptr;
+    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                              IterE = LoopEntry->end();
+         Iter != IterE; Iter++) {
+      Instruction *Inst = &*Iter;
+      if (Inst->getOpcode() != Instruction::Add)
+        continue;
+
+      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+      if (!Inc || !Inc->isOne())
+        continue;
+
+      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+      if (!Phi || Phi->getParent() != LoopEntry)
+        continue;
+
+      // Check if the result of the instruction is live out of the loop.
+      bool LiveOutLoop = false;
+      for (User *U : Inst->users()) {
+        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
+          LiveOutLoop = true;
+          break;
+        }
+      }
+
+      if (LiveOutLoop) {
+        CountInst = Inst;
+        CountPhi = Phi;
+        break;
+      }
+    }
+
+    if (!CountInst)
+      return false;
+  }
+
+  // step 5: Check if the precondition is in this form:
+  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+  {
+    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
+    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+      return false;
+
+    CntInst = CountInst;
+    CntPhi = CountPhi;
+    Var = T;
+  }
+
+  return true;
+}
+
+/// Recognizes a population count idiom in a non-countable loop.
+///
+/// If detected, transforms the relevant code to issue the popcount intrinsic
+/// function call, and returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizePopcount() {
+  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+    return false;
+
+  // Counting population is usually conducted by a few arithmetic
+  // instructions. Such instructions can be easily "absorbed" by vacant slots
+  // in a non-compact loop. Therefore, recognizing the popcount idiom only
+  // makes sense in a compact loop.
+
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  BasicBlock *LoopBody = *(CurLoop->block_begin());
+  if (LoopBody->size() >= 20) {
+    // The loop is too big, bail out.
+    return false;
+  }
+
+  // It should have a preheader containing nothing but an unconditional branch.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  if (!PH)
+    return false;
+  if (&PH->front() != PH->getTerminator())
+    return false;
+  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
+  if (!EntryBI || EntryBI->isConditional())
+    return false;
+
+  // It should have a precondition block where the generated popcount intrinsic
+  // function can be inserted.
+  auto *PreCondBB = PH->getSinglePredecessor();
+  if (!PreCondBB)
+    return false;
+  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+  if (!PreCondBI || PreCondBI->isUnconditional())
+    return false;
+
+  Instruction *CntInst;
+  PHINode *CntPhi;
+  Value *Val;
+  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
+    return false;
+
+  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
+  return true;
+}
+
+static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+                                       DebugLoc DL) {
+  Value *Ops[] = {Val};
+  Type *Tys[] = {Val->getType()};
+
+  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CI->setDebugLoc(DL);
+
+  return CI;
+}
+
+void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
+                                                 Instruction *CntInst,
+                                                 PHINode *CntPhi, Value *Var) {
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+  const DebugLoc DL = CntInst->getDebugLoc();
+
+  // Assume that before the transformation the loop looks like the following:
+  //    if (x) // the precondition
+  //      do { cnt++; x &= x - 1; } while(x);
+
+  // Step 1: Insert the ctpop instruction at the end of the precondition block.
+  IRBuilder<> Builder(PreCondBr);
+  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+  {
+    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+    NewCount = PopCntZext =
+        Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+    if (NewCount != PopCnt)
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+    // TripCnt is exactly the number of iterations the loop has.
+    TripCnt = NewCount;
+
+    // If the population counter's initial value is not zero, insert an Add
+    // instruction.
+    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+    if (!InitConst || !InitConst->isZero()) {
+      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+    }
+  }
+
+  // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
+  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
+  // function would be partially dead code, and downstream passes will drag
+  // it back from the precondition block to the preheader.
+  {
+    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+    Value *Opnd0 = PopCntZext;
+    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+    if (PreCond->getOperand(0) != Var)
+      std::swap(Opnd0, Opnd1);
+
+    ICmpInst *NewPreCond = cast<ICmpInst>(
+        Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+    PreCondBr->setCondition(NewPreCond);
+
+    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
+  }
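
Conceptually, Steps 1 and 2 leave the precondition region looking like this
(a C-level sketch; the names are illustrative, and the counter's initial
value is assumed to be zero):

    unsigned pc = __builtin_popcount(x); // Step 1: ctpop in the precondition block
    if (pc != 0) {                       // Step 2: the branch now tests the popcount
      /* the original do-while loop, made countable in Step 3 */
    }
    /* uses of the counter after the loop read pc instead (Step 4) */
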
+
+  // Step 3: Note that the population count is exactly the trip count of the
+  // loop in question, which enables us to convert the loop from a noncountable
+  // loop into a countable one. The benefit is twofold:
+  //
+  //  - If the loop only counts population, the entire loop becomes dead after
+  //    the transformation. It is a lot easier to prove a countable loop dead
+  //    than to prove a noncountable one. (In some C dialects, an infinite loop
+  //    isn't dead even if it computes nothing useful. In general, DCE needs
+  //    to prove a noncountable loop finite before safely deleting it.)
+  //
+  //  - If the loop also performs something else, it remains alive.
+  //    Since it is transformed to countable form, it can be aggressively
+  //    optimized by some optimizations which are in general not applicable
+  //    to a noncountable loop.
+  //
+  // After this step, this loop (conceptually) would look like the following:
+  //    newcnt = __builtin_ctpop(x);
+  //    t = newcnt;
+  //    if (x)
+  //      do { cnt++; x &= x-1; t--; } while (t > 0);
+  BasicBlock *Body = *(CurLoop->block_begin());
+  {
+    auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+    Type *Ty = TripCnt->getType();
+
+    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+    Builder.SetInsertPoint(LbCond);
+    Instruction *TcDec = cast<Instruction>(
+        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+                          "tcdec", false, true));
+
+    TcPhi->addIncoming(TripCnt, PreHead);
+    TcPhi->addIncoming(TcDec, Body);
+
+    CmpInst::Predicate Pred =
+        (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+    LbCond->setPredicate(Pred);
+    LbCond->setOperand(0, TcDec);
+    LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+  }
+
+  // Step 4: All the references to the original population counter outside
+  // the loop are replaced with the NewCount -- the value returned from
+  // __builtin_ctpop().
+  CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
+  // loop. The loop would otherwise not be deleted even if it becomes empty.
+  SE->forgetLoop(CurLoop);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index e12502654751..b4102fe9ba34 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -48,7 +48,7 @@ namespace {
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addPreservedID(LCSSAID);
-      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
       AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
   };
@@ -112,7 +112,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {

       // Simplify instructions in the current basic block.
       for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
-        Instruction *I = BI++;
+        Instruction *I = &*BI++;

         // The first time through the loop ToSimplify is empty and we try to
         // simplify all instructions. On later iterations ToSimplify is not
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 9d7e57ffebac..4295235a3f36 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -99,7 +99,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
         return false;
       if (St && !St->isSimple())
         return false;
-      MemInstr.push_back(I);
+      MemInstr.push_back(&*I);
     }
   }

@@ -176,7 +176,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
     }
   }

-  // We don't have a DepMatrix to check legality return false
+  // We don't have a DepMatrix to check legality; return false.
if (DepMatrix.size() == 0) return false; return true; @@ -331,9 +331,9 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - LoopInterchange *Pass) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass), - InnerLoopHasReduction(false) {} + LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), + PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -357,9 +357,10 @@ private: Loop *OuterLoop; Loop *InnerLoop; - /// Scev analysis. ScalarEvolution *SE; - LoopInterchange *CurrentPass; + LoopInfo *LI; + DominatorTree *DT; + bool PreserveLCSSA; bool InnerLoopHasReduction; }; @@ -371,7 +372,7 @@ public: LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} - /// Check if the loop interchange is profitable + /// Check if the loop interchange is profitable. bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); @@ -385,12 +386,12 @@ private: ScalarEvolution *SE; }; -/// LoopInterchangeTransform interchanges the loop +/// LoopInterchangeTransform interchanges the loop. class LoopInterchangeTransform { public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, - LoopInterchange *Pass, BasicBlock *LoopNestExit, + BasicBlock *LoopNestExit, bool InnerLoopContainsReductions) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LoopExit(LoopNestExit), @@ -424,21 +425,22 @@ private: bool InnerLoopHasReduction; }; -// Main LoopInterchange Pass +// Main LoopInterchange Pass. struct LoopInterchange : public FunctionPass { static char ID; ScalarEvolution *SE; LoopInfo *LI; DependenceAnalysis *DA; DominatorTree *DT; + bool PreserveLCSSA; LoopInterchange() : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DependenceAnalysis>(); @@ -447,11 +449,13 @@ struct LoopInterchange : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<DependenceAnalysis>(); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + // Build up a worklist of loop pairs to analyze. SmallVector<LoopVector, 8> Worklist; @@ -489,7 +493,7 @@ struct LoopInterchange : public FunctionPass { unsigned selectLoopForInterchange(LoopVector LoopList) { // TODO: Add a better heuristic to select the loop to be interchanged based - // on the dependece matrix. Currently we select the innermost loop. + // on the dependence matrix. Currently we select the innermost loop. 
return LoopList.size() - 1; } @@ -544,7 +548,7 @@ struct LoopInterchange : public FunctionPass { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best posible position. + // Move the selected loop outwards to the best possible position. for (unsigned i = SelecLoopId; i > 0; i--) { bool Interchanged = processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); @@ -574,7 +578,8 @@ struct LoopInterchange : public FunctionPass { Loop *InnerLoop = LoopList[InnerLoopId]; Loop *OuterLoop = LoopList[OuterLoopId]; - LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this); + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, + PreserveLCSSA); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); return false; @@ -586,7 +591,7 @@ struct LoopInterchange : public FunctionPass { return false; } - LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this, + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); DEBUG(dbgs() << "Loops interchanged\n"); @@ -655,7 +660,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n"); // We do not have any basic block in between now make sure the outer header - // and outer loop latch doesnt contain any unsafe instructions. + // and outer loop latch doesn't contain any unsafe instructions. if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || containsUnsafeInstructionsInLatch(OuterLoopLatch)) return false; @@ -698,9 +703,9 @@ bool LoopInterchangeLegality::findInductionAndReductions( return false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { RecurrenceDescriptor RD; + InductionDescriptor ID; PHINode *PHI = cast<PHINode>(I); - ConstantInt *StepValue = nullptr; - if (isInductionPHI(PHI, SE, StepValue)) + if (InductionDescriptor::isInductionPHI(PHI, SE, ID)) Inductions.push_back(PHI); else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) Reductions.push_back(PHI); @@ -836,7 +841,7 @@ bool LoopInterchangeLegality::currentLimitations() { else FoundInduction = true; } - // The loop latch ended and we didnt find the induction variable return as + // The loop latch ended and we didn't find the induction variable return as // current limitation. if (!FoundInduction) return true; @@ -867,12 +872,14 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || isa<PHINode>(OuterLoopPreHeader->begin()) || !OuterLoopPreHeader->getUniquePredecessor()) { - OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass); + OuterLoopPreHeader = + InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); } if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || InnerLoopPreHeader == OuterLoop->getHeader()) { - InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass); + InnerLoopPreHeader = + InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); } // TODO: The loops could not be interchanged due to current limitations in the @@ -966,7 +973,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // TODO: Add Better Profitibility checks. + // TODO: Add better profitability checks. 
   // e.g.
   // 1) Construct dependency matrix and move the one with no loop carried dep
   //    inside to enable vectorization.

@@ -980,7 +987,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
   if (Cost < 0)
     return true;

-  // It is not profitable as per current cache profitibility model. But check if
+  // It is not profitable as per the current cache profitability model. But check if
   // we can move this loop outside to improve parallelism.
   bool ImprovesPar =
       isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
@@ -996,7 +1003,7 @@ void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
       return;
     }
   }
-  assert(false && "Couldn't find loop");
+  llvm_unreachable("Couldn't find loop");
 }

 void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
@@ -1045,7 +1052,7 @@ bool LoopInterchangeTransform::transform() {
     splitInnerLoopLatch(InnerIndexVar);
     DEBUG(dbgs() << "splitInnerLoopLatch Done\n");

-    // Splits the inner loops phi nodes out into a seperate basic block.
+    // Splits the inner loop's PHI nodes out into a separate basic block.
     splitInnerLoopHeader();
     DEBUG(dbgs() << "splitInnerLoopHeader Done\n");
   }
@@ -1113,8 +1120,8 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
   auto &ToList = InsertBefore->getParent()->getInstList();
   auto &FromList = FromBB->getInstList();

-  ToList.splice(InsertBefore, FromList, FromList.begin(),
-                FromBB->getTerminator());
+  ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
+                FromBB->getTerminator()->getIterator());
 }

 void LoopInterchangeTransform::adjustOuterLoopPreheader() {
@@ -1181,8 +1188,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
     return false;

-  BasicBlock *InnerLoopHeaderSucessor = InnerLoopHeader->getUniqueSuccessor();
-  if (!InnerLoopHeaderSucessor)
+  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+  if (!InnerLoopHeaderSuccessor)
     return false;

   // Adjust Loop Preheader and headers
@@ -1198,11 +1205,11 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
     if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
       OuterLoopHeaderBI->setSuccessor(i, LoopExit);
     else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
-      OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSucessor);
+      OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
   }
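
The cache-reuse win that motivates this pass, in its classic form (an
illustrative C sketch): moving the stride-1 dimension into the innermost loop.

    // Before: the inner loop strides across rows (cache-hostile).
    for (int i = 0; i < N; ++i)
      for (int j = 0; j < M; ++j)
        A[j][i] += B[j][i];

    // After interchange: the inner loop walks contiguous memory.
    for (int j = 0; j < M; ++j)
      for (int i = 0; i < N; ++i)
        A[j][i] += B[j][i];
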
   // Adjust reduction PHI's now that the incoming block has changed.
-  updateIncomingBlock(InnerLoopHeaderSucessor, InnerLoopHeader,
+  updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
                       OuterLoopHeader);

   BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
@@ -1286,10 +1293,10 @@ bool LoopInterchangeTransform::adjustLoopLinks() {

 char LoopInterchange::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
                       "Interchanges loops for cache reuse", false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
new file mode 100644
index 000000000000..1064d088514d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -0,0 +1,566 @@
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads. These form the candidates for the
+// transformation. The source value of each store is then propagated to the
+// users of the corresponding load. This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <forward_list>
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+using namespace llvm;
+
+static cl::opt<unsigned> CheckPerElim(
+    "runtime-check-per-loop-load-elim", cl::Hidden,
+    cl::desc("Max number of memchecks allowed per eliminated load on average"),
+    cl::init(1));
+
+static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
+    "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed for Loop "
+             "Load Elimination"));
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// \brief Represent a store-to-load forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+  LoadInst *Load;
+  StoreInst *Store;
+
+  StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+      : Load(Load), Store(Store) {}
+
+  /// \brief Return true if the dependence from the store to the load has a
+  /// distance of one. E.g. A[i+1] = A[i]
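
The loop shape being targeted and the rewrite it enables, sketched in C
(names are illustrative; the IR-level form appears in
propagateStoredValueToLoadUsers further down):

    // Before: each iteration loads the value stored one iteration earlier.
    for (unsigned i = 0; i < n; ++i)
      A[i + 1] = A[i] + B[i];

    // After (conceptually): the loaded value is carried in a scalar.
    int t = A[0];                  // initial load, hoisted to the preheader
    for (unsigned i = 0; i < n; ++i) {
      t = t + B[i];                // PHI-carried forwarded value
      A[i + 1] = t;
    }
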
+  bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const {
+    Value *LoadPtr = Load->getPointerOperand();
+    Value *StorePtr = Store->getPointerOperand();
+    Type *LoadPtrType = LoadPtr->getType();
+    Type *LoadType = LoadPtrType->getPointerElementType();
+
+    assert(LoadPtrType->getPointerAddressSpace() ==
+               StorePtr->getType()->getPointerAddressSpace() &&
+           LoadType == StorePtr->getType()->getPointerElementType() &&
+           "Should be a known dependence");
+
+    auto &DL = Load->getParent()->getModule()->getDataLayout();
+    unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+    auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
+    auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
+
+    // We don't need to check non-wrapping here because forward/backward
+    // dependence wouldn't be valid if these weren't monotonic accesses.
+    auto *Dist = cast<SCEVConstant>(
+        PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+    const APInt &Val = Dist->getAPInt();
+    return Val.abs() == TypeByteSize;
+  }
+
+  Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const StoreToLoadForwardingCandidate &Cand) {
+    OS << *Cand.Store << " -->\n";
+    OS.indent(2) << *Cand.Load << "\n";
+    return OS;
+  }
+#endif
+};
+
+/// \brief Check if the store dominates all latches, so that, as long as there
+/// is no intervening store, this value will be loaded in the next iteration.
+bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+                                  DominatorTree *DT) {
+  SmallVector<BasicBlock *, 8> Latches;
+  L->getLoopLatches(Latches);
+  return std::all_of(Latches.begin(), Latches.end(),
+                     [&](const BasicBlock *Latch) {
+                       return DT->dominates(StoreBlock, Latch);
+                     });
+}
+
+/// \brief The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+  LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+                         DominatorTree *DT)
+      : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {}
+
+  /// \brief Look through the loop-carried and loop-independent dependences in
+  /// this loop and find store->load dependences.
+  ///
+  /// Note that no candidate is returned if LAA has failed to analyze the loop
+  /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+  std::forward_list<StoreToLoadForwardingCandidate>
+  findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+    std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+    const auto *Deps = LAI.getDepChecker().getDependences();
+    if (!Deps)
+      return Candidates;
+
+    // Find store->load dependences (consequently, true dependences). Both
+    // lexically forward and backward dependences qualify. Disqualify loads
+    // that have other unknown dependences.
+
+    SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+    for (const auto &Dep : *Deps) {
+      Instruction *Source = Dep.getSource(LAI);
+      Instruction *Destination = Dep.getDestination(LAI);
+
+      if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+        if (isa<LoadInst>(Source))
+          LoadsWithUnknownDepedence.insert(Source);
+        if (isa<LoadInst>(Destination))
+          LoadsWithUnknownDepedence.insert(Destination);
+        continue;
+      }
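
For reference, the lexically backward case that the swap below normalizes
(an illustrative C sketch): when the load appears before the store in the
body, LAA's program-order source is the load, even though the store is the
one that feeds the next iteration.

    for (unsigned i = 0; i < n; ++i) {
      int v = A[i];      // program-order first: LAA's "source"
      A[i + 1] = v + 1;  // backward dependence into the next iteration's load
    }
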
+
+      if (Dep.isBackward())
+        // Note that the designations source and destination follow the program
+        // order, i.e. source is always first. (The direction is given by the
+        // DepType.)
+        std::swap(Source, Destination);
+      else
+        assert(Dep.isForward() && "Needs to be a forward dependence");
+
+      auto *Store = dyn_cast<StoreInst>(Source);
+      if (!Store)
+        continue;
+      auto *Load = dyn_cast<LoadInst>(Destination);
+      if (!Load)
+        continue;
+      Candidates.emplace_front(Load, Store);
+    }
+
+    if (!LoadsWithUnknownDepedence.empty())
+      Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+        return LoadsWithUnknownDepedence.count(C.Load);
+      });
+
+    return Candidates;
+  }
+
+  /// \brief Return the index of the instruction according to program order.
+  unsigned getInstrIndex(Instruction *Inst) {
+    auto I = InstOrder.find(Inst);
+    assert(I != InstOrder.end() && "No index for instruction");
+    return I->second;
+  }
+
+  /// \brief If a load has multiple candidates associated (i.e. different
+  /// stores), it means that it could be forwarding from multiple stores
+  /// depending on control flow. Remove these candidates.
+  ///
+  /// Here, we rely on LAA to include the relevant loop-independent dependences.
+  /// LAA is known to omit these in the very simple case when the read and the
+  /// write within an alias set always take place using the *same* pointer.
+  ///
+  /// However, we know that this is not the case here, i.e. we can rely on LAA
+  /// to provide us with loop-independent dependences for the cases we're
+  /// interested in. Consider for example the case where a loop-independent
+  /// dependence S1->S2 invalidates the forwarding S3->S2.
+  ///
+  ///     A[i]   = ...   (S1)
+  ///     ...    = A[i]  (S2)
+  ///     A[i+1] = ...   (S3)
+  ///
+  /// LAA will perform dependence analysis here because there are two
+  /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+  void removeDependencesFromMultipleStores(
+      std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+    // If Store is nullptr it means that we have multiple stores forwarding to
+    // this load.
+    typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>
+        LoadToSingleCandT;
+    LoadToSingleCandT LoadToSingleCand;
+
+    for (const auto &Cand : Candidates) {
+      bool NewElt;
+      LoadToSingleCandT::iterator Iter;
+
+      std::tie(Iter, NewElt) =
+          LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+      if (!NewElt) {
+        const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+        // Already multiple stores forward to this load.
+        if (OtherCand == nullptr)
+          continue;
+
+        // Handle the very basic case where the two stores are in the same
+        // block so deciding which one forwards is easy. The later one forwards
+        // as long as they both have a dependence distance of one to the load.
+        if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+            Cand.isDependenceDistanceOfOne(PSE) &&
+            OtherCand->isDependenceDistanceOfOne(PSE)) {
+          // They are in the same block, the later one will forward to the load.
+          if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+            OtherCand = &Cand;
+        } else
+          OtherCand = nullptr;
+      }
+    }
+
+    Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+      if (LoadToSingleCand[Cand.Load] != &Cand) {
+        DEBUG(dbgs() << "Removing from candidates: \n" << Cand
+                     << "  The load may have multiple stores forwarding to "
+                     << "it\n");
+        return true;
+      }
+      return false;
+    });
+  }
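
The same-block tie-break above, in source terms (an illustrative sketch; f
and g are hypothetical helpers): both stores are at distance one from the
load, and the later one is the value actually observed in the next iteration.

    for (unsigned i = 0; i < n; ++i) {
      A[i + 1] = f(i);  // earlier store: loses the tie-break
      A[i + 1] = g(i);  // later store: this one forwards to the next A[i]
    }
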
+
+  /// \brief Given two pointer operations by their RuntimePointerChecking
+  /// indices, return true if they require an alias check.
+  ///
+  /// We need a check if one is a pointer for a candidate load and the other is
+  /// a pointer for a possibly intervening store.
+  bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+                     const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+                     const std::set<Value *> &CandLoadPtrs) {
+    Value *Ptr1 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+    Value *Ptr2 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+    return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+            (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+  }
+
+  /// \brief Return pointers that are possibly written to on the path from a
+  /// forwarding store to a load.
+  ///
+  /// These pointers need to be alias-checked against the forwarding candidates.
+  SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+    // From FirstStore to LastLoad neither of the elimination candidate loads
+    // should overlap with any of the stores.
+    //
+    // E.g.:
+    //
+    // st1 C[i]
+    // ld1 B[i] <-------,
+    // ld0 A[i] <----,  |              * LastLoad
+    // ...           |  |
+    // st2 E[i]      |  |
+    // st3 B[i+1] -- | -'              * FirstStore
+    // st0 A[i+1] ---'
+    // st4 D[i]
+    //
+    // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+    // ld0.
+
+    LoadInst *LastLoad =
+        std::max_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+                         })
+            ->Load;
+    StoreInst *FirstStore =
+        std::min_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Store) <
+                                  getInstrIndex(B.Store);
+                         })
+            ->Store;
+
+    // We're looking for stores after the first forwarding store until the end
+    // of the loop, then from the beginning of the loop until the last
+    // forwarded-to load. Collect the pointers of these stores.
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+    auto InsertStorePtr = [&](Instruction *I) {
+      if (auto *S = dyn_cast<StoreInst>(I))
+        PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+    };
+    const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+    std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+                  MemInstrs.end(), InsertStorePtr);
+    std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+                  InsertStorePtr);
+
+    return PtrsWrittenOnFwdingPath;
+  }
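
The two std::for_each calls implement a window that wraps around the
backedge; an equivalent index formulation (a sketch, writing f for
getInstrIndex(FirstStore) and l for getInstrIndex(LastLoad)):

    for (size_t k = f + 1; k < MemInstrs.size(); ++k) // after FirstStore ...
      InsertStorePtr(MemInstrs[k]);
    for (size_t k = 0; k < l; ++k)                    // ... around to LastLoad
      InsertStorePtr(MemInstrs[k]);
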
+
+  /// \brief Determine the pointer alias checks to prove that there are no
+  /// intervening stores.
+  SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+        findPointersWrittenOnForwardingPath(Candidates);
+
+    // Collect the pointers of the candidate loads.
+    // FIXME: SmallSet does not work with std::inserter.
+    std::set<Value *> CandLoadPtrs;
+    std::transform(Candidates.begin(), Candidates.end(),
+                   std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
+                   std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+
+    const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+                 [&](const RuntimePointerChecking::PointerCheck &Check) {
+                   for (auto PtrIdx1 : Check.first->Members)
+                     for (auto PtrIdx2 : Check.second->Members)
+                       if (needsChecking(PtrIdx1, PtrIdx2,
+                                         PtrsWrittenOnFwdingPath, CandLoadPtrs))
+                         return true;
+                   return false;
+                 });
+
+    DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
+    DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+    return Checks;
+  }
+
+  /// \brief Perform the transformation for a candidate.
+  void
+  propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+                                  SCEVExpander &SEE) {
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    Value *Ptr = Cand.Load->getPointerOperand();
+    auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
+    auto *PH = L->getLoopPreheader();
+    Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+                                          PH->getTerminator());
+    Value *Initial =
+        new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+    PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+                                   &L->getHeader()->front());
+    PHI->addIncoming(Initial, PH);
+    PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+    Cand.Load->replaceAllUsesWith(PHI);
+  }
+
+  /// \brief Top-level driver for each loop: find store->load forwarding
+  /// candidates, add run-time checks and perform transformation.
+  bool processLoop() {
+    DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+                 << "\" checking " << *L << "\n");
+    // Look for store-to-load forwarding cases across the
+    // backedge. E.g.:
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    // First start with store->load dependences.
+    auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Generate an index for each load and store according to the original
+    // program order. This will be used later.
+    InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+    // To keep things simple for now, remove those where the load is
+    // potentially fed by multiple stores.
+    removeDependencesFromMultipleStores(StoreToLoadDependences);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Filter the candidates further.
+    SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+    unsigned NumForwarding = 0;
+    for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
+      DEBUG(dbgs() << "Candidate " << Cand);
+      // Make sure that the stored value is available everywhere in the loop in
+      // the next iteration.
+      if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+        continue;
+
+      // Check whether the SCEV difference is the same as the induction step,
+      // thus we load the value in the next iteration.
+      if (!Cand.isDependenceDistanceOfOne(PSE))
+        continue;
+
+      ++NumForwarding;
+      DEBUG(dbgs()
+            << NumForwarding
+            << ". Valid store-to-load forwarding across the loop backedge\n");
+      Candidates.push_back(Cand);
+    }
+    if (Candidates.empty())
+      return false;
+
+    // Check intervening may-alias stores. These need runtime checks for alias
+    // disambiguation.
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
+        collectMemchecks(Candidates);
+
+    // Too many checks are likely to outweigh the benefits of forwarding.
+    if (Checks.size() > Candidates.size() * CheckPerElim) {
+      DEBUG(dbgs() << "Too many run-time checks needed.\n");
+      return false;
+    }
+
+    if (LAI.PSE.getUnionPredicate().getComplexity() >
+        LoadElimSCEVCheckThreshold) {
+      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+      return false;
+    }
+
+    // Point of no-return, start the transformation. First, version the loop
+    // if necessary.
+    if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) {
+      LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false);
+      LV.setAliasChecks(std::move(Checks));
+      LV.setSCEVChecks(LAI.PSE.getUnionPredicate());
+      LV.versionLoop();
+    }
+
+    // Next, propagate the value stored by the store to the users of the load.
+    // Also for the first iteration, generate the initial value of the load.
+    SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
+                     "storeforward");
+    for (const auto &Cand : Candidates)
+      propagateStoredValueToLoadUsers(Cand, SEE);
+    NumLoopLoadEliminted += NumForwarding;
+
+    return true;
+  }
+
+private:
+  Loop *L;
+
+  /// \brief Maps the load/store instructions to their index according to
+  /// program order.
+  DenseMap<Instruction *, unsigned> InstOrder;
+
+  // Analyses used.
+  LoopInfo *LI;
+  const LoopAccessInfo &LAI;
+  DominatorTree *DT;
+  PredicatedScalarEvolution PSE;
+};
+
+/// \brief The pass. Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+  LoopLoadElimination() : FunctionPass(ID) {
+    initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto *LAA = &getAnalysis<LoopAccessAnalysis>();
+    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+    // Build up a worklist of inner-most loops to process. This is necessary
+    // as versioning a loop creates new loops and can invalidate iterators
+    // across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *TopLevelLoop : *LI)
+      for (Loop *L : depth_first(TopLevelLoop))
+        // We only handle inner-most loops.
+        if (L->empty())
+          Worklist.push_back(L);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    for (Loop *L : Worklist) {
+      const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+      // The actual work is performed by LoadEliminationForLoop.
+      LoadEliminationForLoop LEL(L, LI, LAI, DT);
+      Changed |= LEL.processLoop();
+    }
+ return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + static char ID; +}; +} + +char LoopLoadElimination::ID; +static const char LLE_name[] = "Loop Load Elimination"; + +INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) + +namespace llvm { +FunctionPass *createLoopLoadEliminationPass() { + return new LoopLoadElimination(); +} +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index ed103e6b8ed6..27c2d8824df0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -147,12 +147,12 @@ namespace { bool runOnLoop(Loop *L, LPPassManager &LPM) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -162,11 +162,15 @@ namespace { ScalarEvolution *SE; TargetLibraryInfo *TLI; DominatorTree *DT; + bool PreserveLCSSA; typedef SmallVector<Instruction *, 16> SmallInstructionVector; typedef SmallSet<Instruction *, 16> SmallInstructionSet; - // A chain of isomorphic instructions, indentified by a single-use PHI, + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> IVToIncMap; + + // A chain of isomorphic instructions, identified by a single-use PHI // representing a reduction. Only the last value may be used outside the // loop. struct SimpleLoopReduction { @@ -300,22 +304,6 @@ namespace { // The functions below can be called after we've finished processing all // instructions in the loop, and we know which reductions were selected. - // Is the provided instruction the PHI of a reduction selected for - // rerolling? 
- bool isSelectedPHI(Instruction *J) { - if (!isa<PHINode>(J)) - return false; - - for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - if (cast<Instruction>(J) == PossibleReds[i].getPHI()) - return true; - } - - return false; - } - bool validateSelected(); void replaceSelected(); @@ -335,7 +323,7 @@ namespace { // x[i*3+1] = y2 // x[i*3+2] = y3 // - // Base instruction -> i*3 + // Base instruction -> i*3 // +---+----+ // / | \ // ST[y1] +1 +2 <-- Roots @@ -366,8 +354,11 @@ namespace { struct DAGRootTracker { DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {} + TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA, + DenseMap<Instruction *, int64_t> &IncrMap) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), + PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {} /// Stage 1: Find all the DAG roots for the induction variable. bool findRoots(); @@ -413,11 +404,14 @@ namespace { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; + DominatorTree *DT; + LoopInfo *LI; + bool PreserveLCSSA; // The loop induction variable. Instruction *IV; // Loop step amount. - uint64_t Inc; + int64_t Inc; // Loop reroll count; if Inc == 1, this records the scaling applied // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; // If Inc is not 1, Scale = Inc. @@ -430,6 +424,8 @@ namespace { // they are used in (or specially, IL_All for instructions // used in the loop increment mechanism). UsesTy Uses; + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> &IVToIncMap; }; void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); @@ -442,10 +438,10 @@ namespace { char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -477,21 +473,20 @@ void LoopReroll::collectPossibleIVs(Loop *L, continue; if (const SCEVAddRecExpr *PHISCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) { if (PHISCEV->getLoop() != L) continue; if (!PHISCEV->isAffine()) continue; if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { - if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + const APInt &AInt = IncSCEV->getAPInt().abs(); + if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc)) continue; - if (IncSCEV->getValue()->uge(MaxInc)) - continue; - - DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << - *PHISCEV << "\n"); - PossibleIVs.push_back(I); + IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV + << "\n"); + PossibleIVs.push_back(&*I); } } } @@ -552,7 +547,7 @@ void LoopReroll::collectPossibleReductions(Loop *L, if (!I->getType()->isSingleValueType()) continue; - SimpleLoopReduction SLR(I, L); + SimpleLoopReduction SLR(&*I, L); if (!SLR.valid()) continue; @@ -699,17 +694,11 @@ collectPossibleRoots(Instruction *Base, 
std::map<int64_t,Instruction*> &Roots) { } } - int64_t V = CI->getValue().getSExtValue(); + int64_t V = std::abs(CI->getValue().getSExtValue()); if (Roots.find(V) != Roots.end()) // No duplicates, please. return false; - // FIXME: Add support for negative values. - if (V < 0) { - DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); - return false; - } - Roots[V] = cast<Instruction>(I); } @@ -731,7 +720,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { unsigned NumBaseUses = BaseUsers.size(); if (NumBaseUses == 0) NumBaseUses = Roots.begin()->second->getNumUses(); - + // Check that every node has the same number of users. for (auto &KV : Roots) { if (KV.first == 0) @@ -744,7 +733,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { } } - return true; + return true; } bool LoopReroll::DAGRootTracker:: @@ -787,7 +776,7 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { if (!collectPossibleRoots(IVU, V)) return false; - // If we didn't get a root for index zero, then IVU must be + // If we didn't get a root for index zero, then IVU must be // subsumed. if (V.find(0) == V.end()) SubsumedInsts.insert(IVU); @@ -818,13 +807,10 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { } bool LoopReroll::DAGRootTracker::findRoots() { - - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); - Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> - getValue()->getZExtValue(); + Inc = IVToIncMap[IV]; assert(RootSets.empty() && "Unclean state!"); - if (Inc == 1) { + if (std::abs(Inc) == 1) { for (auto *IVU : IV->users()) { if (isLoopIncrement(IVU, IV)) LoopIncs.push_back(cast<Instruction>(IVU)); @@ -996,6 +982,25 @@ bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, return false; } +static bool isIgnorableInst(const Instruction *I) { + if (isa<DbgInfoIntrinsic>(I)) + return true; + const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + switch (II->getIntrinsicID()) { + default: + return false; + case llvm::Intrinsic::annotation: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + // TODO: the following intrinsics may also be whitelisted: + // lifetime_start, lifetime_end, invariant_start, invariant_end + return true; + } + return false; +} + bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // We now need to check for equivalence of the use graph of each root with // that of the primary induction variable (excluding the roots). Our goal @@ -1029,7 +1034,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // Make sure all instructions in the loop are in one and only one // set. for (auto &KV : Uses) { - if (KV.second.count() != 1) { + if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " << *KV.first << " (#uses=" << KV.second.count() << ")\n"); return false; @@ -1103,15 +1108,15 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { " vs. " << *RootInst << "\n"); return false; } - + RootIt = TryIt; RootInst = TryIt->first; } // All instructions between the last root and this root - // may belong to some other iteration. If they belong to a + // may belong to some other iteration. If they belong to a // future iteration, then they're dangerous to alias with. 
- // + // // Note that because we allow a limited amount of flexibility in the order // that we visit nodes, LastRootIt might be *before* RootIt, in which // case we've already checked this set of instructions so we shouldn't @@ -1267,6 +1272,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { ++J; } + bool Negative = IVToIncMap[IV] < 0; const DataLayout &DL = Header->getModule()->getDataLayout(); // We need to create a new induction variable for each different BaseInst. @@ -1275,13 +1281,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); const SCEV *Start = RealIVSCEV->getStart(); - const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> - (SE->getAddRecExpr(Start, - SE->getConstant(RealIVSCEV->getType(), 1), - L, SCEV::FlagAnyWrap)); + const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr( + Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L, + SCEV::FlagAnyWrap)); { // Limit the lifetime of SCEVExpander. SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); + Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front()); for (auto &KV : Uses) { if (KV.second.find_first() == 0) @@ -1294,8 +1299,8 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = - SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); + const SCEV *ICMinus1SCEV = SE->getMinusSCEV( + ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1)); Value *ICMinus1; // Iteration count minus 1 if (isa<SCEVConstant>(ICMinus1SCEV)) { @@ -1303,7 +1308,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { } else { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) - Preheader = InsertPreheaderForLoop(L, Parent); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), Preheader->getTerminator()); @@ -1444,13 +1449,14 @@ void LoopReroll::ReductionTracker::replaceSelected() { bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions) { - DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI); + DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, + IVToIncMap); if (!DAGRoots.findRoots()) return false; DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV << "\n"); - + if (!DAGRoots.validate(Reductions)) return false; if (!Reductions.validateSelected()) @@ -1469,11 +1475,12 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << @@ -1490,13 +1497,13 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; const SCEV *LIBETC = SE->getBackedgeTakenCount(L); - const SCEV *IterCount = - 
SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType())); DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); // First, we need to find the induction variable with respect to which we can // reroll (there may be several possible options). SmallInstructionVector PossibleIVs; + IVToIncMap.clear(); collectPossibleIVs(L, PossibleIVs); if (PossibleIVs.empty()) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index a675e1289baf..5e6c2da08cc3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,11 +13,15 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -41,95 +45,6 @@ DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); -namespace { - - class LoopRotate : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { - initializeLoopRotatePass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); - } - - // LCSSA form makes instruction renaming easier. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L, bool SimplifiedLatch); - - private: - unsigned MaxHeaderSize; - LoopInfo *LI; - const TargetTransformInfo *TTI; - AssumptionCache *AC; - DominatorTree *DT; - }; -} - -char LoopRotate::ID = 0; -INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) - -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { - return new LoopRotate(MaxHeaderSize); -} - -/// Rotate Loop L as many times as possible. Return true if -/// the loop is rotated at least once. -bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - // Save the loop metadata. 
- MDNode *LoopMD = L->getLoopID(); - - Function &F = *L->getHeader()->getParent(); - - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - - // Simplify the loop latch before attempting to rotate the header - // upward. Rotation may not be needed if the loop tail can be folded into the - // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L); - - // One loop can be rotated multiple times. - bool MadeChange = false; - while (rotateLoop(L, SimplifiedLatch)) { - MadeChange = true; - SimplifiedLatch = false; - } - - // Restore the loop metadata. - // NB! We presume LoopRotation DOESN'T ADD its own metadata. - if ((MadeChange || SimplifiedLatch) && LoopMD) - L->setLoopID(LoopMD); - - return MadeChange; -} /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by @@ -147,7 +62,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // as necessary. SSAUpdater SSA; for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = I; + Value *OrigHeaderVal = &*I; // If there are no uses of the value (e.g. because it returns void), there // is nothing to rewrite. @@ -196,127 +111,6 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End, Loop *L) { - bool seenIncrement = false; - bool MultiExitLoop = false; - - if (!L->getExitingBlock()) - MultiExitLoop = true; - - for (BasicBlock::iterator I = Begin; I != End; ++I) { - - if (!isSafeToSpeculativelyExecute(I)) - return false; - - if (isa<DbgInfoIntrinsic>(I)) - continue; - - switch (I->getOpcode()) { - default: - return false; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GEPOperator>(I)->hasAllConstantIndices()) - return false; - // fall-thru to increment case - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: { - Value *IVOpnd = !isa<Constant>(I->getOperand(0)) - ? I->getOperand(0) - : !isa<Constant>(I->getOperand(1)) - ? I->getOperand(1) - : nullptr; - if (!IVOpnd) - return false; - - // If increment operand is used outside of the loop, this speculation - // could cause extra live range interference. - if (MultiExitLoop) { - for (User *UseI : IVOpnd->users()) { - auto *UserInst = cast<Instruction>(UseI); - if (!L->contains(UserInst)) - return false; - } - } - - if (seenIncrement) - return false; - seenIncrement = true; - break; - } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - // ignore type conversions - break; - } - } - return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. 
In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || Latch->hasAddressTaken()) - return false; - - BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); - if (!Jmp || !Jmp->isUnconditional()) - return false; - - BasicBlock *LastExit = Latch->getSinglePredecessor(); - if (!LastExit || !L->isLoopExiting(LastExit)) - return false; - - BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); - if (!BI) - return false; - - if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L)) - return false; - - DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); - - // Hoist the instructions from Latch into LastExit. - LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); - return true; -} - /// Rotate loop LP. Return true if the loop is rotated. /// /// \param SimplifiedLatch is true if the latch was just folded into the final @@ -327,7 +121,10 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { +static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, + bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -382,7 +179,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Anything ScalarEvolution may know about this loop or the PHI nodes // in its header will soon be invalidated. - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + if (SE) SE->forgetLoop(L); DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); @@ -420,7 +217,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); while (I != E) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // If the instruction's operands are invariant and it doesn't read or write // memory, then it is safe to hoist. Doing this doesn't change the order of @@ -465,8 +262,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's // successors by duplicating their incoming values for OrigHeader. 
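The rotation performed here is easier to see at the source level. A minimal sketch, assuming a simple counted loop; the function names and the printf harness are illustrative only, not taken from the patch:

#include <cstdio>

// Top-tested form: the exit test sits in the loop header, so control flows
// header -> body -> latch -> header on every iteration.
static int sumTopTested(const int *A, int N) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[I];
  return S;
}

// Rotated form: one cloned copy of the header test guards entry, and the
// loop body becomes bottom-tested, which is the shape rotateLoop produces.
static int sumRotated(const int *A, int N) {
  int S = 0;
  int I = 0;
  if (I < N) {        // guard: the cloned exit test, left in the preheader
    do {
      S += A[I];
      ++I;
    } while (I < N);  // the latch now carries the only exit test
  }
  return S;
}

int main() {
  const int A[] = {1, 2, 3, 4};
  std::printf("%d %d\n", sumTopTested(A, 4), sumRotated(A, 4));
  return 0;
}

Both functions compute the same sum; the rotated shape is what lets later passes treat the guarded preheader as a place to hoist loop-invariant work, as the hunk below does while cloning instructions into OrigPreHeader.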
TerminatorInst *TI = OrigHeader->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); + for (BasicBlock *SuccBB : TI->successors()) + for (BasicBlock::iterator BI = SuccBB->begin(); PHINode *PN = dyn_cast<PHINode>(BI); ++BI) PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); @@ -607,3 +404,221 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { ++NumRotated; return true; } + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, + BasicBlock::iterator End, Loop *L) { + bool seenIncrement = false; + bool MultiExitLoop = false; + + if (!L->getExitingBlock()) + MultiExitLoop = true; + + for (BasicBlock::iterator I = Begin; I != End; ++I) { + + if (!isSafeToSpeculativelyExecute(&*I)) + return false; + + if (isa<DbgInfoIntrinsic>(I)) + continue; + + switch (I->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast<GEPOperator>(I)->hasAllConstantIndices()) + return false; + // fall-thru to increment case + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: { + Value *IVOpnd = !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) + ? I->getOperand(1) + : nullptr; + if (!IVOpnd) + return false; + + // If increment operand is used outside of the loop, this speculation + // could cause extra live range interference. + if (MultiExitLoop) { + for (User *UseI : IVOpnd->users()) { + auto *UserInst = cast<Instruction>(UseI); + if (!L->contains(UserInst)) + return false; + } + } + + if (seenIncrement) + return false; + seenIncrement = true; + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // ignore type conversions + break; + } + } + return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return false; + + BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return false; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return false; + + BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); + if (!BI) + return false; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) + return false; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. 
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), + Latch->begin(), Jmp->getIterator()); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DT) + DT->eraseNode(Latch); + Latch->eraseFromParent(); + return true; +} + +/// Rotate \c L as many times as possible. Return true if the loop is rotated +/// at least once. +static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, + AssumptionCache *AC, DominatorTree *DT, + ScalarEvolution *SE) { + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT); + + // One loop can be rotated multiple times. + bool MadeChange = false; + while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) { + MadeChange = true; + SimplifiedLatch = false; + } + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + + return MadeChange; +} + +namespace { + +class LoopRotate : public LoopPass { + unsigned MaxHeaderSize; + +public: + static char ID; // Pass ID, replacement for typeid + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); + } + + // LCSSA form makes instruction renaming easier. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipOptnoneFunction(L)) + return false; + Function &F = *L->getHeader()->getParent(); + + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + auto *SE = SEWP ? 
&SEWP->getSE() : nullptr; + + return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE); + } +}; +} + +char LoopRotate::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) + +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 4b59f3d2f6cc..2101225ed9f7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -105,10 +105,33 @@ static bool StressIVChain = false; namespace { -/// RegSortData - This class holds data which is used to order reuse candidates. +struct MemAccessTy { + /// Used in situations where the accessed memory type is unknown. + static const unsigned UnknownAddressSpace = ~0u; + + Type *MemTy; + unsigned AddrSpace; + + MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {} + + MemAccessTy(Type *Ty, unsigned AS) : + MemTy(Ty), AddrSpace(AS) {} + + bool operator==(MemAccessTy Other) const { + return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; + } + + bool operator!=(MemAccessTy Other) const { return !(*this == Other); } + + static MemAccessTy getUnknown(LLVMContext &Ctx) { + return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace); + } +}; + +/// This class holds data which is used to order reuse candidates. class RegSortData { public: - /// UsedByIndices - This represents the set of LSRUse indices which reference + /// This represents the set of LSRUse indices which reference /// a particular register. SmallBitVector UsedByIndices; @@ -122,16 +145,14 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegSortData::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// RegUseTracker - Map register candidates to information about how they are -/// used. +/// Map register candidates to information about how they are used. 
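The MemAccessTy type added above exists because addressing-mode legality depends on the address space as well as on the accessed type: the same i32 load may fold an offset in one address space but not in another, which is why the isLegalAddressingMode calls later in this file now pass AccessTy.AddrSpace through. A standalone sketch of the value-type idea, with a stand-in for llvm::Type and invented address-space numbers:

#include <cstdio>

struct Ty {}; // stand-in for llvm::Type; only pointer identity matters here

struct MemAccess {
  static const unsigned UnknownAddressSpace = ~0u;
  const Ty *MemTy;
  unsigned AddrSpace;
  bool operator==(const MemAccess &O) const {
    return MemTy == O.MemTy && AddrSpace == O.AddrSpace;
  }
  bool operator!=(const MemAccess &O) const { return !(*this == O); }
};

int main() {
  Ty I32;
  MemAccess GlobalLoad = {&I32, 1u}; // hypothetical "global" address space
  MemAccess LocalLoad = {&I32, 3u};  // hypothetical "local" address space
  // Same accessed type, different address spaces: the two accesses must not
  // share an addressing-mode decision, so the pair compares unequal.
  std::printf("%s\n", GlobalLoad != LocalLoad ? "distinct" : "same");
  return 0;
}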
class RegUseTracker { typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; @@ -139,9 +160,9 @@ class RegUseTracker { SmallVector<const SCEV *, 16> RegSequence; public: - void CountRegister(const SCEV *Reg, size_t LUIdx); - void DropRegister(const SCEV *Reg, size_t LUIdx); - void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); + void countRegister(const SCEV *Reg, size_t LUIdx); + void dropRegister(const SCEV *Reg, size_t LUIdx); + void swapAndDropUse(size_t LUIdx, size_t LastLUIdx); bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; @@ -160,7 +181,7 @@ public: } void -RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) { std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.insert(std::make_pair(Reg, RegSortData())); RegSortData &RSD = Pair.first->second; @@ -171,7 +192,7 @@ RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) { RegUsesTy::iterator It = RegUsesMap.find(Reg); assert(It != RegUsesMap.end()); RegSortData &RSD = It->second; @@ -180,7 +201,7 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { +RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) { assert(LUIdx <= LastLUIdx); // Update RegUses. The data structure is not optimized for this purpose; @@ -219,9 +240,8 @@ void RegUseTracker::clear() { namespace { -/// Formula - This class holds information that describes a formula for -/// computing satisfying a use. It may include broken-out immediates and scaled -/// registers. +/// This class holds information that describes a formula for computing a +/// value that satisfies a use. It may include broken-out immediates and scaled +/// registers. struct Formula { /// Global base address used for complex addressing. GlobalValue *BaseGV; @@ -235,8 +255,8 @@ struct Formula { /// The scale of any complex addressing. int64_t Scale; - /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty. The canonical representation of a formula is + /// The list of "base" registers for this use. When this is non-empty, the + /// canonical representation of a formula is: /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). /// #1 enforces that the scaled register is always used when at least two /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1*reg2. /// #2 enforces that 1 * reg is reg. /// This invariant can be temporarily broken while building a formula. /// However, every formula inserted into the LSRInstance must be in canonical /// form. SmallVector<const SCEV *, 4> BaseRegs; - /// ScaledReg - The 'scaled' register for this use. This should be non-null - /// when Scale is not zero. + /// The 'scaled' register for this use. This should be non-null when Scale is + /// not zero. const SCEV *ScaledReg; - /// UnfoldedOffset - An additional constant offset which added near the - /// use. This requires a temporary register, but the offset itself can - /// live in an add immediate field rather than a register. + /// An additional constant offset which is added near the use. This requires a + /// temporary register, but the offset itself can live in an add immediate + /// field rather than a register.
int64_t UnfoldedOffset; Formula() : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(nullptr), UnfoldedOffset(0) {} - void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); + void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); bool isCanonical() const; - void Canonicalize(); + void canonicalize(); - bool Unscale(); + bool unscale(); size_t getNumRegs() const; Type *getType() const; - void DeleteBaseReg(const SCEV *&S); + void deleteBaseReg(const SCEV *&S); bool referencesReg(const SCEV *S) const; bool hasRegsUsedByUsesOtherThan(size_t LUIdx, @@ -283,7 +303,7 @@ struct Formula { } -/// DoInitialMatch - Recursion helper for InitialMatch. +/// Recursion helper for initialMatch. static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl<const SCEV *> &Good, SmallVectorImpl<const SCEV *> &Bad, @@ -336,10 +356,9 @@ static void DoInitialMatch(const SCEV *S, Loop *L, Bad.push_back(S); } -/// InitialMatch - Incorporate loop-variant parts of S into this Formula, -/// attempting to keep all loop-invariant and loop-computable values in a -/// single base register. +/// Incorporate loop-variant parts of S into this Formula, attempting to keep +/// all loop-invariant and loop-computable values in a single base register. void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { SmallVector<const SCEV *, 4> Good; SmallVector<const SCEV *, 4> Bad; DoInitialMatch(S, L, Good, Bad, SE); @@ -355,7 +374,7 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } - Canonicalize(); + canonicalize(); } /// \brief Check whether or not this formula satisfies the canonical @@ -373,7 +392,7 @@ bool Formula::isCanonical() const { /// field. Otherwise, we would have to do special cases everywhere in LSR /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... /// On the other hand, 1*reg should be canonicalized into reg. -void Formula::Canonicalize() { +void Formula::canonicalize() { if (isCanonical()) return; // So far we did not need this case. This is easy to implement but it is @@ -394,7 +413,7 @@ void Formula::Canonicalize() { /// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2. /// \return true if it was possible to get rid of the scale, false otherwise. /// \note After this operation the formula may not be in the canonical form. -bool Formula::Unscale() { +bool Formula::unscale() { if (Scale != 1) return false; Scale = 0; @@ -403,15 +422,14 @@ bool Formula::Unscale() { return true; } -/// getNumRegs - Return the total number of register operands used by this -/// formula. This does not include register uses implied by non-constant -/// addrec strides. +/// Return the total number of register operands used by this formula. This does +/// not include register uses implied by non-constant addrec strides. size_t Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } -/// getType - Return the type of this formula, if it has one, or null -/// otherwise. This type is meaningless except for the bit size. +/// Return the type of this formula, if it has one, or null otherwise. This type +/// is meaningless except for the bit size. Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ?
ScaledReg->getType() : @@ -419,21 +437,21 @@ Type *Formula::getType() const { nullptr; } -/// DeleteBaseReg - Delete the given base reg from the BaseRegs list. -void Formula::DeleteBaseReg(const SCEV *&S) { +/// Delete the given base reg from the BaseRegs list. +void Formula::deleteBaseReg(const SCEV *&S) { if (&S != &BaseRegs.back()) std::swap(S, BaseRegs.back()); BaseRegs.pop_back(); } -/// referencesReg - Test if this formula references the given register. +/// Test if this formula references the given register. bool Formula::referencesReg(const SCEV *S) const { return S == ScaledReg || std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); } -/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers -/// which are used by uses other than the use with the given index. +/// Test whether this formula uses registers which are used by uses other than +/// the use with the given index. bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, const RegUseTracker &RegUses) const { if (ScaledReg) @@ -481,30 +499,29 @@ void Formula::print(raw_ostream &OS) const { } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Formula::dump() const { print(errs()); errs() << '\n'; } -#endif -/// isAddRecSExtable - Return true if the given addrec can be sign-extended -/// without changing its value. +/// Return true if the given addrec can be sign-extended without changing its +/// value. static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); } -/// isAddSExtable - Return true if the given add can be sign-extended -/// without changing its value. +/// Return true if the given add can be sign-extended without changing its +/// value. static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); } -/// isMulSExtable - Return true if the given mul can be sign-extended -/// without changing its value. +/// Return true if the given mul can be sign-extended without changing its +/// value. static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), @@ -512,12 +529,11 @@ static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); } -/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined -/// and if the remainder is known to be zero, or null otherwise. If -/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified -/// to Y, ignoring that the multiplication may overflow, which is useful when -/// the result will be used in a context where the most significant bits are -/// ignored. +/// Return an expression for LHS /s RHS, if it can be determined and if the +/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits +/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that +/// the multiplication may overflow, which is useful when the result will be +/// used in a context where the most significant bits are ignored. 
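For two constants, the contract described above reduces to plain integer arithmetic: divide only when the remainder is provably zero, and refuse the one combination that overflows. A self-contained sketch of that constant case (the helper name is invented; the real getExactSDiv works on SCEV trees and also handles the recursive add/mul cases below):

#include <cstdint>

// Set Out = LHS / RHS and return true only when the division is exact.
// INT64_MIN / -1 is the lone overflowing signed division and is rejected;
// the SCEV version sidesteps it by rewriting x /s -1 as x * -1.
static bool exactSDivConst(int64_t LHS, int64_t RHS, int64_t &Out) {
  if (RHS == 0)
    return false;                  // no meaningful quotient
  if (LHS == INT64_MIN && RHS == -1)
    return false;                  // would overflow
  if (LHS % RHS != 0)
    return false;                  // remainder not known to be zero
  Out = LHS / RHS;
  return true;
}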
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits = false) { @@ -528,7 +544,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Handle a few RHS special cases. const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); if (RC) { - const APInt &RA = RC->getValue()->getValue(); + const APInt &RA = RC->getAPInt(); // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do // some folding. if (RA.isAllOnesValue()) @@ -542,8 +558,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { if (!RC) return nullptr; - const APInt &LA = C->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); + const APInt &LA = C->getAPInt(); + const APInt &RA = RC->getAPInt(); if (LA.srem(RA) != 0) return nullptr; return SE.getConstant(LA.sdiv(RA)); @@ -603,12 +619,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, return nullptr; } -/// ExtractImmediate - If S involves the addition of a constant integer value, -/// return that integer value, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a constant integer value, return that integer +/// value, and mutate S to point to a new SCEV with that value excluded. static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { - if (C->getValue()->getValue().getMinSignedBits() <= 64) { + if (C->getAPInt().getMinSignedBits() <= 64) { S = SE.getConstant(C->getType(), 0); return C->getValue()->getSExtValue(); } @@ -630,9 +645,8 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { return 0; } -/// ExtractSymbol - If S involves the addition of a GlobalValue address, -/// return that symbol, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a GlobalValue address, return that symbol, and +/// mutate S to point to a new SCEV with that value excluded. static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { @@ -657,8 +671,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { return nullptr; } -/// isAddressUse - Returns true if the specified instruction is using the -/// specified value as an address. +/// Returns true if the specified instruction is using the specified value as an +/// address. static bool isAddressUse(Instruction *Inst, Value *OperandVal) { bool isAddress = isa<LoadInst>(Inst); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { @@ -682,12 +696,15 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { return isAddress; } -/// getAccessType - Return the type of the memory being accessed. -static Type *getAccessType(const Instruction *Inst) { - Type *AccessTy = Inst->getType(); - if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) - AccessTy = SI->getOperand(0)->getType(); - else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { +/// Return the type of the memory being accessed. 
+static MemAccessTy getAccessType(const Instruction *Inst) { + MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); + if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + AccessTy.MemTy = SI->getOperand(0)->getType(); + AccessTy.AddrSpace = SI->getPointerAddressSpace(); + } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + AccessTy.AddrSpace = LI->getPointerAddressSpace(); + } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { // Addressing modes can also be folded into prefetches and a variety // of intrinsics. switch (II->getIntrinsicID()) { @@ -696,21 +713,21 @@ static Type *getAccessType(const Instruction *Inst) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getArgOperand(0)->getType(); + AccessTy.MemTy = II->getArgOperand(0)->getType(); break; } } // All pointers have the same requirements, so canonicalize them to an // arbitrary pointer type to minimize variation. - if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) - AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), - PTy->getAddressSpace()); + if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy)) + AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), + PTy->getAddressSpace()); return AccessTy; } -/// isExistingPhi - Return true if this AddRec is already a phi in its loop. +/// Return true if this AddRec is already a phi in its loop. static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { @@ -793,9 +810,8 @@ static bool isHighCostExpansion(const SCEV *S, return true; } -/// DeleteTriviallyDeadInstructions - If any of the instructions is the -/// specified set are trivially dead, delete them and see if this makes any of -/// their operands subsequently dead. +/// If any of the instructions in the specified set are trivially dead, delete +/// them and see if this makes any of their operands subsequently dead. static bool DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { bool Changed = false; @@ -842,7 +858,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, namespace { -/// Cost - This class is used to measure and compare candidate formulae. +/// This class is used to measure and compare candidate formulae. class Cost { /// TODO: Some of these could be merged. Also, a lexical ordering /// isn't always optimal. @@ -905,7 +921,7 @@ private: } -/// RateRegister - Tally up interesting quantities from the given register. +/// Tally up interesting quantities from the given register. void Cost::RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -951,9 +967,9 @@ void Cost::RateRegister(const SCEV *Reg, SE.hasComputableLoopEvolution(Reg, L); } -/// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. Optional LoserRegs provides a way to declare any formula -/// that refers to one of those regs an instant loser. +/// Record this register in the set. If we haven't seen it before, rate +/// it. Optional LoserRegs provides a way to declare any formula that refers to +/// one of those regs an instant loser.
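DeleteTriviallyDeadInstructions above is the usual worklist cascade: removing one dead value can drop a use count to zero and expose its operands as dead in turn. A small self-contained model of that cascade, with integer-indexed nodes standing in for instructions and their use lists:

#include <cstddef>
#include <vector>

struct Node {
  std::vector<std::size_t> Operands; // indices of the nodes this one uses
  unsigned UseCount = 0;             // how many users remain
  bool Deleted = false;
};

// Delete every queued node that has no remaining users; whenever an operand's
// use count drops to zero, queue it too, so deadness propagates transitively.
static bool deleteTriviallyDead(std::vector<Node> &G,
                                std::vector<std::size_t> Work) {
  bool Changed = false;
  while (!Work.empty()) {
    std::size_t I = Work.back();
    Work.pop_back();
    Node &N = G[I];
    if (N.Deleted || N.UseCount != 0)
      continue;
    N.Deleted = true;
    Changed = true;
    for (std::size_t Op : N.Operands)
      if (--G[Op].UseCount == 0)
        Work.push_back(Op);
  }
  return Changed;
}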
void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -1024,7 +1040,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, assert(isValid() && "invalid cost"); } -/// Lose - Set this cost to a losing value. +/// Set this cost to a losing value. void Cost::Lose() { NumRegs = ~0u; AddRecCost = ~0u; @@ -1035,7 +1051,7 @@ void Cost::Lose() { ScaleCost = ~0u; } -/// operator< - Choose the lower cost. +/// Choose the lower cost. bool Cost::operator<(const Cost &Other) const { return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost, ImmCost, SetupCost) < @@ -1061,37 +1077,35 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Cost::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// LSRFixup - An operand value in an instruction which is to be replaced -/// with some equivalent, possibly strength-reduced, replacement. +/// An operand value in an instruction which is to be replaced with some +/// equivalent, possibly strength-reduced, replacement. struct LSRFixup { - /// UserInst - The instruction which will be updated. + /// The instruction which will be updated. Instruction *UserInst; - /// OperandValToReplace - The operand of the instruction which will - /// be replaced. The operand may be used more than once; every instance - /// will be replaced. + /// The operand of the instruction which will be replaced. The operand may be + /// used more than once; every instance will be replaced. Value *OperandValToReplace; - /// PostIncLoops - If this user is to use the post-incremented value of an - /// induction variable, this variable is non-null and holds the loop - /// associated with the induction variable. + /// If this user is to use the post-incremented value of an induction + /// variable, this variable is non-null and holds the loop associated with the + /// induction variable. PostIncLoopSet PostIncLoops; - /// LUIdx - The index of the LSRUse describing the expression which - /// this fixup needs, minus an offset (below). + /// The index of the LSRUse describing the expression which this fixup needs, + /// minus an offset (below). size_t LUIdx; - /// Offset - A constant offset to be added to the LSRUse expression. - /// This allows multiple fixups to share the same LSRUse with different - /// offsets, for example in an unrolled loop. + /// A constant offset to be added to the LSRUse expression. This allows + /// multiple fixups to share the same LSRUse with different offsets, for + /// example in an unrolled loop. int64_t Offset; bool isUseFullyOutsideLoop(const Loop *L) const; @@ -1108,8 +1122,7 @@ LSRFixup::LSRFixup() : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), Offset(0) {} -/// isUseFullyOutsideLoop - Test whether this fixup always uses its -/// value outside of the given loop. +/// Test whether this fixup always uses its value outside of the given loop. bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { // PHI nodes use their value in their incoming blocks. 
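Cost::operator< above relies on std::tie: it builds tuples of references, and tuples compare lexicographically, so the first field that differs decides the ordering, with NumRegs most significant and SetupCost least. The idiom in isolation (the field list mirrors the Cost class in this patch; the struct itself is a stand-in):

#include <tuple>

struct CostSketch {
  unsigned NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost;
  long long ImmCost; // signed, like the immediate cost it models
  unsigned SetupCost;

  // Lexicographic comparison: earlier fields strictly dominate later ones.
  bool operator<(const CostSketch &O) const {
    return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                    ImmCost, SetupCost) <
           std::tie(O.NumRegs, O.AddRecCost, O.NumIVMuls, O.NumBaseAdds,
                    O.ScaleCost, O.ImmCost, O.SetupCost);
  }
};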
if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { @@ -1149,16 +1162,15 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRFixup::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding -/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. +/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted +/// SmallVectors of const SCEV*. struct UniquifierDenseMapInfo { static SmallVector<const SCEV *, 4> getEmptyKey() { SmallVector<const SCEV *, 4> V; @@ -1182,17 +1194,17 @@ struct UniquifierDenseMapInfo { } }; -/// LSRUse - This class holds the state that LSR keeps for each use in -/// IVUsers, as well as uses invented by LSR itself. It includes information -/// about what kinds of things can be folded into the user, information about -/// the user itself, and information about how the use may be satisfied. -/// TODO: Represent multiple users of the same expression in common? +/// This class holds the state that LSR keeps for each use in IVUsers, as well +/// as uses invented by LSR itself. It includes information about what kinds of +/// things can be folded into the user, information about the user itself, and +/// information about how the use may be satisfied. TODO: Represent multiple +/// users of the same expression in common? class LSRUse { DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; public: - /// KindType - An enum for a kind of use, indicating what types of - /// scaled and immediate operands it might support. + /// An enum for a kind of use, indicating what types of scaled and immediate + /// operands it might support. enum KindType { Basic, ///< A normal use, with no folding. Special, ///< A special case of basic, allowing -1 scales. @@ -1204,15 +1216,14 @@ public: typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; KindType Kind; - Type *AccessTy; + MemAccessTy AccessTy; SmallVector<int64_t, 8> Offsets; int64_t MinOffset; int64_t MaxOffset; - /// AllFixupsOutsideLoop - This records whether all of the fixups using this - /// LSRUse are outside of the loop, in which case some special-case heuristics - /// may be used. + /// This records whether all of the fixups using this LSRUse are outside of + /// the loop, in which case some special-case heuristics may be used. bool AllFixupsOutsideLoop; /// RigidFormula is set to true to guarantee that this use will be associated @@ -1222,26 +1233,24 @@ public: /// changing the formula. bool RigidFormula; - /// WidestFixupType - This records the widest use type for any fixup using - /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different - /// max fixup widths to be equivalent, because the narrower one may be relying - /// on the implicit truncation to truncate away bogus bits. + /// This records the widest use type for any fixup using this + /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max + /// fixup widths to be equivalent, because the narrower one may be relying on + /// the implicit truncation to truncate away bogus bits. Type *WidestFixupType; - /// Formulae - A list of ways to build a value that can satisfy this user. - /// After the list is populated, one of these is selected heuristically and - /// used to formulate a replacement for OperandValToReplace in UserInst. 
+ /// A list of ways to build a value that can satisfy this user. After the + /// list is populated, one of these is selected heuristically and used to + /// formulate a replacement for OperandValToReplace in UserInst. SmallVector<Formula, 12> Formulae; - /// Regs - The set of register candidates used by all formulae in this LSRUse. + /// The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), - MinOffset(INT64_MAX), - MaxOffset(INT64_MIN), - AllFixupsOutsideLoop(true), - RigidFormula(false), - WidestFixupType(nullptr) {} + LSRUse(KindType K, MemAccessTy AT) + : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), + AllFixupsOutsideLoop(true), RigidFormula(false), + WidestFixupType(nullptr) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); @@ -1254,8 +1263,8 @@ public: } -/// HasFormula - Test whether this use as a formula which has the same -/// registers as the given formula. +/// Test whether this use has a formula which has the same registers as the +/// given formula. bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); @@ -1264,9 +1273,8 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { return Uniquifier.count(Key); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. -/// The formula must be in canonical form. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true. Return false otherwise. The formula must be in canonical form. bool LSRUse::InsertFormula(const Formula &F) { assert(F.isCanonical() && "Invalid canonical representation"); @@ -1300,14 +1308,14 @@ bool LSRUse::InsertFormula(const Formula &F) { return true; } -/// DeleteFormula - Remove the given formula from this use's list. +/// Remove the given formula from this use's list. void LSRUse::DeleteFormula(Formula &F) { if (&F != &Formulae.back()) std::swap(F, Formulae.back()); Formulae.pop_back(); } -/// RecomputeRegs - Recompute the Regs field, and update RegUses. +/// Recompute the Regs field, and update RegUses. void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Now that we've filtered out some formulae, recompute the Regs set. SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); @@ -1320,7 +1328,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Update the RegTracker.
for (const SCEV *S : OldRegs) if (!Regs.count(S)) - RegUses.DropRegister(S, LUIdx); + RegUses.dropRegister(S, LUIdx); } void LSRUse::print(raw_ostream &OS) const { @@ -1331,10 +1339,13 @@ void LSRUse::print(raw_ostream &OS) const { case ICmpZero: OS << "ICmpZero"; break; case Address: OS << "Address of "; - if (AccessTy->isPointerTy()) + if (AccessTy.MemTy->isPointerTy()) OS << "pointer"; // the full pointer type could be really verbose - else - OS << *AccessTy; + else { + OS << *AccessTy.MemTy; + } + + OS << " in addrspace(" << AccessTy.AddrSpace << ')'; } OS << ", Offsets={"; @@ -1353,19 +1364,19 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRUse::dump() const { print(errs()); errs() << '\n'; } -#endif static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, + HasBaseReg, Scale, AccessTy.AddrSpace); case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to @@ -1412,7 +1423,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Check for overflow. @@ -1433,7 +1444,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F) { // For the purpose of isAMCompletelyFolded either having a canonical formula // or a scale not equal to zero is correct. @@ -1447,11 +1458,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } -/// isLegalUse - Test whether we know how to expand the current formula. +/// Test whether we know how to expand the current formula. static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // We know how to expand completely foldable formulae. 
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale) || @@ -1463,8 +1474,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - const Formula &F) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, const Formula &F) { return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } @@ -1490,14 +1501,12 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { // Check the scaling factor cost with both the min and max offsets. - int ScaleCostMinOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MinOffset, - F.HasBaseReg, F.Scale); - int ScaleCostMaxOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MaxOffset, - F.HasBaseReg, F.Scale); + int ScaleCostMinOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); + int ScaleCostMaxOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && "Legal addressing mode has an illegal cost!"); @@ -1515,7 +1524,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg) { // Fast-path: zero is always foldable. @@ -1539,7 +1548,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, ScalarEvolution &SE, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, - Type *AccessTy, const SCEV *S, bool HasBaseReg) { + MemAccessTy AccessTy, const SCEV *S, + bool HasBaseReg) { // Fast-path: zero is always foldable. if (S->isZero()) return true; @@ -1564,9 +1574,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, namespace { -/// IVInc - An individual increment in a Chain of IV increments. -/// Relate an IV user to an expression that computes the IV it uses from the IV -/// used by the previous link in the Chain. +/// An individual increment in a Chain of IV increments. Relate an IV user to +/// an expression that computes the IV it uses from the IV used by the previous +/// link in the Chain. /// /// For the head of a chain, IncExpr holds the absolute SCEV expression for the /// original IVOperand. The head of the chain's IVOperand is only valid during @@ -1582,8 +1592,8 @@ struct IVInc { UserInst(U), IVOperand(O), IncExpr(E) {} }; -// IVChain - The list of IV increments in program order. -// We typically add the head of a chain without finding subsequent links. +// The list of IV increments in program order. We typically add the head of a +// chain without finding subsequent links. struct IVChain { SmallVector<IVInc,1> Incs; const SCEV *ExprBase; @@ -1595,7 +1605,7 @@ struct IVChain { typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; - // begin - return the first increment in the chain. + // Return the first increment in the chain. 
const_iterator begin() const { assert(!Incs.empty()); return std::next(Incs.begin()); @@ -1604,32 +1614,30 @@ struct IVChain { return Incs.end(); } - // hasIncs - Returns true if this chain contains any increments. + // Returns true if this chain contains any increments. bool hasIncs() const { return Incs.size() >= 2; } - // add - Add an IVInc to the end of this chain. + // Add an IVInc to the end of this chain. void add(const IVInc &X) { Incs.push_back(X); } - // tailUserInst - Returns the last UserInst in the chain. + // Returns the last UserInst in the chain. Instruction *tailUserInst() const { return Incs.back().UserInst; } - // isProfitableIncrement - Returns true if IncExpr can be profitably added to - // this chain. + // Returns true if IncExpr can be profitably added to this chain. bool isProfitableIncrement(const SCEV *OperExpr, const SCEV *IncExpr, ScalarEvolution&); }; -/// ChainUsers - Helper for CollectChains to track multiple IV increment uses. -/// Distinguish between FarUsers that definitely cross IV increments and -/// NearUsers that may be used between IV increments. +/// Helper for CollectChains to track multiple IV increment uses. Distinguish +/// between FarUsers that definitely cross IV increments and NearUsers that may +/// be used between IV increments. struct ChainUsers { SmallPtrSet<Instruction*, 4> FarUsers; SmallPtrSet<Instruction*, 4> NearUsers; }; -/// LSRInstance - This class holds state for the main loop strength reduction -/// logic. +/// This class holds state for the main loop strength reduction logic. class LSRInstance { IVUsers &IU; ScalarEvolution &SE; @@ -1639,25 +1647,25 @@ class LSRInstance { Loop *const L; bool Changed; - /// IVIncInsertPos - This is the insert position that the current loop's - /// induction variable increment should be placed. In simple loops, this is - /// the latch block's terminator. But in more complicated cases, this is a - /// position which will dominate all the in-loop post-increment users. + /// This is the insert position that the current loop's induction variable + /// increment should be placed. In simple loops, this is the latch block's + /// terminator. But in more complicated cases, this is a position which will + /// dominate all the in-loop post-increment users. Instruction *IVIncInsertPos; - /// Factors - Interesting factors between use strides. + /// Interesting factors between use strides. SmallSetVector<int64_t, 8> Factors; - /// Types - Interesting use types, to facilitate truncation reuse. + /// Interesting use types, to facilitate truncation reuse. SmallSetVector<Type *, 4> Types; - /// Fixups - The list of operands which are to be replaced. + /// The list of operands which are to be replaced. SmallVector<LSRFixup, 16> Fixups; - /// Uses - The list of interesting uses. + /// The list of interesting uses. SmallVector<LSRUse, 16> Uses; - /// RegUses - Track which uses use which register candidates. + /// Track which uses use which register candidates. RegUseTracker RegUses; // Limit the number of chains to avoid quadratic behavior. We don't expect to @@ -1665,10 +1673,10 @@ class LSRInstance { // back to normal LSR behavior for those uses. static const unsigned MaxChains = 8; - /// IVChainVec - IV users can form a chain of IV increments. + /// IV users can form a chain of IV increments. SmallVector<IVChain, MaxChains> IVChainVec; - /// IVIncSet - IV users that belong to profitable IVChains. + /// IV users that belong to profitable IVChains. 
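One subtlety worth flagging in IVChain above: Incs[0] holds the chain head (the original IV operand), not a real increment, which is why begin() skips the first element and hasIncs() requires at least two entries. The shape in miniature, with std::vector standing in for SmallVector and int for IVInc:

#include <cassert>
#include <iterator>
#include <vector>

struct ChainSketch {
  std::vector<int> Incs; // Incs[0] is the head; later entries are increments

  std::vector<int>::const_iterator begin() const {
    assert(!Incs.empty());
    return std::next(Incs.begin()); // iteration deliberately skips the head
  }
  std::vector<int>::const_iterator end() const { return Incs.end(); }

  // A chain only becomes interesting once a real increment follows the head.
  bool hasIncs() const { return Incs.size() >= 2; }
};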
SmallPtrSet<Use*, MaxChains> IVIncSet; void OptimizeShadowIV(); @@ -1696,11 +1704,10 @@ class LSRInstance { UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy); + LSRUse::KindType Kind, MemAccessTy AccessTy); - std::pair<size_t, int64_t> getUse(const SCEV *&Expr, - LSRUse::KindType Kind, - Type *AccessTy); + std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, + MemAccessTy AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -1769,18 +1776,16 @@ class LSRInstance { void RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; + SmallVectorImpl<WeakVH> &DeadInsts) const; void Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; - void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P); + SmallVectorImpl<WeakVH> &DeadInsts) const; + void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution); public: - LSRInstance(Loop *L, Pass *P); + LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, + LoopInfo &LI, const TargetTransformInfo &TTI); bool getChanged() const { return Changed; } @@ -1793,8 +1798,8 @@ public: } -/// OptimizeShadowIV - If IV is used in a int-to-float cast -/// inside the loop then try to eliminate the cast operation. +/// If IV is used in a int-to-float cast inside the loop then try to eliminate +/// the cast operation. void LSRInstance::OptimizeShadowIV() { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) @@ -1902,9 +1907,8 @@ void LSRInstance::OptimizeShadowIV() { } } -/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, -/// set the IV user and stride information and return true, otherwise return -/// false. +/// If Cond has an operand that is an expression of an IV, set the IV user and +/// stride information and return true, otherwise return false. bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { for (IVStrideUse &U : IU) if (U.getUser() == Cond) { @@ -1917,8 +1921,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { return false; } -/// OptimizeMax - Rewrite the loop's terminating condition if it uses -/// a max computation. +/// Rewrite the loop's terminating condition if it uses a max computation. /// /// This is a narrow solution to a specific, but acute, problem. For loops /// like this: @@ -2076,8 +2079,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { return NewCond; } -/// OptimizeLoopTermCond - Change loop terminating condition to use the -/// postinc iv when possible. +/// Change loop terminating condition to use the postinc iv when possible. void LSRInstance::OptimizeLoopTermCond() { SmallPtrSet<Instruction *, 4> PostIncs; @@ -2152,16 +2154,18 @@ LSRInstance::OptimizeLoopTermCond() { C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse. 
- Type *AccessTy = getAccessType(UI->getUser()); + MemAccessTy AccessTy = getAccessType(UI->getUser()); int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; } } @@ -2180,7 +2184,7 @@ LSRInstance::OptimizeLoopTermCond() { ICmpInst *OldCond = Cond; Cond = cast<ICmpInst>(Cond->clone()); Cond->setName(L->getHeader()->getName() + ".termcond"); - ExitingBlock->getInstList().insert(TermBr, Cond); + ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond); // Clone the IVUse, as the old use still exists! CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); @@ -2213,15 +2217,14 @@ LSRInstance::OptimizeLoopTermCond() { } } -/// reconcileNewOffset - Determine if the given use can accommodate a fixup -/// at the given offset and other details. If so, update the use and -/// return true. -bool -LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy) { +/// Determine if the given use can accommodate a fixup at the given offset and +/// other details. If so, update the use and return true. +bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, + bool HasBaseReg, LSRUse::KindType Kind, + MemAccessTy AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; - Type *NewAccessTy = AccessTy; + MemAccessTy NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to // something conservative, however this can pessimize in the case that one of @@ -2232,8 +2235,10 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // Check for a mismatched access type, and fall back conservatively as needed. // TODO: Be less conservative when the type is similar and can use the same // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + if (Kind == LSRUse::Address) { + if (AccessTy != LU.AccessTy) + NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext()); + } // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { @@ -2257,12 +2262,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, return true; } -/// getUse - Return an LSRUse index and an offset value for a fixup which -/// needs the given expression, with the given kind and optional access type. -/// Either reuse an existing use or create a new one, as needed. -std::pair<size_t, int64_t> -LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, Type *AccessTy) { +/// Return an LSRUse index and an offset value for a fixup which needs the given +/// expression, with the given kind and optional access type. Either reuse an +/// existing use or create a new one, as needed. 
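The reuse scheme described here depends on stripping the immediate first: ExtractImmediate rewrites Expr in place, and fixups that then share the offset-stripped expression land on one LSRUse, each remembering its own constant. A toy model of that bookkeeping (an int64_t key stands in for the offset-stripped SCEV, and the legality reconciliation the real code performs is omitted):

#include <cstddef>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

struct UseTable {
  std::map<int64_t, std::size_t> UseMap; // stripped expression -> use index
  std::vector<int64_t> Uses;             // one entry per distinct expression

  // Return (use index, per-fixup offset) for the expression Base + Imm.
  std::pair<std::size_t, int64_t> getUse(int64_t Base, int64_t Imm) {
    auto It = UseMap.find(Base);
    if (It != UseMap.end())
      return {It->second, Imm};          // share the existing use
    std::size_t Idx = Uses.size();
    Uses.push_back(Base);
    UseMap.emplace(Base, Idx);
    return {Idx, Imm};                   // new use; the fixup keeps its offset
  }
};

With this shape, accesses at a[i], a[i+1], and a[i+4] all map to one use for the stripped expression of &a[i], carrying per-fixup offsets 0, 1, and 4.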
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + MemAccessTy AccessTy) { const SCEV *Copy = Expr; int64_t Offset = ExtractImmediate(Expr, SE); @@ -2300,18 +2305,18 @@ LSRInstance::getUse(const SCEV *&Expr, return std::make_pair(LUIdx, Offset); } -/// DeleteUse - Delete the given use from the Uses list. +/// Delete the given use from the Uses list. void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { if (&LU != &Uses.back()) std::swap(LU, Uses.back()); Uses.pop_back(); // Update RegUses. - RegUses.SwapAndDropUse(LUIdx, Uses.size()); + RegUses.swapAndDropUse(LUIdx, Uses.size()); } -/// FindUseWithFormula - Look for a use distinct from OrigLU which is has -/// a formula that has the same registers as the given formula. +/// Look for a use distinct from OrigLU which has a formula that has the same +/// registers as the given formula. LSRUse * LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, const LSRUse &OrigLU) { @@ -2396,14 +2401,14 @@ void LSRInstance::CollectInterestingTypesAndFactors() { if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } else if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, NewStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } } @@ -2415,9 +2420,9 @@ void LSRInstance::CollectInterestingTypesAndFactors() { DEBUG(print_factors_and_types(dbgs())); } -/// findIVOperand - Helper for CollectChains that finds an IV operand (computed -/// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped -/// Instructions to IVStrideUses, we could partially skip this. +/// Helper for CollectChains that finds an IV operand (computed by an AddRec in +/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to +/// IVStrideUses, we could partially skip this. static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE) { @@ -2436,29 +2441,28 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE, return OI; } -/// getWideOperand - IVChain logic must consistenctly peek base TruncInst -/// operands, so wrap it in a convenient helper. +/// IVChain logic must consistently peek base TruncInst operands, so wrap it in +/// a convenient helper. static Value *getWideOperand(Value *Oper) { if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) return Trunc->getOperand(0); return Oper; } -/// isCompatibleIVType - Return true if we allow an IV chain to include both -/// types. +/// Return true if we allow an IV chain to include both types. static bool isCompatibleIVType(Value *LVal, Value *RVal) { Type *LType = LVal->getType(); Type *RType = RVal->getType(); return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); } -/// getExprBase - Return an approximation of this SCEV expression's "base", or -/// NULL for any constant. Returning the expression itself is -/// conservative. Returning a deeper subexpression is more precise and valid as -/// long as it isn't less complex than another subexpression.
For expressions -/// involving multiple unscaled values, we need to return the pointer-type -/// SCEVUnknown. This avoids forming chains across objects, such as: -/// PrevOper==a[i], IVOper==b[i], IVInc==b-a. +/// Return an approximation of this SCEV expression's "base", or NULL for any +/// constant. Returning the expression itself is conservative. Returning a +/// deeper subexpression is more precise and valid as long as it isn't less +/// complex than another subexpression. For expressions involving multiple +/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids +/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i], +/// IVInc==b-a. /// /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost /// SCEVUnknown, we simply return the rightmost SCEV operand. @@ -2601,8 +2605,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, return cost < 0; } -/// ChainInstruction - Add this IV user to an existing chain or make it the head -/// of a new chain. +/// Add this IV user to an existing chain or make it the head of a new chain. void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, SmallVectorImpl<ChainUsers> &ChainUsersVec) { // When IVs are used as types of varying widths, they are generally converted @@ -2714,7 +2717,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); } -/// CollectChains - Populate the vector of Chains. +/// Populate the vector of Chains. /// /// This decreases ILP at the architecture level. Targets with ample registers, /// multiple memory ports, and no register renaming probably don't want @@ -2755,19 +2758,19 @@ void LSRInstance::CollectChains() { for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); I != E; ++I) { // Skip instructions that weren't seen by IVUsers analysis. - if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) + if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I)) continue; // Ignore users that are part of a SCEV expression. This way we only // consider leaf IV Users. This effectively rediscovers a portion of // IVUsers analysis but in program order this time. - if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) + if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I))) continue; // Remove this instruction from any NearUsers set it may be in. for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); ChainIdx < NChains; ++ChainIdx) { - ChainUsersVec[ChainIdx].NearUsers.erase(I); + ChainUsersVec[ChainIdx].NearUsers.erase(&*I); } // Search for operands that can be chained. SmallPtrSet<Instruction*, 4> UniqueOperands; @@ -2776,7 +2779,7 @@ void LSRInstance::CollectChains() { while (IVOpIter != IVOpEnd) { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); if (UniqueOperands.insert(IVOpInst).second) - ChainInstruction(I, IVOpInst, ChainUsersVec); + ChainInstruction(&*I, IVOpInst, ChainUsersVec); IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } } // Continue walking down the instructions. 
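The getExprBase comment above leans on SCEV's operand ordering: SCEVUnknown (and pointers in particular) sort rightmost, so the rightmost operand of a sum approximates the expression's base. A toy illustration of that rule, not SCEV itself:

#include <vector>

enum class Kind { Constant, Add, Unknown };

struct Expr {
  Kind K;
  std::vector<const Expr *> Ops; // Add operands, sorted by SCEV-like complexity.
};

// Approximate "base": null for any constant, the base of the rightmost
// operand for a sum (assumed non-empty here), and a leaf is its own base.
static const Expr *getExprBase(const Expr *E) {
  switch (E->K) {
  case Kind::Constant:
    return nullptr;
  case Kind::Add:
    return getExprBase(E->Ops.back());
  case Kind::Unknown:
    return E;
  }
  return E;
}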
@@ -2828,20 +2831,20 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, if (!IncConst || !isAddressUse(UserInst, Operand)) return false; - if (IncConst->getValue()->getValue().getMinSignedBits() > 64) + if (IncConst->getAPInt().getMinSignedBits() > 64) return false; + MemAccessTy AccessTy = getAccessType(UserInst); int64_t IncOffset = IncConst->getValue()->getSExtValue(); - if (!isAlwaysFoldable(TTI, LSRUse::Address, - getAccessType(UserInst), /*BaseGV=*/ nullptr, - IncOffset, /*HaseBaseReg=*/ false)) + if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, + IncOffset, /*HasBaseReg=*/false)) return false; return true; } -/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to -/// materialize the IV user's operand from the previous IV user's operand. +/// Generate an add or subtract for each IVInc in a chain to materialize the IV +/// user's operand from the previous IV user's operand. void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced @@ -2961,7 +2964,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = U.getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - Type *AccessTy = nullptr; + MemAccessTy AccessTy; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -3027,9 +3030,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { DEBUG(print_fixups(dbgs())); } -/// InsertInitialFormula - Insert a formula for the given expression into -/// the given use, separating out loop-variant portions from loop-invariant -/// and loop-computable portions. +/// Insert a formula for the given expression into the given use, separating out +/// loop-variant portions from loop-invariant and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { // Mark uses whose expressions cannot be expanded. @@ -3037,13 +3039,13 @@ LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { LU.RigidFormula = true; Formula F; - F.InitialMatch(S, L, SE); + F.initialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } -/// InsertSupplementalFormula - Insert a simple single-register formula for -/// the given expression into the given use. +/// Insert a simple single-register formula for the given expression into the +/// given use. void LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { @@ -3054,17 +3056,16 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } -/// CountRegisters - Note which registers are used by the given formula, -/// updating RegUses. +/// Note which registers are used by the given formula, updating RegUses. void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { if (F.ScaledReg) - RegUses.CountRegister(F.ScaledReg, LUIdx); + RegUses.countRegister(F.ScaledReg, LUIdx); for (const SCEV *BaseReg : F.BaseRegs) - RegUses.CountRegister(BaseReg, LUIdx); + RegUses.countRegister(BaseReg, LUIdx); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true.
Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { // Do not insert formula that we will not be able to expand. assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && @@ -3076,9 +3077,9 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { return true; } -/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of -/// loop-invariant values which we're tracking. These other uses will pin these -/// values in registers, making them less profitable for elimination. +/// Check for other uses of loop-invariant values which we're tracking. These +/// other uses will pin these values in registers, making them less profitable +/// for elimination. /// TODO: This currently misses non-constant addrec step registers. /// TODO: Should this give more weight to users inside the loop? void @@ -3124,6 +3125,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { PHINode::getIncomingValueNumForOperand(U.getOperandNo())); if (!DT.dominates(L->getHeader(), UseBB)) continue; + // Don't bother if the instruction is in a BB which ends in an EHPad. + if (UseBB->getTerminator()->isEHPad()) + continue; // Ignore uses which are part of other SCEV expressions, to avoid // analyzing them multiple times. if (SE.isSCEVable(UserInst->getType())) { @@ -3148,7 +3152,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = const_cast<Instruction *>(UserInst); LF.OperandValToReplace = U; - std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); + std::pair<size_t, int64_t> P = getUse( + S, LSRUse::Basic, MemAccessTy()); LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; @@ -3165,8 +3170,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } } -/// CollectSubexprs - Split S into subexpressions which can be pulled out into -/// separate registers. If C is non-null, multiply each subexpression by C. +/// Split S into subexpressions which can be pulled out into separate +/// registers. If C is non-null, multiply each subexpression by C. /// /// Return remainder expression after factoring the subexpressions captured by /// Ops. If Ops is complete, return NULL. @@ -3300,7 +3305,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the // formula accordingly. - F.Canonicalize(); + F.canonicalize(); if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like @@ -3309,8 +3314,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, } } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. +/// Split out subexpressions from adds and the bases of addrecs. void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth) { assert(Base.isCanonical() && "Input must be in the canonical form"); @@ -3326,8 +3330,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, /* Idx */ -1, /* IsScaledReg */ true); } -/// GenerateCombinations - Generate a formula consisting of all of the -/// loop-dominating registers added into a single register. +/// Generate a formula consisting of all of the loop-dominating registers added +/// into a single register. 
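InsertFormula's contract, insert only if new and tell the caller whether recursion is worthwhile, is what drives the generators above. A hedged sketch of that uniquifier pattern, with toy types in place of LLVM's Formula and LSRUse:

#include <set>
#include <tuple>
#include <vector>

struct ToyFormula {
  std::vector<int> BaseRegs; // Stand-ins for the formula's register SCEVs.
  int Scale = 0;
  bool operator<(const ToyFormula &O) const {
    return std::tie(BaseRegs, Scale) < std::tie(O.BaseRegs, O.Scale);
  }
};

struct ToyUse {
  std::set<ToyFormula> Uniquifier;
  std::vector<ToyFormula> Formulae;

  // Add the formula only if it is new, and report which case occurred so
  // generators recurse only on first insertion.
  bool insertFormula(const ToyFormula &F) {
    if (!Uniquifier.insert(F).second)
      return false;
    Formulae.push_back(F);
    return true;
  }
};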
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. @@ -3336,7 +3340,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before // processing the formula. - Base.Unscale(); + Base.unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector<const SCEV *, 4> Ops; @@ -3354,7 +3358,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); - F.Canonicalize(); + F.canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } @@ -3379,7 +3383,7 @@ void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, (void)InsertFormula(LU, LUIdx, F); } -/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. +/// Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. @@ -3410,8 +3414,8 @@ void LSRInstance::GenerateConstantOffsetsImpl( F.Scale = 0; F.ScaledReg = nullptr; } else - F.DeleteBaseReg(F.BaseRegs[Idx]); - F.Canonicalize(); + F.deleteBaseReg(F.BaseRegs[Idx]); + F.canonicalize(); } else if (IsScaledReg) F.ScaledReg = NewG; else @@ -3452,8 +3456,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, /* IsScaledReg */ true); } -/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up -/// the comparison. For example, x == y -> x*c == y*c. +/// For ICmpZero, check to see if we can scale up the comparison. For example, x +/// == y -> x*c == y*c. void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (LU.Kind != LSRUse::ICmpZero) return; @@ -3538,8 +3542,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, } } -/// GenerateScales - Generate stride factor reuse formulae by making use of -/// scaled-offset address modes, for example. +/// Generate stride factor reuse formulae by making use of scaled-offset address +/// modes, for example. void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // Determine the integer type for the base formula. Type *IntTy = Base.getType(); @@ -3547,10 +3551,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // If this Formula already has a scaled register, we can't add another one. // Try to unscale the formula to generate a better scale. - if (Base.Scale != 0 && !Base.Unscale()) + if (Base.Scale != 0 && !Base.unscale()) return; - assert(Base.Scale == 0 && "Unscale did not did its job!"); + assert(Base.Scale == 0 && "unscale did not do its job!"); // Check each interesting stride. for (int64_t Factor : Factors) { @@ -3587,7 +3591,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: This could be optimized to avoid all the copying. Formula F = Base; F.ScaledReg = Quotient; - F.DeleteBaseReg(F.BaseRegs[i]); + F.deleteBaseReg(F.BaseRegs[i]); // The canonical representation of 1*reg is reg, which is already in // Base. In that case, do not try to insert the formula, it will be // rejected anyway. @@ -3599,7 +3603,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { } } -/// GenerateTruncates - Generate reuse formulae from different IV types. +/// Generate reuse formulae from different IV types.
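GenerateScales above only moves a register into the scaled position when the interesting factor divides it exactly (the getExactSDiv gate), and rejects 1*reg because plain reg is already canonical. A toy model of that gate, with a plain coefficient standing in for an AddRec register (an assumption of the sketch, not LLVM's Formula):

#include <cstdint>
#include <optional>

struct ToyForm {
  int64_t RegCoeff; // Coefficient standing in for an AddRec base register.
  int64_t Scale;    // 0 means no scaled register yet.
};

// A factor becomes a scale only when it divides the register exactly.
static std::optional<ToyForm> tryApplyFactor(ToyForm F, int64_t Factor) {
  if (F.Scale != 0 || Factor == 0 || Factor == 1)
    return std::nullopt; // Already scaled, or nothing gained from 1*reg.
  if (F.RegCoeff % Factor != 0)
    return std::nullopt; // Division must be exact, as with getExactSDiv.
  return ToyForm{F.RegCoeff / Factor, Factor};
}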
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { // Don't bother truncating symbolic values. if (Base.BaseGV) return; @@ -3629,9 +3633,9 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { namespace { -/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to -/// defer modifications so that the search phase doesn't have to worry about -/// the data structures moving underneath it. +/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer +/// modifications so that the search phase doesn't have to worry about the data +/// structures moving underneath it. struct WorkItem { size_t LUIdx; int64_t Imm; @@ -3651,14 +3655,13 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void WorkItem::dump() const { print(errs()); errs() << '\n'; } -#endif -/// GenerateCrossUseConstantOffsets - Look for registers which are a constant -/// distance apart and try to form reuse opportunities between them. +/// Look for registers which are a constant distance apart and try to form reuse +/// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. typedef std::map<int64_t, const SCEV *> ImmMapTy; @@ -3751,7 +3754,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // very similar but slightly different. Investigate if they // could be merged. That way, we would not have to unscale the // Formula. - F.Unscale(); + F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3770,14 +3773,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) - if (C->getValue()->isNegative() != - (NewF.BaseOffset < 0) && - (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) - .ule(std::abs(NewF.BaseOffset))) + if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && + (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) + .ule(std::abs(NewF.BaseOffset))) continue; // OK, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3801,15 +3803,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) - if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( - std::abs(NewF.BaseOffset)) && - (C->getValue()->getValue() + - NewF.BaseOffset).countTrailingZeros() >= - countTrailingZeros<uint64_t>(NewF.BaseOffset)) + if ((C->getAPInt() + NewF.BaseOffset) + .abs() + .slt(std::abs(NewF.BaseOffset)) && + (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >= + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3819,7 +3821,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { } } -/// GenerateAllReuseFormulae - Generate formulae for each use. +/// Generate formulae for each use. 
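GenerateCrossUseConstantOffsets above starts by stripping each register's constant offset and bucketing registers by the remaining base; registers sharing a bucket differ only by a constant distance and are candidates for reuse. A standalone sketch of that grouping step, using (BaseId, Imm) pairs in place of SCEVs:

#include <cstdint>
#include <map>
#include <vector>

struct ToyReg {
  int BaseId;  // Stand-in for the SCEV base once the offset is stripped.
  int64_t Imm; // The extracted constant offset.
};

using ImmMapTy = std::map<int64_t, ToyReg>;

// Bucket registers by base; entries sharing a bucket are reuse candidates
// at a constant distance from one another.
static std::map<int, ImmMapTy> groupByBase(const std::vector<ToyReg> &Regs) {
  std::map<int, ImmMapTy> Groups;
  for (const ToyReg &R : Regs)
    Groups[R.BaseId].emplace(R.Imm, R);
  return Groups;
}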
void LSRInstance::GenerateAllReuseFormulae() { // This is split into multiple loops so that hasRegsUsedByUsesOtherThan @@ -3959,10 +3961,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // This is a rough guess that seems to work fairly well. static const size_t ComplexityLimit = UINT16_MAX; -/// EstimateSearchSpaceComplexity - Estimate the worst-case number of -/// solutions the solver might have to consider. It almost never considers -/// this many solutions because it prune the search space, but the pruning -/// isn't always sufficient. +/// Estimate the worst-case number of solutions the solver might have to +/// consider. It almost never considers this many solutions because it prunes the +/// search space, but the pruning isn't always sufficient. size_t LSRInstance::EstimateSearchSpaceComplexity() const { size_t Power = 1; for (const LSRUse &LU : Uses) { @@ -3978,10 +3979,9 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const { return Power; } -/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset -/// of the registers of another formula, it won't help reduce register -/// pressure (though it may not necessarily hurt register pressure); remove -/// it to simplify the system. +/// When one formula uses a superset of the registers of another formula, it +/// won't help reduce register pressure (though it may not necessarily hurt +/// register pressure); remove it to simplify the system. void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -4042,9 +4042,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { } } -/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers -/// for expressions like A, A+1, A+2, etc., allocate a single register for -/// them. +/// When there are many registers for expressions like A, A+1, A+2, etc., +/// allocate a single register for them. void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; @@ -4121,8 +4120,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } -/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call -/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that +/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that /// we've done more filtering, as it may be able to find more formulae to /// eliminate. void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ @@ -4139,9 +4137,9 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ } } -/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely -/// to be profitable, and then in any use which has any reference to that -/// register, delete all formulae which do not reference that register. +/// Pick a register which seems likely to be profitable, and then in any use +/// which has any reference to that register, delete all formulae which do not +/// reference that register. void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // With all other options exhausted, loop until the system is simple // enough to handle.
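EstimateSearchSpaceComplexity above multiplies the per-use formula counts, since the solver picks one formula per use, and saturates at ComplexityLimit so the narrowing heuristics can trigger early. A sketch of that saturating product (the counts are assumed inputs):

#include <cstddef>
#include <cstdint>
#include <vector>

static const std::size_t ComplexityLimit = UINT16_MAX;

// Worst case, the solver tries every combination of one formula per use, so
// the estimate is a product, saturated once it stops being informative.
static std::size_t
estimateSearchSpace(const std::vector<std::size_t> &FormulaCounts) {
  std::size_t Power = 1;
  for (std::size_t N : FormulaCounts) {
    if (N == 0)
      continue;
    Power *= N;
    if (Power >= ComplexityLimit)
      return ComplexityLimit;
  }
  return Power;
}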
@@ -4202,10 +4200,10 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } -/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of -/// formulae to choose from, use some rough heuristics to prune down the number -/// of formulae. This keeps the main solver from taking an extraordinary amount -/// of time in some worst-case scenarios. +/// If there are an extraordinary number of formulae to choose from, use some +/// rough heuristics to prune down the number of formulae. This keeps the main +/// solver from taking an extraordinary amount of time in some worst-case +/// scenarios. void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByDetectingSupersets(); NarrowSearchSpaceByCollapsingUnrolledCode(); @@ -4213,7 +4211,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByPickingWinnerRegs(); } -/// SolveRecurse - This is the recursive solver. +/// This is the recursive solver. void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, Cost &SolutionCost, SmallVectorImpl<const Formula *> &Workspace, @@ -4291,8 +4289,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, } } -/// Solve - Choose one formula from each use. Return the results in the given -/// Solution vector. +/// Choose one formula from each use. Return the results in the given Solution +/// vector. void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { SmallVector<const Formula *, 8> Workspace; Cost SolutionCost; @@ -4326,10 +4324,9 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { assert(Solution.size() == Uses.size() && "Malformed solution!"); } -/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up -/// the dominator tree far as we can go while still being dominated by the -/// input positions. This helps canonicalize the insert position, which -/// encourages sharing. +/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far +/// as we can go while still being dominated by the input positions. This helps +/// canonicalize the insert position, which encourages sharing. BasicBlock::iterator LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, const SmallVectorImpl<Instruction *> &Inputs) @@ -4365,21 +4362,21 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || !DT.dominates(Inst, BetterPos))) - BetterPos = std::next(BasicBlock::iterator(Inst)); + BetterPos = &*std::next(BasicBlock::iterator(Inst)); } if (!AllDominate) break; if (BetterPos) - IP = BetterPos; + IP = BetterPos->getIterator(); else - IP = Tentative; + IP = Tentative->getIterator(); } return IP; } -/// AdjustInsertPositionForExpand - Determine an input position which will be -/// dominated by the operands and which will dominate the result. +/// Determine an input position which will be dominated by the operands and +/// which will dominate the result.
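HoistInsertPosition's climb above walks up immediate dominators while every input still dominates the candidate block, which canonicalizes the insert position so equivalent expansions get shared. A toy model with hand-rolled parent links and a dominates() helper, not LLVM's DominatorTree API:

#include <vector>

struct Block {
  Block *IDom = nullptr; // Immediate dominator; null at the root.
};

static bool dominates(const Block *A, const Block *B) {
  for (const Block *X = B; X; X = X->IDom)
    if (X == A)
      return true;
  return false;
}

// Climb immediate dominators while every input still dominates the
// candidate, yielding one canonical (and maximally shared) position.
static Block *hoistPosition(Block *IP, const std::vector<Block *> &Inputs) {
  for (Block *Up = IP->IDom; Up; Up = Up->IDom) {
    bool AllDominate = true;
    for (const Block *In : Inputs)
      if (!dominates(In, Up)) {
        AllDominate = false;
        break;
      }
    if (!AllDominate)
      break;
    IP = Up;
  }
  return IP;
}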
BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, const LSRFixup &LF, @@ -4417,7 +4414,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, } } - assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) + assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() && !isa<DbgInfoIntrinsic>(LowestIP) && "Insertion point must be a normal instruction"); @@ -4429,7 +4426,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, while (isa<PHINode>(IP)) ++IP; // Ignore landingpad instructions. - while (isa<LandingPadInst>(IP)) ++IP; + while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP; // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -4437,13 +4434,14 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, // Set IP below instructions recently inserted by SCEVExpander. This keeps the // IP consistent across expansions and allows the previously inserted // instructions to be reused by subsequent expansion. - while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; + while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP) + ++IP; return IP; } -/// Expand - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"). +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"). Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F, BasicBlock::iterator IP, @@ -4487,7 +4485,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); } // Expand the ScaledReg portion. @@ -4505,14 +4503,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4522,11 +4520,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4538,7 +4536,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. 
if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4548,7 +4546,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4584,7 +4582,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); @@ -4626,15 +4624,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, return FullV; } -/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use -/// of their operands effectively happens in their predecessor blocks, so the -/// expression may need to be expanded in multiple places. +/// Helper for Rewrite. PHI nodes are special because the use of their operands +/// effectively happens in their predecessor blocks, so the expression may need +/// to be expanded in multiple places. void LSRInstance::RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { DenseMap<BasicBlock *, Value *> Inserted; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == LF.OperandValToReplace) { @@ -4658,8 +4655,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, .setDontDeleteUselessPHIs()); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, - /*AliasAnalysis*/ nullptr, &DT, &LI); + SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI); NewBB = NewBBs[0]; } // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4685,7 +4681,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN, if (!Pair.second) PN->setIncomingValue(i, Pair.first->second); else { - Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); + Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(), + Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4702,20 +4699,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, } } -/// Rewrite - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"), and update the UserInst to reference -/// the newly expanded value. +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"), and update the UserInst to reference the newly +/// expanded value. void LSRInstance::Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { // First, find an insertion point that dominates UserInst. For PHI nodes, // find the nearest block which dominates all the relevant uses. 
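RewriteForPHI above expands the expression at most once per incoming block and reuses the cached value for further incoming edges from the same block, which is what its Inserted map is for. A small sketch of that memoization with toy types (string block names and integer ids standing in for BasicBlock* and Value*):

#include <map>
#include <string>

// Stand-in for Expand() emitting code before a block's terminator; the
// returned id models the expanded Value*.
static int expandAt(const std::string &) {
  static int NextId = 0;
  return ++NextId;
}

// Expand at most once per predecessor block; later incoming edges from the
// same block reuse the cached expansion.
static int expandOncePerBlock(std::map<std::string, int> &Inserted,
                              const std::string &BlockName) {
  auto It = Inserted.find(BlockName);
  if (It != Inserted.end())
    return It->second;
  int Id = expandAt(BlockName);
  Inserted.emplace(BlockName, Id);
  return Id;
}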
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { - RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); + RewriteForPHI(PN, LF, F, Rewriter, DeadInsts); } else { - Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); + Value *FullV = + Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4740,11 +4737,10 @@ void LSRInstance::Rewrite(const LSRFixup &LF, DeadInsts.emplace_back(LF.OperandValToReplace); } -/// ImplementSolution - Rewrite all the fixup locations with new values, -/// following the chosen solution. -void -LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P) { +/// Rewrite all the fixup locations with new values, following the chosen +/// solution. +void LSRInstance::ImplementSolution( + const SmallVectorImpl<const Formula *> &Solution) { // Keep track of instructions we may have made dead, so that // we can remove them after we are done working. SmallVector<WeakVH, 16> DeadInsts; @@ -4766,7 +4762,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // Expand the new value definitions and update the users. for (const LSRFixup &Fixup : Fixups) { - Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); + Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts); Changed = true; } @@ -4782,13 +4778,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(Loop *L, Pass *P) - : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), - LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), - TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *L->getHeader()->getParent())), - L(L), Changed(false), IVIncInsertPos(nullptr) { +LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, + DominatorTree &DT, LoopInfo &LI, + const TargetTransformInfo &TTI) + : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false), + IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4879,7 +4873,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) #endif // Now that we've decided what we want, make it so. 
- ImplementSolution(Solution, P); + ImplementSolution(Solution); } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -4931,11 +4925,10 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRInstance::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { @@ -4956,7 +4949,7 @@ INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -4982,8 +4975,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); // Requiring LoopSimplify a second time here prevents IVUsers from running // twice, since LoopSimplify was invalidated by running ScalarEvolution. AU.addRequiredID(LoopSimplifyID); @@ -4996,17 +4989,24 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { if (skipOptnoneFunction(L)) return false; + auto &IU = getAnalysis<IVUsers>(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()); bool Changed = false; // Run the main LSR transformation. - Changed |= LSRInstance(L, this).getChanged(); + Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged(); // Remove any extra phis created by processing inner loops. Changed |= DeleteDeadPHIs(L->getHeader()); if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakVH, 16> DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr"); + SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL, + "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index d78db6c369b3..56ae5c010411 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -130,27 +131,29 @@ namespace { bool UserAllowPartial; bool UserRuntime; - bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
/// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. // For now, recreate dom info, if loop is unrolled. AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } // Fill in the UnrollingPreferences parameter with values from the @@ -186,7 +189,7 @@ namespace { // total unrolled size. Parameters Threshold and PartialThreshold // are set to the maximum unrolled size for fully and partially // unrolled loops respectively. - void selectThresholds(const Loop *L, bool HasPragma, + void selectThresholds(const Loop *L, bool UsePragmaThreshold, const TargetTransformInfo::UnrollingPreferences &UP, unsigned &Threshold, unsigned &PartialThreshold, unsigned &PercentDynamicCostSavedThreshold, @@ -207,12 +210,13 @@ namespace { : UP.DynamicCostSavingsDiscount; if (!UserThreshold && + // FIXME: Use Function::optForSize(). L->getHeader()->getParent()->hasFnAttribute( Attribute::OptimizeForSize)) { Threshold = UP.OptSizeThreshold; PartialThreshold = UP.PartialOptSizeThreshold; } - if (HasPragma) { + if (UsePragmaThreshold) { // If the loop has an unrolling pragma, we want to be more // aggressive with unrolling limits. Set thresholds to at // least the PragmaTheshold value which is larger than the @@ -235,10 +239,11 @@ char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, @@ -278,8 +283,8 @@ class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> { public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap<Value *, Constant *> &SimplifiedValues, - const Loop *L, ScalarEvolution &SE) - : Iteration(Iteration), SimplifiedValues(SimplifiedValues), L(L), SE(SE) { + ScalarEvolution &SE) + : SimplifiedValues(SimplifiedValues), SE(SE) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -295,13 +300,6 @@ private: /// results saved. DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; - /// \brief Number of currently simulated iteration. - /// - /// If an expression is ConstAddress+Constant, then the Constant is - /// Start + Iteration*Step, where Start and Step could be obtained from - /// SCEVGEPCache. - unsigned Iteration; - /// \brief SCEV expression corresponding to number of currently simulated /// iteration. 
const SCEV *IterationNumber; @@ -316,7 +314,6 @@ private: /// post-unrolling. DenseMap<Value *, Constant *> &SimplifiedValues; - const Loop *L; ScalarEvolution &SE; /// \brief Try to simplify instruction \param I using its SCEV expression. @@ -368,11 +365,9 @@ private: return simplifyInstWithSCEV(&I); } - /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt. - /// Try to simplify binary operator I. /// - /// TODO: Probaly it's worth to hoist the code for estimating the + /// TODO: Probably it's worth to hoist the code for estimating the /// simplifications effects to a separate class, since we have a very similar /// code in InlineCost already. bool visitBinaryOperator(BinaryOperator &I) { @@ -412,7 +407,7 @@ private: auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base); // We're only interested in loads that can be completely folded to a // constant. - if (!GV || !GV->hasInitializer()) + if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant()) return false; ConstantDataSequential *CDS = @@ -420,6 +415,12 @@ private: if (!CDS) return false; + // We might have a vector load from an array. FIXME: for now we just bail + // out in this case, but we should be able to resolve and simplify such + // loads. + if(!CDS->isElementTypeCompatible(I.getType())) + return false; + int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && "Unexpectedly large index value."); @@ -436,6 +437,59 @@ private: return true; } + + bool visitCastInst(CastInst &I) { + // Propagate constants through casts. + Constant *COp = dyn_cast<Constant>(I.getOperand(0)); + if (!COp) + COp = SimplifiedValues.lookup(I.getOperand(0)); + if (COp) + if (Constant *C = + ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { + SimplifiedValues[&I] = C; + return true; + } + + return Base::visitCastInst(I); + } + + bool visitCmpInst(CmpInst &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + // First try to handle simplified comparisons. + if (!isa<Constant>(LHS)) + if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) + LHS = SimpleLHS; + if (!isa<Constant>(RHS)) + if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) + RHS = SimpleRHS; + + if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) { + auto SimplifiedLHS = SimplifiedAddresses.find(LHS); + if (SimplifiedLHS != SimplifiedAddresses.end()) { + auto SimplifiedRHS = SimplifiedAddresses.find(RHS); + if (SimplifiedRHS != SimplifiedAddresses.end()) { + SimplifiedAddress &LHSAddr = SimplifiedLHS->second; + SimplifiedAddress &RHSAddr = SimplifiedRHS->second; + if (LHSAddr.Base == RHSAddr.Base) { + LHS = LHSAddr.Offset; + RHS = RHSAddr.Offset; + } + } + } + } + + if (Constant *CLHS = dyn_cast<Constant>(LHS)) { + if (Constant *CRHS = dyn_cast<Constant>(RHS)) { + if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { + SimplifiedValues[&I] = C; + return true; + } + } + } + + return Base::visitCmpInst(I); + } }; } // namespace @@ -443,11 +497,11 @@ private: namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. - unsigned UnrolledCost; + int UnrolledCost; /// \brief The estimated dynamic cost of executing the instructions in the /// rolled form. - unsigned RolledDynamicCost; + int RolledDynamicCost; }; } @@ -464,10 +518,10 @@ struct EstimatedUnrollCost { /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. 
If /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. -Optional<EstimatedUnrollCost> -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { +static Optional<EstimatedUnrollCost> +analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + int MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -481,24 +535,61 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, SmallSetVector<BasicBlock *, 16> BBWorklist; DenseMap<Value *, Constant *> SimplifiedValues; + SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. - unsigned UnrolledCost = 0; + int UnrolledCost = 0; // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic // instructions due to the simplifications which we expect to occur after // unrolling. - unsigned RolledDynamicCost = 0; + int RolledDynamicCost = 0; + + // Ensure that we don't violate the loop structure invariants relied on by + // this analysis. + assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); + assert(L->isLCSSAForm(DT) && + "Must have loops in LCSSA form to track live-out values."); + + DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n"); // Simulate execution of each iteration of the loop counting instructions, // which would be simplified. // Since the same load will take different values on different iterations, // we literally have to go through all loop's iterations. for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { + DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); + + // Prepare for the iteration by collecting any simplified entry or backedge + // inputs. + for (Instruction &I : *L->getHeader()) { + auto *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + + // The loop header PHI nodes must have exactly two inputs: one from the + // loop preheader and one from the loop latch. + assert( + PHI->getNumIncomingValues() == 2 && + "Must have an incoming value only for the preheader and the latch."); + + Value *V = PHI->getIncomingValueForBlock( + Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Constant *C = dyn_cast<Constant>(V); + if (Iteration != 0 && !C) + C = SimplifiedValues.lookup(V); + if (C) + SimplifiedInputValues.push_back({PHI, C}); + } + + // Now clear and re-populate the map for the next iteration. SimplifiedValues.clear(); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE); + while (!SimplifiedInputValues.empty()) + SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -510,21 +601,67 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // it.
We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - unsigned InstCost = TTI.getUserCost(&I); + int InstCost = TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, // and if the visitor returns false, include this instruction in the // unrolled cost. if (!Analyzer.visit(I)) UnrolledCost += InstCost; + else { + DEBUG(dbgs() << " " << I + << " would be simplified if loop is unrolled.\n"); + (void)0; + } // Also track this instructions expected cost when executing the rolled // loop form. RolledDynamicCost += InstCost; // If unrolled body turns out to be too big, bail out. - if (UnrolledCost > MaxUnrolledLoopSize) + if (UnrolledCost > MaxUnrolledLoopSize) { + DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" + << " UnrolledCost: " << UnrolledCost + << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize + << "\n"); return None; + } + } + + TerminatorInst *TI = BB->getTerminator(); + + // Add in the live successors by first checking whether we have terminator + // that may be simplified based on the values simplified by this call. + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isConditional()) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(BI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = BI->getSuccessor(0); + else + Succ = BI->getSuccessor( + cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(SI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = SI->getSuccessor(0); + else + Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond)) + .getCaseSuccessor(); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } } // Add BB's successors to the worklist. @@ -535,9 +672,15 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. - if (UnrolledCost == RolledDynamicCost) + if (UnrolledCost == RolledDynamicCost) { + DEBUG(dbgs() << " No opportunities found.. exiting.\n" + << " UnrolledCost: " << UnrolledCost << "\n"); return None; + } } + DEBUG(dbgs() << "Analysis finished:\n" + << "UnrolledCost: " << UnrolledCost << ", " + << "RolledDynamicCost: " << RolledDynamicCost << "\n"); return {{UnrolledCost, RolledDynamicCost}}; } @@ -583,6 +726,12 @@ static bool HasUnrollFullPragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full"); } +// Returns true if the loop has an unroll(enable) pragma. This metadata is used +// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives. +static bool HasUnrollEnablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable"); +} + // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); @@ -708,7 +857,7 @@ unsigned LoopUnroll::selectUnrollCount( unsigned Count = UserCount ? CurrentCount : 0; // If there is no user-specified count, unroll pragmas have the next - // highest precendence. + // highest precedence. 
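The live-successor handling added above is what lets whole dead regions fall out of the unrolled cost: when a branch or switch condition folds to a constant for the current iteration, only the taken successor is enqueued. A standalone sketch of the selection step (toy types; the real code inspects BranchInst and SwitchInst):

#include <optional>

struct ToyBranch {
  int TrueSucc;
  int FalseSucc;
};

// Return the only live successor when the condition folded to a constant,
// or -1 when it stayed opaque and every successor must be enqueued.
static int liveSuccessor(const ToyBranch &BI, std::optional<bool> FoldedCond) {
  if (!FoldedCond)
    return -1;
  return *FoldedCond ? BI.TrueSucc : BI.FalseSucc;
}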
if (Count == 0) { if (PragmaCount) { Count = PragmaCount; @@ -737,17 +886,19 @@ unsigned LoopUnroll::selectUnrollCount( return Count; } -bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; Function &F = *L->getHeader()->getParent(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -757,8 +908,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } bool PragmaFullUnroll = HasUnrollFullPragma(L); + bool PragmaEnableUnroll = HasUnrollEnablePragma(L); unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = PragmaFullUnroll || PragmaCount > 0; + bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; getUnrollingPreferences(L, TTI, UP); @@ -806,7 +958,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { unsigned Threshold, PartialThreshold; unsigned PercentDynamicCostSavedThreshold; unsigned DynamicCostSavingsDiscount; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, + // Only use the high pragma threshold when we have a target unroll factor such + // as with "#pragma unroll N" or a pragma indicating full unrolling and the + // trip count is known. Otherwise we rely on the standard threshold to + // heuristically select a reasonable unroll count. + bool UsePragmaThreshold = + PragmaCount > 0 || + ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0); + + selectThresholds(L, UsePragmaThreshold, UP, Threshold, PartialThreshold, PercentDynamicCostSavedThreshold, DynamicCostSavingsDiscount); @@ -824,8 +984,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // The loop isn't that small, but we still can fully unroll it if that // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. - if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount)) + if (Optional<EstimatedUnrollCost> Cost = + analyzeLoopUnrollCost(L, TripCount, DT, *SE, TTI, + Threshold + DynamicCostSavingsDiscount)) if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold, DynamicCostSavingsDiscount, Cost->UnrolledCost, Cost->RolledDynamicCost)) { @@ -840,14 +1001,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Reduce count based on the type of unrolling and the threshold values. unsigned OriginalCount = Count; - bool AllowRuntime = - (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime); + bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || + (UserRuntime ? CurrentRuntime : UP.Runtime); // Don't unroll a runtime trip count loop with unroll full pragma. if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { AllowRuntime = false; } if (Unrolling == Partial) { - bool AllowPartial = UserAllowPartial ? 
CurrentAllowPartial : UP.Partial; + bool AllowPartial = PragmaEnableUnroll || + (UserAllowPartial ? CurrentAllowPartial : UP.Partial); if (!AllowPartial && !CountSetExplicitly) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); @@ -887,23 +1049,27 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); - if (PragmaFullUnroll && PragmaCount == 0) { - if (TripCount && Count != TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because unrolled size is too large."); - } else if (!TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); - } - } else if (PragmaCount > 0 && Count != OriginalCount) { + if ((PragmaCount > 0) && Count != OriginalCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, "Unable to unroll loop the number of times directed by " "unroll_count pragma because unrolled size is too large."); + } else if (PragmaFullUnroll && !TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop as directed by unroll(enable) pragma because " + "unrolled size is too large."); + } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll pragma because " + "unrolled size is too large."); } } @@ -915,7 +1081,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Unroll the loop. 
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, - TripMultiple, LI, this, &LPM, &AC)) + TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA)) return false; return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index cbc563bd8998..95d7f8a3beda 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -37,6 +38,10 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -70,6 +75,19 @@ static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden); +static cl::opt<bool> +LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency", + cl::init(false), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to access PGO " + "heuristics to minimize code growth in cold regions.")); + +static cl::opt<unsigned> +ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden, + cl::desc("Coldness threshold in percentage. The loop header frequency " + "(relative to the entry frequency) is compared with this " + "threshold to determine if non-trivial unswitching should be " + "enabled.")); + namespace { class LUAnalysisCache { @@ -148,12 +166,19 @@ namespace { LPPassManager *LPM; AssumptionCache *AC; - // LoopProcessWorklist - Used to check if second loop needs processing - // after RewriteLoopBodyWithConditionConstant rewrites first loop. + // Used to check if second loop needs processing after + // RewriteLoopBodyWithConditionConstant rewrites first loop. std::vector<Loop*> LoopProcessWorklist; LUAnalysisCache BranchesInfo; + bool EnabledPGO; + + // BFI and ColdEntryFreq are only used when PGO and + // LoopUnswitchWithBlockFrequency are enabled. + BlockFrequencyInfo BFI; + BlockFrequency ColdEntryFreq; + bool OptimizeForSize; bool redoLoop; @@ -192,9 +217,11 @@ namespace { AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: @@ -210,7 +237,10 @@ namespace { /// Split all of the edges from inside the loop to their exit blocks. /// Update the appropriate Phi nodes as we do so. 
- void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks); + void SplitExitEdges(Loop *L, + const SmallVectorImpl<BasicBlock *> &ExitBlocks); + + bool TryTrivialLoopUnswitch(bool &Changed); bool UnswitchIfProfitable(Value *LoopCond, Constant *Val, TerminatorInst *TI = nullptr); @@ -229,9 +259,6 @@ TerminatorInst *TI); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); - bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr, - BasicBlock **LoopExit = nullptr); - }; } @@ -367,9 +394,8 @@ Pass *llvm::createLoopUnswitchPass(bool Os) { return new LoopUnswitch(Os); } -/// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is -/// invariant in the loop, or has an invariant piece, return the invariant. -/// Otherwise, return null. +/// Cond is a condition that occurs in L. If it is invariant in the loop, or has +/// an invariant piece, return the invariant. Otherwise, return null. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { // We have started analyzing a new instruction; increment the scanned instructions counter. @@ -411,11 +437,23 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { *L->getHeader()->getParent()); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); LPM = &LPM_Ref; - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); currentLoop = L; Function *F = currentLoop->getHeader()->getParent(); + + EnabledPGO = F->getEntryCount().hasValue(); + + if (LoopUnswitchWithBlockFrequency && EnabledPGO) { + BranchProbabilityInfo BPI(*F, *LI); + BFI.calculate(*L->getHeader()->getParent(), BPI, *LI); + + // Use BranchProbability to compute a minimum frequency based on + // function entry baseline frequency. Loops with headers below this + // frequency are considered cold. + const BranchProbability ColdProb(ColdnessThreshold, 100); + ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb; + } + bool Changed = false; do { assert(currentLoop->isLCSSAForm(*DT)); @@ -423,16 +461,13 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { Changed |= processCurrentLoop(); } while(redoLoop); - if (Changed) { - // FIXME: Reconstruct dom info, because it is not preserved properly. - if (DT) - DT->recalculate(*F); - } + // FIXME: Reconstruct dom info, because it is not preserved properly. + if (Changed) + DT->recalculate(*F); return Changed; } -/// processCurrentLoop - Do actual work and unswitch loop if possible -/// and profitable. +/// Do actual work and unswitch loop if possible and profitable. bool LoopUnswitch::processCurrentLoop() { bool Changed = false; @@ -452,14 +487,48 @@ bool LoopUnswitch::processCurrentLoop() { LLVMContext &Context = loopHeader->getContext(); - // Probably we reach the quota of branches for this loop. If so - // stop unswitching. + // Analyze loop cost, and stop unswitching if the loop content cannot be duplicated. if (!BranchesInfo.countLoop( currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *currentLoop->getHeader()->getParent()), AC)) return false; + // Try trivial unswitch first, before looping over the other basic blocks in the loop. + if (TryTrivialLoopUnswitch(Changed)) { + return true; + } + + // Do not unswitch loops containing convergent operations, as we might be + // making them control dependent on the unswitch value when they were not + // before.
+ // FIXME: This could be refined to only bail if the convergent operation is + // not already control-dependent on the unswitch value. + for (const auto BB : currentLoop->blocks()) { + for (auto &I : *BB) { + auto CS = CallSite(&I); + if (!CS) continue; + if (CS.hasFnAttr(Attribute::Convergent)) + return false; + } + } + + // Do not do non-trivial unswitch while optimizing for size. + // FIXME: Use Function::optForSize(). + if (OptimizeForSize || + loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) + return false; + + if (LoopUnswitchWithBlockFrequency && EnabledPGO) { + // Compute the weighted frequency of the hottest block in the + // loop (loopHeader in this case since inner loops should be + // processed before outer loop). If it is less than ColdFrequency, + // we should not unswitch. + BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader); + if (LoopEntryFreq < ColdEntryFreq) + return false; + } + // Loop over all of the basic blocks in the loop. If we find an interior // block that is branching on a loop-invariant condition, we can unswitch this // loop. @@ -528,8 +597,8 @@ bool LoopUnswitch::processCurrentLoop() { return Changed; } -/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the -/// loop with no side effects (including infinite loops). +/// Check to see if all paths from BB exit the loop with no side effects +/// (including infinite loops). /// /// If true, we return true and set ExitBB to the block we /// exit through. @@ -566,9 +635,9 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, return true; } -/// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the -/// process. If so, return the block that is exited to, otherwise return null. +/// Return true if the specified block unconditionally leads to an exit from +/// the specified loop, and has no side-effects in the process. If so, return +/// the block that is exited to, otherwise return null. static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; Visited.insert(L->getHeader()); // Branches to header make infinite loops. @@ -578,105 +647,11 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { return nullptr; } -/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is -/// trivial: that is, that the condition controls whether or not the loop does -/// anything at all. If this is a trivial condition, unswitching produces no -/// code duplications (equivalently, it produces a simpler loop and a new empty -/// loop, which gets deleted). -/// -/// If this is a trivial condition, return true, otherwise return false. When -/// returning true, this sets Cond and Val to the condition that controls the -/// trivial condition: when Cond dynamically equals Val, the loop is known to -/// exit. Finally, this sets LoopExit to the BB that the loop exits to when -/// Cond == Val. -/// -bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, - BasicBlock **LoopExit) { - BasicBlock *Header = currentLoop->getHeader(); - TerminatorInst *HeaderTerm = Header->getTerminator(); - LLVMContext &Context = Header->getContext(); - - BasicBlock *LoopExitBB = nullptr; - if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { - // If the header block doesn't end with a conditional branch on Cond, we - // can't handle it. 
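(Aside: the PGO coldness gate added above can be read in isolation. A sketch reusing the BlockFrequency/BranchProbability APIs this patch already includes; the helper name is hypothetical.)

static bool headerIsCold(const BlockFrequencyInfo &BFI,
                         const BasicBlock *Header,
                         unsigned ColdnessThresholdPercent) {
  // Scale the function entry frequency down to the coldness threshold...
  const BranchProbability ColdProb(ColdnessThresholdPercent, 100);
  BlockFrequency ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
  // ...and treat any loop header running less often than that as cold,
  // so non-trivial unswitching (and its code growth) is skipped there.
  return BFI.getBlockFreq(Header) < ColdEntryFreq;
}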
- if (!BI->isConditional() || BI->getCondition() != Cond) - return false; - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(0)))) { - if (Val) *Val = ConstantInt::getTrue(Context); - } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(1)))) { - if (Val) *Val = ConstantInt::getFalse(Context); - } - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { - // If this isn't a switch on Cond, we can't handle it. - if (SI->getCondition() != Cond) return false; - - // Check to see if a successor of the switch is guaranteed to go to the - // latch block or exit through a one exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - // Note that we can't trivially unswitch on the default case or - // on already unswitched cases. - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - BasicBlock *LoopExitCandidate; - if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, - i.getCaseSuccessor()))) { - // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt *CaseVal = i.getCaseValue(); - - // Check that it was not unswitched before, since already unswitched - // trivial vals are looks trivial too. - if (BranchesInfo.isUnswitched(SI, CaseVal)) - continue; - LoopExitBB = LoopExitCandidate; - if (Val) *Val = CaseVal; - break; - } - } - } - - // If we didn't find a single unique LoopExit block, or if the loop exit block - // contains phi nodes, this isn't trivial. - if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) - return false; // Can't handle this. - - if (LoopExit) *LoopExit = LoopExitBB; - - // We already know that nothing uses any scalar values defined inside of this - // loop. As such, we just have to check to see if this loop will execute any - // side-effecting instructions (e.g. stores, calls, volatile loads) in the - // part of the loop that the code *would* execute. We already checked the - // tail, check the header now. - for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I) - if (I->mayHaveSideEffects()) - return false; - return true; -} - -/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when -/// LoopCond == Val to simplify the loop. If we decide that this is profitable, +/// We have found that we can unswitch currentLoop when LoopCond == Val to +/// simplify the loop. If we decide that this is profitable, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, TerminatorInst *TI) { - Function *F = loopHeader->getParent(); - Constant *CondVal = nullptr; - BasicBlock *ExitBlock = nullptr; - - if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { - // If the condition is trivial, always unswitch. There is no code growth - // for this case. - UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock, TI); - return true; - } - // Check to see if it would be profitable to unswitch current loop. if (!BranchesInfo.CostAllowsUnswitching()) { DEBUG(dbgs() << "NOT unswitching loop %" @@ -687,32 +662,27 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, return false; } - // Do not do non-trivial unswitch while optimizing for size. 
- if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize)) - return false; - UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI); return true; } -/// CloneLoop - Recursively clone the specified loop and all of its children, +/// Recursively clone the specified loop and all of its children, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop *New = new Loop(); - LPM->insertLoop(New, PL); + Loop &New = LPM->addLoop(PL); // Add all of the blocks in L to the new loop. for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) if (LI->getLoopFor(*I) == L) - New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); + New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); // Add all of the subloops to the new loop. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) - CloneLoop(*I, New, VM, LI, LPM); + CloneLoop(*I, &New, VM, LI, LPM); - return New; + return &New; } static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, @@ -744,15 +714,15 @@ static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, } } // fallthrough. + case LLVMContext::MD_make_implicit: case LLVMContext::MD_dbg: DstInst->setMetadata(MD.first, MD.second); } } } -/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values -/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest. Insert the -/// code immediately before InsertPt. +/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst, +/// otherwise branch to FalseDest. Insert the code immediately before InsertPt. void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, @@ -782,11 +752,11 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, SplitCriticalEdge(BI, 1, Options); } -/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable -/// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no -/// side-effects), unswitch it. This doesn't involve any code duplication, just -/// moving the conditional branch outside of the loop and updating loop info. +/// Given a loop that has a trivial unswitchable condition in it (a cond branch +/// from its header block to its latch block, where the path through the loop +/// that doesn't execute its body has no side-effects), unswitch it. This +/// doesn't involve any code duplication, just moving the conditional branch +/// outside of the loop and updating loop info. void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, BasicBlock *ExitBlock, TerminatorInst *TI) { @@ -810,7 +780,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); - BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); + BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI); // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. 
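(To make the trivial case concrete, a source-level before/after; illustrative C++ with hypothetical function names, not from the patch.)

// Before: the loop-invariant test sits inside the loop and exits with no
// side effects, which is exactly the trivially unswitchable shape.
static int sumUnlessFlag(const int *A, int N, bool Flag) {
  int S = 0;
  for (int i = 0; i < N; ++i) {
    if (Flag) break; // invariant condition, no effects on the exit path
    S += A[i];
  }
  return S;
}

// After trivial unswitching: the test runs once, outside the loop; the
// "Flag" version of the loop is empty and gets deleted, so nothing is
// duplicated.
static int sumUnlessFlagUnswitched(const int *A, int N, bool Flag) {
  if (Flag)
    return 0;
  int S = 0;
  for (int i = 0; i < N; ++i)
    S += A[i];
  return S;
}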
@@ -829,8 +799,155 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, ++NumTrivial; } -/// SplitExitEdges - Split all of the edges from inside the loop to their exit -/// blocks. Update the appropriate Phi nodes as we do so. +/// Check if the first non-constant condition starting from the loop header is +/// a trivial unswitch condition: that is, a condition that controls whether or +/// not the loop does anything at all. If it is a trivial condition, unswitching +/// produces no code duplication (equivalently, it produces a simpler loop and +/// a new empty loop, which gets deleted). Therefore, always unswitch a trivial +/// condition. +bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { + BasicBlock *CurrentBB = currentLoop->getHeader(); + TerminatorInst *CurrentTerm = CurrentBB->getTerminator(); + LLVMContext &Context = CurrentBB->getContext(); + + // If the loop header has only one reachable successor (currently via an + // unconditional branch or a constant-foldable conditional branch; we + // should also handle constant-foldable switch instructions in the + // future), we should keep looking for trivial condition candidates in + // the successor as well. An alternative is to constant fold conditions + // and merge successors into the loop header (then we only need to check the + // header's terminator). The reason for not doing this in the LoopUnswitch + // pass is that it could potentially break LoopPassManager's invariants. + // Folding dead branches could either eliminate the current loop or make + // other loops unreachable. LCSSA form might also not be preserved after + // deleting branches. The following code keeps traversing the loop header's + // successors until it finds a trivial condition candidate (a condition + // that is not a constant). Since unswitching generates branches with + // constant conditions, this scenario could be very common in practice. + SmallSet<BasicBlock*, 8> Visited; + + while (true) { + // If we exit the loop or reach a previously visited block, then + // we cannot reach any trivial condition candidates (unfoldable + // branch instructions or switch instructions) and no unswitch + // can happen. Exit and return false. + if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second) + return false; + + // Check if this loop will execute any side-effecting instructions (e.g. + // stores, calls, volatile loads) in the part of the loop that the code + // *would* execute. Check the header first. + for (Instruction &I : *CurrentBB) + if (I.mayHaveSideEffects()) + return false; + + // FIXME: add check for constant foldable switch instructions. + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + if (BI->isUnconditional()) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getTrue(Context)) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getFalse(Context)) { + CurrentBB = BI->getSuccessor(1); + } else { + // Found a trivial condition candidate: non-foldable conditional branch. + break; + } + } else { + break; + } + + CurrentTerm = CurrentBB->getTerminator(); + } + + // CondVal is the value of the condition that causes the loop to exit. + // LoopExitBB is the BasicBlock that the loop exits to when Cond == CondVal. + Constant *CondVal = nullptr; + BasicBlock *LoopExitBB = nullptr; + + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + // If this isn't branching on an invariant condition, we can't unswitch it.
+ if (!BI->isConditional()) + return false; + + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not a + // partial LIV, which could occur in an and/or) + if (!LoopCond || LoopCond != BI->getCondition()) + return false; + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any + // side-effects. If so, determine the value of Cond that causes + // it to do this. + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(0)))) { + CondVal = ConstantInt::getTrue(Context); + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(1)))) { + CondVal = ConstantInt::getFalse(Context); + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + CurrentTerm); + ++NumBranches; + return true; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { + // If this isn't switching on an invariant condition, we can't unswitch it. + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not a + // partial LIV, which could occur in an and/or) + if (!LoopCond || LoopCond != SI->getCondition()) + return false; + + // Check to see if a successor of the switch is guaranteed to go to the + // latch block or exit through one exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. + // Note that we can't trivially unswitch on the default case or + // on already unswitched cases. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *LoopExitCandidate; + if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, + i.getCaseSuccessor()))) { + // Okay, we found a trivial case, remember the value that is trivial. + ConstantInt *CaseVal = i.getCaseValue(); + + // Check that it was not unswitched before, since already unswitched + // trivial values look trivial too. + if (BranchesInfo.isUnswitched(SI, CaseVal)) + continue; + LoopExitBB = LoopExitCandidate; + CondVal = CaseVal; + break; + } + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + nullptr); + ++NumSwitches; + return true; + } + return false; +} + +/// Split all of the edges from inside the loop to their exit blocks. +/// Update the appropriate Phi nodes as we do so. void LoopUnswitch::SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks){ @@ -841,15 +958,14 @@ void LoopUnswitch::SplitExitEdges(Loop *L, // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", - /*AliasAnalysis*/ nullptr, DT, LI, + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, /*PreserveLCSSA*/ true); } } -/// UnswitchNontrivialCondition - We determined that the loop is profitable -/// to unswitch when LIC equal Val.
Split it into loop versions and test the -/// condition outside of either loop. Return the loops created as Out1/Out2. +/// We determined that the loop is profitable to unswitch when LIC equals Val. +/// Split it into loop versions and test the condition outside of either loop. +/// Return the loops created as Out1/Out2. void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, Loop *L, TerminatorInst *TI) { Function *F = loopHeader->getParent(); @@ -858,8 +974,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, << " blocks] in Function " << F->getName() << " when '" << *Val << "' == " << *LIC << "\n"); - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) - SE->forgetLoop(L); + if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) + SEWP->getSE().forgetLoop(L); LoopBlocks.clear(); NewBlocks.clear(); @@ -901,8 +1017,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // Splice the newly inserted blocks into the function right before the // original preheader. - F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), - NewBlocks[0], F->end()); + F->getBasicBlockList().splice(NewPreheader->getIterator(), + F->getBasicBlockList(), + NewBlocks[0]->getIterator(), F->end()); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -944,7 +1061,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { PHINode *PN = PHINode::Create(LPad->getType(), 0, "", - ExitSucc->getFirstInsertionPt()); + &*ExitSucc->getFirstInsertionPt()); for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); I != E; ++I) { @@ -960,7 +1077,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + RemapInstruction(&*I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); @@ -994,8 +1112,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); } -/// RemoveFromWorklist - Remove all instances of I from the worklist vector -/// specified. +/// Remove all instances of I from the worklist vector specified. static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { @@ -1003,7 +1120,7 @@ static void RemoveFromWorklist(Instruction *I, Worklist.end()); } -/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the +/// When we find that I really equals V, remove I from the /// program, replacing all uses with V, and update the worklist. static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, @@ -1025,9 +1142,9 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has -// the value specified by Val in the specified loop, or we know it does NOT have -// that value. Rewrite any uses of LIC or of properties correlated to it.
+/// We know either that the value LIC has the value specified by Val in the +/// specified loop, or we know it does NOT have that value. +/// Rewrite any uses of LIC or of properties correlated to it. void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool IsEqual) { @@ -1138,18 +1255,16 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // domtree here -- instead we force it to do a full recomputation // after the pass is complete -- but we do need to inform it of // new blocks. - if (DT) - DT->addNewBlock(Abort, NewSISucc); + DT->addNewBlock(Abort, NewSISucc); } SimplifyCode(Worklist, L); } -/// SimplifyCode - Okay, now that we have simplified some instructions in the -/// loop, walk over it and constant prop, dce, and fold control flow where -/// possible. Note that this is effectively a very simple loop-structure-aware -/// optimizer. During processing of this loop, L could very well be deleted, so -/// it must not be used. +/// Now that we have simplified some instructions in the loop, walk over it and +/// constant prop, dce, and fold control flow where possible. Note that this is +/// effectively a very simple loop-structure-aware optimizer. During processing +/// of this loop, L could very well be deleted, so it must not be used. /// /// FIXME: When the loop optimizer is more mature, separate this out to a new /// pass. @@ -1207,8 +1322,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { Succ->replaceAllUsesWith(Pred); // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), - Succ->end()); + Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), + Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 3314e1ed41ab..41511bcb7b04 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -22,7 +22,7 @@ using namespace llvm; #define DEBUG_TYPE "loweratomic" static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { - IRBuilder<> Builder(CXI->getParent(), CXI); + IRBuilder<> Builder(CXI); Value *Ptr = CXI->getPointerOperand(); Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); @@ -41,7 +41,7 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { } static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { - IRBuilder<> Builder(RMWI->getParent(), RMWI); + IRBuilder<> Builder(RMWI); Value *Ptr = RMWI->getPointerOperand(); Value *Val = RMWI->getValOperand(); @@ -120,7 +120,7 @@ namespace { return false; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) Changed |= LowerFenceInst(FI); else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0c47cbd5bfda..2ace902a7a1b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -139,7 +139,7 @@ static bool lowerExpectIntrinsic(Function &F) { ExpectIntrinsicsHandled++; } - // remove llvm.expect 
intrinsics. + // Remove llvm.expect intrinsics. for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { CallInst *CI = dyn_cast<CallInst>(BI++); if (!CI) diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 85012afc80ac..0333bf2284e1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -30,7 +31,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" -#include <list> +#include <algorithm> using namespace llvm; #define DEBUG_TYPE "memcpyopt" @@ -71,9 +72,9 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, return Offset; } -/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a -/// constant offset, and return that constant offset. For example, Ptr1 might -/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. +/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and +/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 +/// might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, const DataLayout &DL) { Ptr1 = Ptr1->stripPointerCasts(); @@ -125,7 +126,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, } -/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// Represents a range of memset'd bytes with the ByteVal value. /// This allows us to analyze stores like: /// store 0 -> P+1 /// store 0 -> P+0 @@ -164,8 +165,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If any of the stores are a memset, then it is always good to extend the // memset. - for (unsigned i = 0, e = TheStores.size(); i != e; ++i) - if (!isa<StoreInst>(TheStores[i])) + for (Instruction *SI : TheStores) + if (!isa<StoreInst>(SI)) return true; // Assume that the code generator is capable of merging pairs of stores @@ -189,7 +190,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; + unsigned NumByteStores = Bytes % MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -200,15 +201,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { namespace { class MemsetRanges { - /// Ranges - A sorted list of the memset ranges. We use std::list here - /// because each element is relatively large and expensive to copy. - std::list<MemsetRange> Ranges; - typedef std::list<MemsetRange>::iterator range_iterator; + /// A sorted list of the memset ranges. 
+ SmallVector<MemsetRange, 8> Ranges; + typedef SmallVectorImpl<MemsetRange>::iterator range_iterator; const DataLayout &DL; public: MemsetRanges(const DataLayout &DL) : DL(DL) {} - typedef std::list<MemsetRange>::const_iterator const_iterator; + typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } @@ -240,26 +240,20 @@ public: } // end anon namespace -/// addRange - Add a new store to the MemsetRanges data structure. This adds a +/// Add a new store to the MemsetRanges data structure. This adds a /// new range for the specified store at the specified offset, merging into /// existing ranges as appropriate. -/// -/// Do a linear search of the ranges to see if this can be joined and/or to -/// find the insertion point in the list. We keep the ranges sorted for -/// simplicity here. This is a linear search of a linked list, which is ugly, -/// however the number of ranges is limited, so this won't get crazy slow. void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst) { int64_t End = Start+Size; - range_iterator I = Ranges.begin(), E = Ranges.end(); - while (I != E && Start > I->End) - ++I; + range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start, + [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; }); // We now know that I == E, in which case we didn't find anything to merge // with, or that Start <= I->End. If End < I->Start or I == E, then we need // to insert a new range. Handle this now. - if (I == E || End < I->Start) { + if (I == Ranges.end() || End < I->Start) { MemsetRange &R = *Ranges.insert(I, MemsetRange()); R.Start = Start; R.End = End; @@ -295,7 +289,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, if (End > I->End) { I->End = End; range_iterator NextI = I; - while (++NextI != E && End >= NextI->Start) { + while (++NextI != Ranges.end() && End >= NextI->Start) { // Merge the range in. I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); if (NextI->End > I->End) @@ -331,9 +325,9 @@ namespace { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -357,7 +351,7 @@ namespace { char MemCpyOpt::ID = 0; } -// createMemCpyOptPass - The public interface to this file... +/// The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", @@ -366,14 +360,15 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -/// tryMergingIntoMemset - When scanning forward over instructions, we look for -/// some other patterns to fold away. 
In particular, this looks for stores to -/// neighboring locations of memory. If it sees enough consecutive ones, it -/// attempts to merge them together into a memcpy/memset. +/// When scanning forward over instructions, we look for some other patterns to +/// fold away. In particular, this looks for stores to neighboring locations of +/// memory. If it sees enough consecutive ones, it attempts to merge them +/// together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { const DataLayout &DL = StartInst->getModule()->getDataLayout(); @@ -384,7 +379,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // are stored. MemsetRanges Ranges(DL); - BasicBlock::iterator BI = StartInst; + BasicBlock::iterator BI(StartInst); for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { // If the instruction is readnone, ignore it, otherwise bail out. We @@ -439,14 +434,12 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // If we create any memsets, we put it right before the first instruction that // isn't part of the memset block. This ensures that the memset is dominated // by any addressing instruction needed by the start of the block. - IRBuilder<> Builder(BI); + IRBuilder<> Builder(&*BI); // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. Instruction *AMemSet = nullptr; - for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); - I != E; ++I) { - const MemsetRange &Range = *I; + for (const MemsetRange &Range : Ranges) { if (Range.TheStores.size() == 1) continue; @@ -470,19 +463,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); DEBUG(dbgs() << "Replace stores:\n"; - for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) - dbgs() << *Range.TheStores[i] << '\n'; + for (Instruction *SI : Range.TheStores) + dbgs() << *SI << '\n'; dbgs() << "With: " << *AMemSet << '\n'); if (!Range.TheStores.empty()) AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); // Zap all the stores. - for (SmallVectorImpl<Instruction *>::const_iterator - SI = Range.TheStores.begin(), - SE = Range.TheStores.end(); SI != SE; ++SI) { - MD->removeInstruction(*SI); - (*SI)->eraseFromParent(); + for (Instruction *SI : Range.TheStores) { + MD->removeInstruction(SI); + SI->eraseFromParent(); } ++NumMemSetInfer; } @@ -493,6 +484,16 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; + + // Avoid merging nontemporal stores since the resulting + // memcpy/memset would not be able to preserve the nontemporal hint. + // In theory we could teach this pass how to propagate the !nontemporal + // metadata to memset calls. However, that change would force the backend to + // conservatively expand !nontemporal memset calls back to sequences of + // store instructions (effectively undoing the merging).
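(The sorted-vector merge in MemsetRanges::addRange above can be hard to read inside the pass. Here is a self-contained sketch of the same idea with hypothetical names: std::lower_bound finds the first mergeable range, then the hit range is grown and later ranges are coalesced.)

#include <algorithm>
#include <vector>

struct Range { long Start, End; };

static void addInterval(std::vector<Range> &Rs, long Start, long End) {
  // First range whose End >= Start; everything before it ends too early
  // to overlap or abut the new range.
  auto I = std::lower_bound(Rs.begin(), Rs.end(), Start,
                            [](const Range &L, long S) { return L.End < S; });
  if (I == Rs.end() || End < I->Start) {
    // Disjoint: insert a fresh range, keeping the vector sorted.
    Rs.insert(I, Range{Start, End});
    return;
  }
  // Overlapping (or abutting): grow the range we hit...
  if (Start < I->Start) I->Start = Start;
  if (End > I->End) I->End = End;
  // ...and swallow any later ranges the grown interval now reaches.
  auto N = I + 1;
  while (N != Rs.end() && I->End >= N->Start) {
    if (N->End > I->End) I->End = N->End;
    N = Rs.erase(N);
  }
}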
+ if (SI->getMetadata(LLVMContext::MD_nontemporal)) + return false; + const DataLayout &DL = SI->getModule()->getDataLayout(); // Detect cases where we're performing call slot forwarding, but @@ -509,11 +510,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation StoreLoc = MemoryLocation::get(SI); - for (BasicBlock::iterator I = --BasicBlock::iterator(SI), - E = C; I != E; --I) { - if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { + for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); + I != E; --I) { + if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { C = nullptr; break; } @@ -554,7 +555,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } @@ -567,14 +568,14 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } return false; } -/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// Takes a memcpy and a call that it depends on, /// and checks for the possibility of a call slot optimization by having /// the call write its result directly into the destination of the memcpy. bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, @@ -710,12 +711,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // unexpected manner, for example via a global, which we deduce from // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) return false; // All the checks have passed, so do the transformation. @@ -749,11 +750,9 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Update AA metadata // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - }; + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_invariant_group}; combineMetadata(C, cpy, KnownIDs); // Remove the memcpy. @@ -763,10 +762,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return true; } -/// processMemCpyMemCpyDependence - We've found that the (upward scanning) -/// memory dependence of memcpy 'M' is the memcpy 'MDep'. 
Try to simplify M to -/// copy from MDep's input if we can. -/// +/// We've found that the (upward scanning) memory dependence of memcpy 'M' is +/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can. bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // We can only transforms memcpy's where the dest of one is the source of the // other. @@ -788,7 +785,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: @@ -802,8 +799,9 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = MD->getPointerDependencyFrom( - MemoryLocation::getForSource(MDep), false, M, M->getParent()); + MemDepResult SourceDep = + MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, + M->getIterator(), M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -860,8 +858,9 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy, return false; // Check that there are no other dependencies on the memset destination. - MemDepResult DstDepInfo = MD->getPointerDependencyFrom( - MemoryLocation::getForDest(MemSet), false, MemCpy, MemCpy->getParent()); + MemDepResult DstDepInfo = + MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false, + MemCpy->getIterator(), MemCpy->getParent()); if (DstDepInfo.getInst() != MemSet) return false; @@ -936,7 +935,7 @@ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, return true; } -/// processMemCpy - perform simplification of memcpy's. If we have memcpy A +/// Perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy @@ -998,8 +997,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { } MemoryLocation SrcLoc = MemoryLocation::getForSource(M); - MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, - M, M->getParent()); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( + SrcLoc, true, M->getIterator(), M->getParent()); if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) @@ -1037,10 +1036,10 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return false; } -/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst -/// are guaranteed not to alias. +/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed +/// not to alias. bool MemCpyOpt::processMemMove(MemMoveInst *M) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); if (!TLI->has(LibFunc::memmove)) return false; @@ -1053,12 +1052,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); // If not, then we know we can transform this. 
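(Aside: the C-level intuition for this memmove -> memcpy strengthening, as a sketch with a hypothetical wrapper; the pass itself rewrites the IR intrinsic, not libc calls. Once alias analysis proves the operands cannot overlap, memmove's overlap tolerance buys nothing and the cheaper memcpy is legal.)

#include <cstring>

// Precondition, mirroring the NoAlias answer the pass requires:
// [Dst, Dst+N) and [Src, Src+N) do not overlap.
static void copy_disjoint(char *Dst, const char *Src, std::size_t N) {
  std::memcpy(Dst, Src, N); // memmove(Dst, Src, N) would behave identically here
}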
- Module *Mod = M->getParent()->getParent()->getParent(); Type *ArgTys[3] = { M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType() }; - M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, - ArgTys)); + M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(), + Intrinsic::memcpy, ArgTys)); // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. @@ -1068,7 +1066,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { return true; } -/// processByValArgument - This is called on every byval argument in call sites. +/// This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); // Find out what feeds this byval argument. @@ -1076,8 +1074,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = DL.getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom( - MemoryLocation(ByValArg, ByValSize), true, CS.getInstruction(), - CS.getInstruction()->getParent()); + MemoryLocation(ByValArg, ByValSize), true, + CS.getInstruction()->getIterator(), CS.getInstruction()->getParent()); if (!DepInfo.isClobber()) return false; @@ -1119,9 +1117,9 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = - MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, - CS.getInstruction(), MDep->getParent()); + MemDepResult SourceDep = MD->getPointerDependencyFrom( + MemoryLocation::getForSource(MDep), false, + CS.getInstruction()->getIterator(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -1140,7 +1138,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { return true; } -/// iterateOnFunction - Executes one iteration of MemCpyOpt. +/// Executes one iteration of MemCpyOpt. bool MemCpyOpt::iterateOnFunction(Function &F) { bool MadeChange = false; @@ -1148,7 +1146,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. - Instruction *I = BI++; + Instruction *I = &*BI++; bool RepeatInstruction = false; @@ -1177,9 +1175,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { return MadeChange; } -// MemCpyOpt::runOnFunction - This is the main transformation entry point for a -// function. -// +/// This is the main transformation entry point for a function. 
bool MemCpyOpt::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 643f3740eedd..c812d618c16a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -78,6 +78,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -91,6 +92,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> + using namespace llvm; #define DEBUG_TYPE "mldst-motion" @@ -106,7 +108,7 @@ class MergedLoadStoreMotion : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - explicit MergedLoadStoreMotion(void) + MergedLoadStoreMotion() : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); } @@ -116,10 +118,11 @@ public: private: // This transformation requires dominator and postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); } // Helper routines @@ -156,7 +159,7 @@ private: }; char MergedLoadStoreMotion::ID = 0; -} +} // anonymous namespace /// /// \brief createMergedLoadStoreMotionPass - The public interface to this file. /// FunctionPass *llvm::createMergedLoadStoreMotionPass() { return new MergedLoadStoreMotion(); } INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -236,12 +240,11 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { /// being loaded or protect against the load from happening /// it is considered a hoist barrier. /// - bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI) { MemoryLocation Loc = MemoryLocation::get(LI); - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod); } /// @@ -256,7 +259,7 @@ LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; ++BBI) { - Instruction *Inst = BBI; + Instruction *Inst = &*BBI; // Only merge and hoist loads when their result is used only in BB if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1)) continue; @@ -293,7 +296,7 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, // Intersect optional metadata.
HoistCand->intersectOptionalDataWith(ElseInst); - HoistCand->dropUnknownMetadata(); + HoistCand->dropUnknownNonDebugMetadata(); // Prepend point for instruction insert Instruction *HoistPt = BB->getTerminator(); @@ -363,8 +366,7 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { int NLoads = 0; for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); BBI != BBE;) { - - Instruction *I = BBI; + Instruction *I = &*BBI; ++BBI; // Only move non-simple (atomic, volatile) loads. @@ -394,11 +396,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { /// value being stored or protect against the store from /// happening it is considered a sink barrier. /// - bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, const Instruction &End, MemoryLocation Loc) { - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef); } /// @@ -438,23 +439,16 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Create a phi if the values mismatch. - PHINode *NewPN = 0; + PHINode *NewPN = nullptr; Value *Opd1 = S0->getValueOperand(); Value *Opd2 = S1->getValueOperand(); if (Opd1 != Opd2) { NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", - BB->begin()); + &BB->front()); NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); - if (NewPN->getType()->getScalarType()->isPointerTy()) { - // AA needs to be informed when a PHI-use of the pointer value is added - for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { - unsigned J = PHINode::getOperandNumForIncomingValue(I); - AA->addEscapingUse(NewPN->getOperandUse(J)); - } - if (MD) - MD->invalidateCachedPointerInfo(NewPN); - } + if (MD && NewPN->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(NewPN); } return NewPN; } @@ -479,12 +473,12 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); // Intersect optional metadata. S0->intersectOptionalDataWith(S1); - S0->dropUnknownMetadata(); + S0->dropUnknownNonDebugMetadata(); // Create the new store to be inserted at the join point. StoreInst *SNew = (StoreInst *)(S0->clone()); Instruction *ANew = A0->clone(); - SNew->insertBefore(InsertPt); + SNew->insertBefore(&*InsertPt); ANew->insertBefore(SNew); assert(S0->getParent() == A0->getParent()); @@ -566,12 +560,13 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { } return MergedStores; } + /// /// \brief Run the transformation for each function /// bool MergedLoadStoreMotion::runOnFunction(Function &F) { MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool Changed = false; DEBUG(dbgs() << "Instruction Merger\n"); @@ -579,7 +574,7 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. 
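(A source-level picture of the diamond sinking that mergeStores/getPHIOperand implement above; illustrative only, with the ".sink" phi shown as a select.)

static void sink_example(bool C, int *P, int A, int B) {
  // Before sinking: two stores to the same address, one per diamond arm.
  //   if (C) *P = A; else *P = B;
  // After sinking: the two stored values merge in a phi at the join block
  // and a single store remains.
  int V = C ? A : B; // the phi of the two stored values
  *P = V;            // the one sunk store
}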
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index f42f8306fccc..c8f885e7eec5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -71,8 +71,8 @@ // // Limitations and TODO items: // -// 1) We only considers n-ary adds for now. This should be extended and -// generalized. +// 1) We only consider n-ary adds and muls for now. This should be extended +// and generalized. // //===----------------------------------------------------------------------===// @@ -110,11 +110,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.setPreservesCFG(); @@ -145,12 +145,23 @@ private: unsigned I, Value *LHS, Value *RHS, Type *IndexedType); - // Reassociate Add for better CSE. - Instruction *tryReassociateAdd(BinaryOperator *I); - // A helper function for tryReassociateAdd. LHS and RHS are explicitly passed. - Instruction *tryReassociateAdd(Value *LHS, Value *RHS, Instruction *I); - // Rewrites I to LHS + RHS if LHS is computed already. - Instruction *tryReassociatedAdd(const SCEV *LHS, Value *RHS, Instruction *I); + // Reassociate binary operators for better CSE. + Instruction *tryReassociateBinaryOp(BinaryOperator *I); + + // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly + // passed. + Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I); + // Rewrites I to (LHS op RHS) if LHS is computed already. + Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS, + BinaryOperator *I); + + // Tries to match Op1 and Op2 by using V. + bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2); + + // Gets SCEV for (LHS op RHS). + const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS); // Returns the closest dominator of \c Dominatee that computes // \c CandidateExpr. Returns null if not found. @@ -161,11 +172,6 @@ private: // GEP's pointer size, i.e., whether Index needs to be sign-extended in order // to be an index of GEP. bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP); - // Returns whether V is known to be non-negative at context \c Ctxt. - bool isKnownNonNegative(Value *V, Instruction *Ctxt); - // Returns whether AO may sign overflow at context \c Ctxt. It computes a - // conservative result -- it answers true when not sure.
- bool maySignOverflow(AddOperator *AO, Instruction *Ctxt); AssumptionCache *AC; const DataLayout *DL; @@ -182,7 +188,7 @@ private: // foo(a + b); // if (p2) // bar(a + b); - DenseMap<const SCEV *, SmallVector<Instruction *, 2>> SeenExprs; + DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs; }; } // anonymous namespace @@ -191,7 +197,7 @@ INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation", @@ -207,7 +213,7 @@ bool NaryReassociate::runOnFunction(Function &F) { AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -224,6 +230,7 @@ static bool isPotentiallyNaryReassociable(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::GetElementPtr: + case Instruction::Mul: return true; default: return false; @@ -239,19 +246,21 @@ bool NaryReassociate::doOneIteration(Function &F) { Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { BasicBlock *BB = Node->getBlock(); for (auto I = BB->begin(); I != BB->end(); ++I) { - if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { - const SCEV *OldSCEV = SE->getSCEV(I); - if (Instruction *NewI = tryReassociate(I)) { + if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) { + const SCEV *OldSCEV = SE->getSCEV(&*I); + if (Instruction *NewI = tryReassociate(&*I)) { Changed = true; - SE->forgetValue(I); + SE->forgetValue(&*I); I->replaceAllUsesWith(NewI); - RecursivelyDeleteTriviallyDeadInstructions(I, TLI); - I = NewI; + // If SeenExprs constains I's WeakVH, that entry will be replaced with + // nullptr. + RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); + I = NewI->getIterator(); } // Add the rewritten instruction to SeenExprs; the original instruction // is deleted. - const SCEV *NewSCEV = SE->getSCEV(I); - SeenExprs[NewSCEV].push_back(I); + const SCEV *NewSCEV = SE->getSCEV(&*I); + SeenExprs[NewSCEV].push_back(WeakVH(&*I)); // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I) // is equivalent to I. However, ScalarEvolution::getSCEV may // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose @@ -271,7 +280,7 @@ bool NaryReassociate::doOneIteration(Function &F) { // // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll. 
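SeenExprs switching from Instruction * to WeakVH is the load-bearing change in this hunk: a weak handle nulls itself out when its value is deleted or replaced, so a stale map entry can no longer dangle. A minimal sketch of the behavior being relied on (useAfterCleanup is an illustrative name; I may be erased by the cleanup call):

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/ValueHandle.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    static Value *useAfterCleanup(Instruction *I, const TargetLibraryInfo *TLI) {
      WeakVH Handle(I);                                   // tracks I
      RecursivelyDeleteTriviallyDeadInstructions(I, TLI); // may delete I
      // Null if I was deleted, instead of a dangling pointer.
      return Handle;
    }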
if (NewSCEV != OldSCEV) - SeenExprs[OldSCEV].push_back(I); + SeenExprs[OldSCEV].push_back(WeakVH(&*I)); } } } @@ -281,7 +290,8 @@ bool NaryReassociate::doOneIteration(Function &F) { Instruction *NaryReassociate::tryReassociate(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: - return tryReassociateAdd(cast<BinaryOperator>(I)); + case Instruction::Mul: + return tryReassociateBinaryOp(cast<BinaryOperator>(I)); case Instruction::GetElementPtr: return tryReassociateGEP(cast<GetElementPtrInst>(I)); default: @@ -352,27 +362,6 @@ bool NaryReassociate::requiresSignExtension(Value *Index, return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits; } -bool NaryReassociate::isKnownNonNegative(Value *V, Instruction *Ctxt) { - bool NonNegative, Negative; - // TODO: ComputeSignBits is expensive. Consider caching the results. - ComputeSignBit(V, NonNegative, Negative, *DL, 0, AC, Ctxt, DT); - return NonNegative; -} - -bool NaryReassociate::maySignOverflow(AddOperator *AO, Instruction *Ctxt) { - if (AO->hasNoSignedWrap()) - return false; - - Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); - // If LHS or RHS has the same sign as the sum, AO doesn't sign overflow. - // TODO: handle the negative case as well. - if (isKnownNonNegative(AO, Ctxt) && - (isKnownNonNegative(LHS, Ctxt) || isKnownNonNegative(RHS, Ctxt))) - return false; - - return true; -} - GetElementPtrInst * NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, Type *IndexedType) { @@ -381,7 +370,7 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, IndexToSplit = SExt->getOperand(0); } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) { // zext can be treated as sext if the source is non-negative. - if (isKnownNonNegative(ZExt->getOperand(0), GEP)) + if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT)) IndexToSplit = ZExt->getOperand(0); } @@ -389,8 +378,11 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, // If the I-th index needs sext and the underlying add is not equipped with // nsw, we cannot split the add because // sext(LHS + RHS) != sext(LHS) + sext(RHS). - if (requiresSignExtension(IndexToSplit, GEP) && maySignOverflow(AO, GEP)) + if (requiresSignExtension(IndexToSplit, GEP) && + computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) != + OverflowResult::NeverOverflows) return nullptr; + Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); // IndexToSplit = LHS + RHS. if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType)) @@ -415,7 +407,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( IndexExprs.push_back(SE->getSCEV(*Index)); // Replace the I-th index with LHS. IndexExprs[I] = SE->getSCEV(LHS); - if (isKnownNonNegative(LHS, GEP) && + if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) && DL->getTypeSizeInBits(LHS->getType()) < DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) { // Zero-extend LHS if it is non-negative. 
InstCombine canonicalizes sext to @@ -429,19 +421,20 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()), IndexExprs, GEP->isInBounds()); - auto *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); + Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); if (Candidate == nullptr) return nullptr; - PointerType *TypeOfCandidate = dyn_cast<PointerType>(Candidate->getType()); - // Pretty rare but theoretically possible when a numeric value happens to - // share CandidateExpr. - if (TypeOfCandidate == nullptr) - return nullptr; + IRBuilder<> Builder(GEP); + // Candidate does not necessarily have the same pointer type as GEP. Use + // bitcast or pointer cast to make sure they have the same type, so that the + // later RAUW doesn't complain. + Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType()); + assert(Candidate->getType() == GEP->getType()); // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType) uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType); - Type *ElementType = TypeOfCandidate->getElementType(); + Type *ElementType = GEP->getType()->getElementType(); uint64_t ElementSize = DL->getTypeAllocSize(ElementType); // Another less rare case: because I is not necessarily the last index of the // GEP, the size of the type at the I-th index (IndexedSize) is not @@ -461,8 +454,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return nullptr; // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))); - IRBuilder<> Builder(GEP); - Type *IntPtrTy = DL->getIntPtrType(TypeOfCandidate); + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); if (RHS->getType() != IntPtrTy) RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy); if (IndexedSize != ElementSize) { @@ -476,54 +468,89 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return NewGEP; } -Instruction *NaryReassociate::tryReassociateAdd(BinaryOperator *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) { Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); - if (auto *NewI = tryReassociateAdd(LHS, RHS, I)) + if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I)) return NewI; - if (auto *NewI = tryReassociateAdd(RHS, LHS, I)) + if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I)) return NewI; return nullptr; } -Instruction *NaryReassociate::tryReassociateAdd(Value *LHS, Value *RHS, - Instruction *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I) { Value *A = nullptr, *B = nullptr; - // To be conservative, we reassociate I only when it is the only user of A+B. - if (LHS->hasOneUse() && match(LHS, m_Add(m_Value(A), m_Value(B)))) { - // I = (A + B) + RHS - // = (A + RHS) + B or (B + RHS) + A + // To be conservative, we reassociate I only when it is the only user of (A op + // B). 
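The IRBuilder cast in the hunk above is worth calling out: a dominating computation of CandidateExpr does not necessarily have GEP's pointer type, and replaceAllUsesWith requires exact type equality. A minimal sketch of that reconciliation step, with castToTypeOf as an illustrative name:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    static Value *castToTypeOf(Instruction *GEP, Value *Candidate) {
      IRBuilder<> Builder(GEP);
      // Emits a bitcast or a pointer/integer cast pair as needed,
      // or nothing when the types already match.
      return Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
    }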
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) { + // I = (A op B) op RHS + // = (A op RHS) op B or (B op RHS) op A const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B); const SCEV *RHSExpr = SE->getSCEV(RHS); if (BExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(AExpr, RHSExpr), B, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I)) return NewI; } if (AExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(BExpr, RHSExpr), A, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I)) return NewI; } } return nullptr; } -Instruction *NaryReassociate::tryReassociatedAdd(const SCEV *LHSExpr, - Value *RHS, Instruction *I) { - auto Pos = SeenExprs.find(LHSExpr); - // Bail out if LHSExpr is not previously seen. - if (Pos == SeenExprs.end()) - return nullptr; - +Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr, + Value *RHS, + BinaryOperator *I) { // Look for the closest dominator LHS of I that computes LHSExpr, and replace - // I with LHS + RHS. + // I with LHS op RHS. auto *LHS = findClosestMatchingDominator(LHSExpr, I); if (LHS == nullptr) return nullptr; - Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + Instruction *NewI = nullptr; + switch (I->getOpcode()) { + case Instruction::Add: + NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + break; + case Instruction::Mul: + NewI = BinaryOperator::CreateMul(LHS, RHS, "", I); + break; + default: + llvm_unreachable("Unexpected instruction."); + } NewI->takeName(I); return NewI; } +bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, + Value *&Op2) { + switch (I->getOpcode()) { + case Instruction::Add: + return match(V, m_Add(m_Value(Op1), m_Value(Op2))); + case Instruction::Mul: + return match(V, m_Mul(m_Value(Op1), m_Value(Op2))); + default: + llvm_unreachable("Unexpected instruction."); + } + return false; +} + +const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS) { + switch (I->getOpcode()) { + case Instruction::Add: + return SE->getAddExpr(LHS, RHS); + case Instruction::Mul: + return SE->getMulExpr(LHS, RHS); + default: + llvm_unreachable("Unexpected instruction."); + } + return nullptr; +} + Instruction * NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, Instruction *Dominatee) { @@ -537,9 +564,13 @@ NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, // future instruction either. Therefore, we pop it out of the stack. This // optimization makes the algorithm O(n). while (!Candidates.empty()) { - Instruction *Candidate = Candidates.back(); - if (DT->dominates(Candidate, Dominatee)) - return Candidate; + // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed + // during rewriting. 
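matchTernaryOp above is a thin dispatch over the PatternMatch DSL; the same idea as a self-contained sketch (splitBinOp is an illustrative name):

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    static bool splitBinOp(Value *V, unsigned Opcode, Value *&Op1, Value *&Op2) {
      // m_Value(X) binds the matched operand to X when the match succeeds.
      if (Opcode == Instruction::Add)
        return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
      if (Opcode == Instruction::Mul)
        return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
      return false;
    }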
+    if (Value *Candidate = Candidates.back()) {
+      Instruction *CandidateInstruction = cast<Instruction>(Candidate);
+      if (DT->dominates(CandidateInstruction, Dominatee))
+        return CandidateInstruction;
+    }
     Candidates.pop_back();
   }
   return nullptr;
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 31d7df39c781..9f26f78892c6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -154,7 +154,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
   Phi->addIncoming(Call, &CurrBB);
   Phi->addIncoming(LibCall, LibCallBB);
 
-  BB = JoinBB;
+  BB = JoinBB->getIterator();
   return true;
 }
 
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 366301ad731a..28c610c2486a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -27,7 +27,7 @@
 // well defined state for inspection by the collector. In the current
 // implementation, this is done via the insertion of poll sites at method entry
 // and the backedge of most loops. We try to avoid inserting more polls than
-// are neccessary to ensure a finite period between poll sites. This is not
+// are necessary to ensure a finite period between poll sites. This is not
 // because the poll itself is expensive in the generated code; it's not. Polls
 // do tend to impact the optimizer itself in negative ways; we'd like to avoid
 // perturbing the optimization of the method as much as we can.
@@ -91,13 +91,15 @@ STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");
 
 using namespace llvm;
 
-// Ignore oppurtunities to avoid placing safepoints on backedges, useful for
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
 // validation
 static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
                                   cl::init(false));
 
-/// If true, do not place backedge safepoints in counted loops.
-static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true));
+/// How narrow does the trip count of a loop have to be to be considered
+/// "counted"? Counted loops do not get safepoints at backedges.
+static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
+                                         cl::Hidden, cl::init(32));
 
 // If true, split the backedge of a loop when placing the safepoint, otherwise
 // split the latch block itself. Both are useful to support for
@@ -121,7 +123,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
   std::vector<TerminatorInst *> PollLocations;
 
   /// True unless we're running spp-no-calls in which case we need to disable
-  /// the call dependend placement opts.
+  /// the call-dependent placement opts.
bool CallSafepointsEnabled; ScalarEvolution *SE = nullptr; @@ -142,7 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); for (auto I = LI->begin(), E = LI->end(); I != E; I++) { @@ -153,7 +155,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); // We no longer modify the IR at all in this pass. Thus all // analysis are preserved. @@ -190,10 +192,8 @@ static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/); -static bool isGCLeafFunction(const CallSite &CS); - static bool needsStatepoint(const CallSite &CS) { - if (isGCLeafFunction(CS)) + if (callsGCLeafFunction(CS)) return false; if (CS.isCall()) { CallInst *call = cast<CallInst>(CS.getInstruction()); @@ -206,7 +206,7 @@ static bool needsStatepoint(const CallSite &CS) { return true; } -static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P); +static Value *ReplaceWithStatepoint(const CallSite &CS); /// Returns true if this loop is known to contain a call safepoint which /// must unconditionally execute on any iteration of the loop which returns @@ -220,7 +220,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, // For the moment, we look only for the 'cuts' that consist of a single call // instruction in a block which is dominated by the Header and dominates the // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain - // of such dominating blocks gets substaintially more occurences than just + // of such dominating blocks gets substantially more occurrences than just // checking the Pred and Header blocks themselves. This may be due to the // density of loop exit conditions caused by range and null checks. // TODO: structure this as an analysis pass, cache the result for subloops, @@ -255,18 +255,12 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, /// conservatism in the analysis. static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, BasicBlock *Pred) { - // Only used when SkipCounted is off - const unsigned upperTripBound = 8192; - // A conservative bound on the loop as a whole. const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); - if (MaxTrips != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32)) - return true; - } + if (MaxTrips != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( + CountedLoopTripWidth)) + return true; // If this is a conditional branch to the header with the alternate path // being outside the loop, we can ask questions about the execution frequency @@ -275,13 +269,10 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, // This returns an exact expression only. TODO: We really only need an // upper bound here, but SE doesn't expose that. 
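Both of the rewritten checks in mustBeFiniteCountedLoop now funnel into one SCEV query. Restated as a helper, purely as a sketch (fitsTripWidth is an illustrative name, not code from the patch):

    #include "llvm/Analysis/ScalarEvolution.h"
    using namespace llvm;

    static bool fitsTripWidth(const SCEV *Count, ScalarEvolution *SE,
                              unsigned Width) {
      // A loop counts as "counted" when the unsigned max of its trip-count
      // range fits in Width bits; Width defaults to 32 via
      // -spp-counted-loop-trip-width.
      return Count != SE->getCouldNotCompute() &&
             SE->getUnsignedRange(Count).getUnsignedMax().isIntN(Width);
    }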
const SCEV *MaxExec = SE->getExitCount(L, Pred); - if (MaxExec != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32)) + if (MaxExec != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN( + CountedLoopTripWidth)) return true; - } } return /* not finite */ false; @@ -432,14 +423,14 @@ static Instruction *findLocationForEntrySafepoint(Function &F, assert(hasNextInstruction(I) && "first check if there is a next instruction!"); if (I->isTerminator()) { - return I->getParent()->getUniqueSuccessor()->begin(); + return &I->getParent()->getUniqueSuccessor()->front(); } else { - return std::next(BasicBlock::iterator(I)); + return &*++I->getIterator(); } }; Instruction *cursor = nullptr; - for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor); + for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor); cursor = nextInstruction(cursor)) { // We need to ensure a safepoint poll occurs before any 'real' call. The @@ -466,7 +457,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F, static void findCallSafepoints(Function &F, std::vector<CallSite> &Found /*rval*/) { assert(Found.empty() && "must be empty!"); - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { Instruction *inst = &I; if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) { CallSite CS(inst); @@ -713,7 +704,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { Invoke->getParent()); } - Value *GCResult = ReplaceWithStatepoint(CS, nullptr); + Value *GCResult = ReplaceWithStatepoint(CS); Results.push_back(GCResult); } assert(Results.size() == ParsePointNeeded.size()); @@ -747,7 +738,7 @@ FunctionPass *llvm::createPlaceSafepointsPass() { INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl, "place-backedge-safepoints-impl", "Place Backedge Safepoints", false, false) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl, @@ -759,31 +750,6 @@ INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints", INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", false, false) -static bool isGCLeafFunction(const CallSite &CS) { - Instruction *inst = CS.getInstruction(); - if (isa<IntrinsicInst>(inst)) { - // Most LLVM intrinsics are things which can never take a safepoint. - // As a result, we don't need to have the stack parsable at the - // callsite. This is a highly useful optimization since intrinsic - // calls are fairly prevelent, particularly in debug builds. - return true; - } - - // If this function is marked explicitly as a leaf call, we don't need to - // place a safepoint of it. In fact, for correctness we *can't* in many - // cases. Note: Indirect calls return Null for the called function, - // these obviously aren't runtime functions with attributes - // TODO: Support attributes on the call site as well. 
- const Function *F = CS.getCalledFunction(); - bool isLeaf = - F && - F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true"); - if (isLeaf) { - return true; - } - return false; -} - static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/) { @@ -796,6 +762,7 @@ InsertSafepointPoll(Instruction *InsertBefore, // path call - where we need to insert a safepoint (parsepoint). auto *F = M->getFunction(GCSafepointPollName); + assert(F && "gc.safepoint_poll function is missing"); assert(F->getType()->getElementType() == FunctionType::get(Type::getVoidTy(M->getContext()), false) && "gc.safepoint_poll declared with wrong type"); @@ -864,10 +831,8 @@ InsertSafepointPoll(Instruction *InsertBefore, /// Replaces the given call site (Call or Invoke) with a gc.statepoint /// intrinsic with an empty deoptimization arguments list. This does /// NOT do explicit relocation for GC support. -static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ - Pass *P) { - assert(CS.getInstruction()->getParent()->getParent()->getParent() && - "must be set"); +static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) { + assert(CS.getInstruction()->getModule() && "must be set"); // TODO: technically, a pass is not allowed to get functions from within a // function pass since it might trigger a new function addition. Refactor @@ -917,15 +882,10 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ CS.getInstruction()->getContext(), AttributeSet::FunctionIndex, AttrsToRemove); - Value *StatepointTarget = NumPatchBytes == 0 - ? CS.getCalledValue() - : ConstantPointerNull::get(cast<PointerType>( - CS.getCalledValue()->getType())); - if (CS.isCall()) { CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); CallInst *Call = Builder.CreateGCStatepointCall( - ID, NumPatchBytes, StatepointTarget, + ID, NumPatchBytes, CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); Call->setTailCall(ToReplace->isTailCall()); @@ -938,7 +898,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ Token = Call; - // Put the following gc_result and gc_relocate calls immediately after the + // Put the following gc_result and gc_relocate calls immediately after // the old call (which we're about to delete). assert(ToReplace->getNextNode() && "not a terminator, must have next"); Builder.SetInsertPoint(ToReplace->getNextNode()); @@ -951,7 +911,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // original block. Builder.SetInsertPoint(ToReplace->getParent()); InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( - ID, NumPatchBytes, StatepointTarget, ToReplace->getNormalDest(), + ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(), ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); @@ -967,7 +927,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // We'll insert the gc.result into the normal block BasicBlock *NormalDest = ToReplace->getNormalDest(); // Can not insert gc.result in case of phi nodes preset. 
- // Should have removed this cases prior to runnning this function + // Should have removed this cases prior to running this function assert(!isa<PHINode>(NormalDest->begin())); Instruction *IP = &*(NormalDest->getFirstInsertionPt()); Builder.SetInsertPoint(IP); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index d1acf785d07e..fb970c747ce1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -26,6 +26,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -62,7 +64,7 @@ namespace { /// Print out the expression identified in the Ops list. /// static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { - Module *M = I->getParent()->getParent()->getParent(); + Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { @@ -82,20 +84,6 @@ namespace { Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} - /// \brief Sort factors by their Base. - struct BaseSorter { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base < RHS.Base; - } - }; - - /// \brief Compare factors for equal bases. - struct BaseEqual { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base == RHS.Base; - } - }; - /// \brief Sort factors in descending order by their power. struct PowerDescendingSorter { bool operator()(const Factor &LHS, const Factor &RHS) { @@ -172,6 +160,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: void BuildRankMap(Function &F); @@ -255,27 +244,6 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, return nullptr; } -static bool isUnmovableInstruction(Instruction *I) { - switch (I->getOpcode()) { - case Instruction::PHI: - case Instruction::LandingPad: - case Instruction::Alloca: - case Instruction::Load: - case Instruction::Invoke: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - return true; - case Instruction::Call: - return !isa<DbgInfoIntrinsic>(I); - default: - return false; - } -} - void Reassociate::BuildRankMap(Function &F) { unsigned i = 2; @@ -295,7 +263,7 @@ void Reassociate::BuildRankMap(Function &F) { // we cannot move. This ensures that the ranks for these instructions are // all different in the block. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (isUnmovableInstruction(I)) + if (mayBeMemoryDependent(*I)) ValueRankMap[&*I] = ++BBRank; } } @@ -913,7 +881,11 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, /// that computes the negative version of the value specified. The negative /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. -static Value *NegateValue(Value *V, Instruction *BI) { +/// Also add intermediate instructions to the redo list that are modified while +/// pushing the negates through adds. These will be revisited to see if +/// additional opportunities have been exposed. 
+static Value *NegateValue(Value *V, Instruction *BI, + SetVector<AssertingVH<Instruction>> &ToRedo) { if (Constant *C = dyn_cast<Constant>(V)) { if (C->getType()->isFPOrFPVectorTy()) { return ConstantExpr::getFNeg(C); @@ -934,8 +906,8 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (BinaryOperator *I = isReassociableOp(V, Instruction::Add, Instruction::FAdd)) { // Push the negates through the add. - I->setOperand(0, NegateValue(I->getOperand(0), BI)); - I->setOperand(1, NegateValue(I->getOperand(1), BI)); + I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo)); + I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo)); if (I->getOpcode() == Instruction::Add) { I->setHasNoUnsignedWrap(false); I->setHasNoSignedWrap(false); @@ -948,6 +920,10 @@ static Value *NegateValue(Value *V, Instruction *BI) { // I->moveBefore(BI); I->setName(I->getName()+".neg"); + + // Add the intermediate negates to the redo list as processing them later + // could expose more reassociating opportunities. + ToRedo.insert(I); return I; } @@ -972,26 +948,28 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { InsertPt = II->getNormalDest()->begin(); } else { - InsertPt = InstInput; - ++InsertPt; + InsertPt = ++InstInput->getIterator(); } while (isa<PHINode>(InsertPt)) ++InsertPt; } else { InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin(); } - TheNeg->moveBefore(InsertPt); + TheNeg->moveBefore(&*InsertPt); if (TheNeg->getOpcode() == Instruction::Sub) { TheNeg->setHasNoUnsignedWrap(false); TheNeg->setHasNoSignedWrap(false); } else { TheNeg->andIRFlags(BI); } + ToRedo.insert(TheNeg); return TheNeg; } // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. - return CreateNeg(V, V->getName() + ".neg", BI, BI); + BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI); + ToRedo.insert(NewNeg); + return NewNeg; } /// Return true if we should break up this subtract of X-Y into (X + -Y). @@ -1025,14 +1003,15 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// If we have (X-Y), and if either X is an add, or if this is only used by an /// add, transform this into (X+(0-Y)) to promote better reassociation. -static BinaryOperator *BreakUpSubtract(Instruction *Sub) { +static BinaryOperator * +BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. // - Value *NegVal = NegateValue(Sub->getOperand(1), Sub); + Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. @@ -1166,7 +1145,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { return nullptr; } - BasicBlock::iterator InsertPt = BO; ++InsertPt; + BasicBlock::iterator InsertPt = ++BO->getIterator(); // If this was just a single multiply, remove the multiply and return the only // remaining operand. 
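The redo list that BreakUpSubtract and NegateValue now thread through is a SetVector of asserting value handles: insertion order is preserved for deterministic revisits, duplicates are dropped, and AssertingVH aborts (in asserts builds) if an instruction is destroyed while still queued. A minimal sketch of the container's contract, with drainRedoList as an illustrative name:

    #include "llvm/ADT/SetVector.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/ValueHandle.h"
    using namespace llvm;

    typedef SetVector<AssertingVH<Instruction>> RedoListTy;

    static void drainRedoList(RedoListTy &ToRedo) {
      while (!ToRedo.empty()) {
        Instruction *I = ToRedo.pop_back_val(); // LIFO, like RedoInsts
        (void)I; // revisit I here
      }
    }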
@@ -1179,7 +1158,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { } if (NeedsNegate) - V = CreateNeg(V, "neg", InsertPt, BO); + V = CreateNeg(V, "neg", &*InsertPt, BO); return V; } @@ -1250,7 +1229,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode, return nullptr; } -/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and +/// Helper function of CombineXorOpnd(). It creates a bitwise-and /// instruction with the given two operands, and return the resulting /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will @@ -2083,7 +2062,7 @@ void Reassociate::OptimizeInst(Instruction *I) { return; // Don't optimize floating point instructions that don't have unsafe algebra. - if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra()) + if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; // Do not reassociate boolean (i1) expressions. We want to preserve the @@ -2099,7 +2078,7 @@ void Reassociate::OptimizeInst(Instruction *I) { // see if we can convert it to X+-Y. if (I->getOpcode() == Instruction::Sub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2110,6 +2089,12 @@ void Reassociate::OptimizeInst(Instruction *I) { (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::Mul))) { Instruction *NI = LowerNegateToMultiply(I); + // If the negate was simplified, revisit the users to see if we can + // reassociate further. + for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2117,7 +2102,7 @@ void Reassociate::OptimizeInst(Instruction *I) { } } else if (I->getOpcode() == Instruction::FSub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2127,7 +2112,13 @@ void Reassociate::OptimizeInst(Instruction *I) { if (isReassociableOp(I->getOperand(1), Instruction::FMul) && (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::FMul))) { + // If the negate was simplified, revisit the users to see if we can + // reassociate further. Instruction *NI = LowerNegateToMultiply(I); + for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2142,8 +2133,14 @@ void Reassociate::OptimizeInst(Instruction *I) { // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. unsigned Opcode = BO->getOpcode(); - if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) + if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) { + // During the initial run we will get to the root of the tree. + // But if we get here while we are redoing instructions, there is no + // guarantee that the root will be visited. So Redo later + if (BO->user_back() != BO) + RedoInsts.insert(BO->user_back()); return; + } // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. @@ -2250,10 +2247,10 @@ bool Reassociate::runOnFunction(Function &F) { for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Optimize every instruction in the basic block. 
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) - if (isInstructionTriviallyDead(II)) { - EraseInst(II++); + if (isInstructionTriviallyDead(&*II)) { + EraseInst(&*II++); } else { - OptimizeInst(II); + OptimizeInst(&*II); assert(II->getParent() == BI && "Moved to a different block!"); ++II; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 1b46727c17bb..915f89780c08 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -82,10 +82,9 @@ bool RegToMem::runOnFunction(Function &F) { BasicBlock::iterator I = BBEntry->begin(); while (isa<AllocaInst>(I)) ++I; - CastInst *AllocaInsertionPoint = - new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), - Type::getInt32Ty(F.getContext()), - "reg2mem alloca point", I); + CastInst *AllocaInsertionPoint = new BitCastInst( + Constant::getNullValue(Type::getInt32Ty(F.getContext())), + Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I); // Find the escaped instructions. But don't create stack slots for // allocas in entry block. @@ -95,7 +94,7 @@ bool RegToMem::runOnFunction(Function &F) { for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); iib != iie; ++iib) { if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) && - valueEscapes(iib)) { + valueEscapes(&*iib)) { WorkList.push_front(&*iib); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index ae2ae3af0c7a..db127c3f7b4e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -14,12 +14,14 @@ #include "llvm/Pass.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/MapVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" @@ -46,10 +48,6 @@ using namespace llvm; -// Print tracing output -static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden, - cl::init(false)); - // Print the liveset found at the insert location static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, cl::init(false)); @@ -74,6 +72,12 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); +static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden, + cl::init(false)); +static cl::opt<bool> + AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", + cl::Hidden, cl::init(true)); + namespace { struct RewriteStatepointsForGC : public ModulePass { static char ID; // Pass identification, replacement for typeid @@ -88,10 +92,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripDereferenceabilityInfo asserts that shouldRewriteStatepointsIn + // stripNonValidAttributes asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. 
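The recurring "&*II++" edits in these hunks come from the iterator/pointer split: BasicBlock::iterator no longer converts implicitly to Instruction *. The erase-while-iterating idiom used above, as a self-contained sketch (deleteDeadInBlock is an illustrative name):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    static void deleteDeadInBlock(BasicBlock &BB) {
      for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
        Instruction *I = &*II++; // advance first; erasing I keeps II valid
        if (isInstructionTriviallyDead(I))
          I->eraseFromParent();
      }
    }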
-      stripDereferenceabilityInfo(M);
+      stripNonValidAttributes(M);
     }
 
     return Changed;
   }
 
@@ -108,15 +112,16 @@ struct RewriteStatepointsForGC : public ModulePass {
   /// dereferenceability that are no longer valid/correct after
   /// RewriteStatepointsForGC has run. This is because semantically, after
   /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
-  /// heap. stripDereferenceabilityInfo (conservatively) restores correctness
+  /// heap. stripNonValidAttributes (conservatively) restores correctness
   /// by erasing all attributes in the module that externally imply
   /// dereferenceability.
-  ///
-  void stripDereferenceabilityInfo(Module &M);
+  /// Similar reasoning also applies to the noalias attributes. gc.statepoint
+  /// can touch the entire heap including noalias objects.
+  void stripNonValidAttributes(Module &M);
 
-  // Helpers for stripDereferenceabilityInfo
-  void stripDereferenceabilityInfoFromBody(Function &F);
-  void stripDereferenceabilityInfoFromPrototype(Function &F);
+  // Helpers for stripNonValidAttributes
+  void stripNonValidAttributesFromBody(Function &F);
+  void stripNonValidAttributesFromPrototype(Function &F);
 };
 } // namespace
 
@@ -160,15 +165,16 @@ struct GCPtrLivenessData {
 // base relation will remain. Internally, we add a mixture of the two
 // types, then update all the second type to the first type
 typedef DenseMap<Value *, Value *> DefiningValueMapTy;
-typedef DenseSet<llvm::Value *> StatepointLiveSetTy;
-typedef DenseMap<Instruction *, Value *> RematerializedValueMapTy;
+typedef DenseSet<Value *> StatepointLiveSetTy;
+typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>>
+    RematerializedValueMapTy;
 
 struct PartiallyConstructedSafepointRecord {
-  /// The set of values known to be live accross this safepoint
-  StatepointLiveSetTy liveset;
+  /// The set of values known to be live across this safepoint
+  StatepointLiveSetTy LiveSet;
 
   /// Mapping from live pointers to a base-defining-value
-  DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+  DenseMap<Value *, Value *> PointerToBase;
 
   /// The *new* gc.statepoint instruction itself. This produces the token
   /// that normal path gc.relocates and the gc.result are tied to.
@@ -179,12 +185,26 @@ struct PartiallyConstructedSafepointRecord {
   Instruction *UnwindToken;
 
   /// Record live values we rematerialized instead of relocating.
-  /// They are not included into 'liveset' field.
+  /// They are not included in the 'LiveSet' field.
   /// Maps each rematerialized copy to its original value.
RematerializedValueMapTy RematerializedValues; }; } +static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { + assert(UseDeoptBundles && "Should not be called otherwise!"); + + Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt"); + + if (!DeoptBundle.hasValue()) { + assert(AllowStatepointWithNoDeoptInfo && + "Found non-leaf call without deopt info!"); + return None; + } + + return DeoptBundle.getValue().Inputs; +} + /// Compute the live-in set for every basic block in the function static void computeLiveInValues(DominatorTree &DT, Function &F, GCPtrLivenessData &Data); @@ -195,10 +215,10 @@ static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data, StatepointLiveSetTy &out); // TODO: Once we can get to the GCStrategy, this becomes -// Optional<bool> isGCManagedPointer(const Value *V) const override { +// Optional<bool> isGCManagedPointer(const Type *Ty) const override { -static bool isGCPointerType(const Type *T) { - if (const PointerType *PT = dyn_cast<PointerType>(T)) +static bool isGCPointerType(Type *T) { + if (auto *PT = dyn_cast<PointerType>(T)) // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. @@ -233,9 +253,8 @@ static bool containsGCPtrType(Type *Ty) { if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast<StructType>(Ty)) - return std::any_of( - ST->subtypes().begin(), ST->subtypes().end(), - [](Type *SubType) { return containsGCPtrType(SubType); }); + return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), + containsGCPtrType); return false; } @@ -247,7 +266,7 @@ static bool isUnhandledGCPointerType(Type *Ty) { } #endif -static bool order_by_name(llvm::Value *a, llvm::Value *b) { +static bool order_by_name(Value *a, Value *b) { if (a->hasName() && b->hasName()) { return -1 == a->getName().compare(b->getName()); } else if (a->hasName() && !b->hasName()) { @@ -260,6 +279,13 @@ static bool order_by_name(llvm::Value *a, llvm::Value *b) { } } +// Return the name of the value suffixed with the provided value, or if the +// value didn't have a name, the default value specified. +static std::string suffixed_name_or(Value *V, StringRef Suffix, + StringRef DefaultName) { + return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str(); +} + // Conservatively identifies any definitions which might be live at the // given instruction. The analysis is performed immediately before the // given instruction. 
Values defined by that instruction are not considered @@ -269,30 +295,56 @@ static void analyzeParsePointLiveness( const CallSite &CS, PartiallyConstructedSafepointRecord &result) { Instruction *inst = CS.getInstruction(); - StatepointLiveSetTy liveset; - findLiveSetAtInst(inst, OriginalLivenessData, liveset); + StatepointLiveSetTy LiveSet; + findLiveSetAtInst(inst, OriginalLivenessData, LiveSet); if (PrintLiveSet) { // Note: This output is used by several of the test cases - // The order of elemtns in a set is not stable, put them in a vec and sort + // The order of elements in a set is not stable, put them in a vec and sort // by name - SmallVector<Value *, 64> temp; - temp.insert(temp.end(), liveset.begin(), liveset.end()); - std::sort(temp.begin(), temp.end(), order_by_name); + SmallVector<Value *, 64> Temp; + Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end()); + std::sort(Temp.begin(), Temp.end(), order_by_name); errs() << "Live Variables:\n"; - for (Value *V : temp) { - errs() << " " << V->getName(); // no newline - V->dump(); - } + for (Value *V : Temp) + dbgs() << " " << V->getName() << " " << *V << "\n"; } if (PrintLiveSetSize) { errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; - errs() << "Number live values: " << liveset.size() << "\n"; + errs() << "Number live values: " << LiveSet.size() << "\n"; + } + result.LiveSet = LiveSet; +} + +static bool isKnownBaseResult(Value *V); +namespace { +/// A single base defining value - An immediate base defining value for an +/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'. +/// For instructions which have multiple pointer [vector] inputs or that +/// transition between vector and scalar types, there is no immediate base +/// defining value. The 'base defining value' for 'Def' is the transitive +/// closure of this relation stopping at the first instruction which has no +/// immediate base defining value. The b.d.v. might itself be a base pointer, +/// but it can also be an arbitrary derived pointer. +struct BaseDefiningValueResult { + /// Contains the value which is the base defining value. + Value * const BDV; + /// True if the base defining value is also known to be an actual base + /// pointer. + const bool IsKnownBase; + BaseDefiningValueResult(Value *BDV, bool IsKnownBase) + : BDV(BDV), IsKnownBase(IsKnownBase) { +#ifndef NDEBUG + // Check consistency between new and old means of checking whether a BDV is + // a base. + bool MustBeBase = isKnownBaseResult(BDV); + assert(!MustBeBase || MustBeBase == IsKnownBase); +#endif } - result.liveset = liveset; +}; } -static Value *findBaseDefiningValue(Value *I); +static BaseDefiningValueResult findBaseDefiningValue(Value *I); /// Return a base defining value for the 'Index' element of the given vector /// instruction 'I'. If Index is null, returns a BDV for the entire vector @@ -303,8 +355,8 @@ static Value *findBaseDefiningValue(Value *I); /// vector returned is a BDV (and possibly a base) of the entire vector 'I'. /// If the later, the return pointer is a BDV (or possibly a base) for the /// particular element in 'I'. 
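The walk described by BaseDefiningValueResult bottoms out at instructions that merge or reshape pointers; those are exactly the ones findBasePointer must later clone in "base only" form. A restatement of the updated isKnownBaseResult test from the hunk further down, as a sketch (needsBaseConstruction is an illustrative name):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static bool needsBaseConstruction(Value *V) {
      // PHIs and selects merge potentially different derived pointers; the
      // vector instructions can move a derived pointer between the scalar
      // and vector worlds.
      return isa<PHINode>(V) || isa<SelectInst>(V) ||
             isa<ExtractElementInst>(V) || isa<InsertElementInst>(V) ||
             isa<ShuffleVectorInst>(V);
    }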
-static std::pair<Value *, bool> -findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { +static BaseDefiningValueResult +findBaseDefiningValueOfVector(Value *I) { assert(I->getType()->isVectorTy() && cast<VectorType>(I->getType())->getElementType()->isPointerTy() && "Illegal to ask for the base pointer of a non-pointer type"); @@ -314,7 +366,7 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { if (isa<Argument>(I)) // An incoming argument to the function is a base pointer - return std::make_pair(I, true); + return BaseDefiningValueResult(I, true); // We shouldn't see the address of a global as a vector value? assert(!isa<GlobalVariable>(I) && @@ -325,7 +377,7 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { if (isa<UndefValue>(I)) // utterly meaningless, but useful for dealing with partially optimized // code. - return std::make_pair(I, true); + return BaseDefiningValueResult(I, true); // Due to inheritance, this must be _after_ the global variable and undef // checks @@ -333,31 +385,17 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && "order of checks wrong!"); assert(Con->isNullValue() && "null is the only case which makes sense"); - return std::make_pair(Con, true); + return BaseDefiningValueResult(Con, true); } if (isa<LoadInst>(I)) - return std::make_pair(I, true); - - // For an insert element, we might be able to look through it if we know - // something about the indexes. - if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(I)) { - if (Index) { - Value *InsertIndex = IEI->getOperand(2); - // This index is inserting the value, look for its BDV - if (InsertIndex == Index) - return std::make_pair(findBaseDefiningValue(IEI->getOperand(1)), false); - // Both constant, and can't be equal per above. This insert is definitely - // not relevant, look back at the rest of the vector and keep trying. - if (isa<ConstantInt>(Index) && isa<ConstantInt>(InsertIndex)) - return findBaseDefiningValueOfVector(IEI->getOperand(0), Index); - } - + return BaseDefiningValueResult(I, true); + + if (isa<InsertElementInst>(I)) // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. - return std::make_pair(IEI, false); - } + return BaseDefiningValueResult(I, false); if (isa<ShuffleVectorInst>(I)) // We don't know whether this vector contains entirely base pointers or @@ -365,105 +403,62 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { // duplicate code as needed to construct a parallel vector of bases. // TODO: There a number of local optimizations which could be applied here // for particular sufflevector patterns. - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "unknown vector instruction - no base found for vector element"); - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); } -static bool isKnownBaseResult(Value *V); - /// Helper function for findBasePointer - Will return a value which either a) -/// defines the base pointer for the input or b) blocks the simple search -/// (i.e. 
a PHI or Select of two derived pointers) -static Value *findBaseDefiningValue(Value *I) { +/// defines the base pointer for the input, b) blocks the simple search +/// (i.e. a PHI or Select of two derived pointers), or c) involves a change +/// from pointer to vector type or back. +static BaseDefiningValueResult findBaseDefiningValue(Value *I) { if (I->getType()->isVectorTy()) - return findBaseDefiningValueOfVector(I).first; + return findBaseDefiningValueOfVector(I); assert(I->getType()->isPointerTy() && "Illegal to ask for the base pointer of a non-pointer type"); - // This case is a bit of a hack - it only handles extracts from vectors which - // trivially contain only base pointers or cases where we can directly match - // the index of the original extract element to an insertion into the vector. - // See note inside the function for how to improve this. - if (auto *EEI = dyn_cast<ExtractElementInst>(I)) { - Value *VectorOperand = EEI->getVectorOperand(); - Value *Index = EEI->getIndexOperand(); - std::pair<Value *, bool> pair = - findBaseDefiningValueOfVector(VectorOperand, Index); - Value *VectorBase = pair.first; - if (VectorBase->getType()->isPointerTy()) - // We found a BDV for this specific element with the vector. This is an - // optimization, but in practice it covers most of the useful cases - // created via scalarization. - return VectorBase; - else { - assert(VectorBase->getType()->isVectorTy()); - if (pair.second) - // If the entire vector returned is known to be entirely base pointers, - // then the extractelement is valid base for this value. - return EEI; - else { - // Otherwise, we have an instruction which potentially produces a - // derived pointer and we need findBasePointers to clone code for us - // such that we can create an instruction which produces the - // accompanying base pointer. - // Note: This code is currently rather incomplete. We don't currently - // support the general form of shufflevector of insertelement. - // Conceptually, these are just 'base defining values' of the same - // variety as phi or select instructions. We need to update the - // findBasePointers algorithm to insert new 'base-only' versions of the - // original instructions. This is relative straight forward to do, but - // the case which would motivate the work hasn't shown up in real - // workloads yet. - assert((isa<PHINode>(VectorBase) || isa<SelectInst>(VectorBase)) && - "need to extend findBasePointers for generic vector" - "instruction cases"); - return VectorBase; - } - } - } - if (isa<Argument>(I)) // An incoming argument to the function is a base pointer // We should have never reached here if this argument isn't an gc value - return I; + return BaseDefiningValueResult(I, true); if (isa<GlobalVariable>(I)) // base case - return I; + return BaseDefiningValueResult(I, true); // inlining could possibly introduce phi node that contains // undef if callee has multiple returns if (isa<UndefValue>(I)) // utterly meaningless, but useful for dealing with // partially optimized code. - return I; + return BaseDefiningValueResult(I, true); // Due to inheritance, this must be _after_ the global variable and undef // checks - if (Constant *Con = dyn_cast<Constant>(I)) { + if (isa<Constant>(I)) { assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && "order of checks wrong!"); - // Note: Finding a constant base for something marked for relocation - // doesn't really make sense. 
The most likely case is either a) some
-  // screwed up the address space usage or b) your validating against
-  // compiled C++ code w/o the proper separation. The only real exception
-  // is a null pointer. You could have generic code written to index of
-  // off a potentially null value and have proven it null. We also use
-  // null pointers in dead paths of relocation phis (which we might later
-  // want to find a base pointer for).
-  assert(isa<ConstantPointerNull>(Con) &&
-         "null is the only case which makes sense");
-  return Con;
+    // Note: Even for frontends which don't have constant references, we can
+    // see constants appearing after optimizations. A simple example is
+    // specialization of an address computation on null feeding into a merge
+    // point where the actual use of the now-constant input is protected by
+    // another null check. (e.g. test4 in constants.ll)
+    return BaseDefiningValueResult(I, true);
  }

  if (CastInst *CI = dyn_cast<CastInst>(I)) {
    Value *Def = CI->stripPointerCasts();
+    // If stripping pointer casts changes the address space there is an
+    // addrspacecast in between.
+    assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
+               cast<PointerType>(CI->getType())->getAddressSpace() &&
+           "unsupported addrspacecast");
    // If we find a cast instruction here, it means we've found a cast which is
    // not simply a pointer cast (i.e. an inttoptr). We don't know how to
    // handle int->ptr conversion.
@@ -472,7 +467,9 @@ static Value *findBaseDefiningValue(Value *I) {
  }

  if (isa<LoadInst>(I))
-    return I; // The value loaded is an gc base itself
+    // The value loaded is a gc base itself
+    return BaseDefiningValueResult(I, true);
+
  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
    // The base of this GEP is the base
@@ -480,14 +477,11 @@
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
-    case Intrinsic::experimental_gc_result_ptr:
    default:
      // fall through to general call handling
      break;
    case Intrinsic::experimental_gc_statepoint:
-    case Intrinsic::experimental_gc_result_float:
-    case Intrinsic::experimental_gc_result_int:
-      llvm_unreachable("these don't produce pointers");
+      llvm_unreachable("statepoints don't produce pointers");
    case Intrinsic::experimental_gc_relocate: {
      // Rerunning safepoint insertion after safepoints are already
      // inserted is not supported. It could probably be made to work,
@@ -506,17 +500,17 @@
  // pointers. This should probably be generalized via attributes to support
  // both source language and internal functions.
  if (isa<CallInst>(I) || isa<InvokeInst>(I))
-    return I;
+    return BaseDefiningValueResult(I, true);

  // I have absolutely no idea how to implement this part yet. It's not
-  // neccessarily hard, I just haven't really looked at it yet.
+  // necessarily hard, I just haven't really looked at it yet.
  assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");

  if (isa<AtomicCmpXchgInst>(I))
    // A CAS is effectively a atomic store and load combined under a
    // predicate. From the perspective of base pointers, we just treat it
    // like a load.
-    return I;
+    return BaseDefiningValueResult(I, true);

  assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
                                   "binary ops which don't apply to pointers");

  // The aggregate ops. Aggregates can either be in the heap or on the
  // stack, but in either case, this is simply a field load. As a result,
  // this is a defining definition of the base just like a load is.
  if (isa<ExtractValueInst>(I))
-    return I;
+    return BaseDefiningValueResult(I, true);

  // We should never see an insert vector since that would require we be
  // tracing back a struct value not a pointer value.
  assert(!isa<InsertValueInst>(I) &&
         "Base pointer for a struct is meaningless");

+  // An extractelement produces a base result exactly when its input does.
+  // We may need to insert a parallel instruction to extract the appropriate
+  // element out of the base vector corresponding to the input. Given this,
+  // it's analogous to the phi and select case even though it's not a merge.
+  if (isa<ExtractElementInst>(I))
+    // Note: There are a lot of obvious peephole cases here. These are
+    // deliberately handled after the main base pointer inference algorithm
+    // to make writing test cases to exercise that code easier.
+    return BaseDefiningValueResult(I, false);
+
  // The last two cases here don't return a base pointer. Instead, they
-  // return a value which dynamically selects from amoung several base
+  // return a value which dynamically selects from among several base
  // derived pointers (each with it's own base potentially). It's the job of
  // the caller to resolve these.
  assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
         "missing instruction case in findBaseDefiningValing");
-  return I;
+  return BaseDefiningValueResult(I, false);
}

/// Returns the base defining value for this value.
static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
  Value *&Cached = Cache[I];
  if (!Cached) {
-    Cached = findBaseDefiningValue(I);
+    Cached = findBaseDefiningValue(I).BDV;
+    DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+                 << Cached->getName() << "\n");
  }
  assert(Cache[I] != nullptr);
-
-  if (TraceLSP) {
-    dbgs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName()
-           << "\n";
-  }
  return Cached;
}

@@ -572,7 +573,9 @@
/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
/// is it known to be a base pointer? Or do we need to continue searching.
static bool isKnownBaseResult(Value *V) {
-  if (!isa<PHINode>(V) && !isa<SelectInst>(V)) {
+  if (!isa<PHINode>(V) && !isa<SelectInst>(V) &&
+      !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
+      !isa<ShuffleVectorInst>(V)) {
    // no recursion possible
    return true;
  }
@@ -587,17 +590,19 @@
  return false;
}

-// TODO: find a better name for this
namespace {
-class PhiState {
+/// Models the state of a single base defining value in the findBasePointer
+/// algorithm for determining where a new instruction is needed to propagate
+/// the base of this BDV.
+class BDVState {
public:
  enum Status { Unknown, Base, Conflict };

-  PhiState(Status s, Value *b = nullptr) : status(s), base(b) {
+  BDVState(Status s, Value *b = nullptr) : status(s), base(b) {
    assert(status != Base || b);
  }
-  PhiState(Value *b) : status(Base), base(b) {}
-  PhiState() : status(Unknown), base(nullptr) {}
+  explicit BDVState(Value *b) : status(Base), base(b) {}
+  BDVState() : status(Unknown), base(nullptr) {}

  Status getStatus() const { return status; }
  Value *getBase() const { return base; }
@@ -606,72 +611,80 @@ public:
  bool isUnknown() const { return getStatus() == Unknown; }
  bool isConflict() const { return getStatus() == Conflict; }

-  bool operator==(const PhiState &other) const {
+  bool operator==(const BDVState &other) const {
    return base == other.base && status == other.status;
  }

-  bool operator!=(const PhiState &other) const { return !(*this == other); }
+  bool operator!=(const BDVState &other) const { return !(*this == other); }

-  void dump() {
-    errs() << status << " (" << base << " - "
-           << (base ? base->getName() : "nullptr") << "): ";
+  LLVM_DUMP_METHOD
+  void dump() const { print(dbgs()); dbgs() << '\n'; }
+
+  void print(raw_ostream &OS) const {
+    switch (status) {
+    case Unknown:
+      OS << "U";
+      break;
+    case Base:
+      OS << "B";
+      break;
+    case Conflict:
+      OS << "C";
+      break;
+    };
+    OS << " (" << base << " - "
+       << (base ? base->getName() : "nullptr") << "): ";
  }

private:
  Status status;
-  Value *base; // non null only if status == base
+  AssertingVH<Value> base; // non null only if status == base
};
+}

-typedef DenseMap<Value *, PhiState> ConflictStateMapTy;
-// Values of type PhiState form a lattice, and this is a helper
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
+  State.print(OS);
+  return OS;
+}
+#endif
+
+namespace {
+// Values of type BDVState form a lattice, and this is a helper
// class that implementes the meet operation.  The meat of the meet
-// operation is implemented in MeetPhiStates::pureMeet
-class MeetPhiStates {
+// operation is implemented in MeetBDVStates::pureMeet
+class MeetBDVStates {
public:
-  // phiStates is a mapping from PHINodes and SelectInst's to PhiStates.
-  explicit MeetPhiStates(const ConflictStateMapTy &phiStates)
-      : phiStates(phiStates) {}
-
-  // Destructively meet the current result with the base V. V can
-  // either be a merge instruction (SelectInst / PHINode), in which
-  // case its status is looked up in the phiStates map; or a regular
-  // SSA value, in which case it is assumed to be a base.
-  void meetWith(Value *V) {
-    PhiState otherState = getStateForBDV(V);
-    assert((MeetPhiStates::pureMeet(otherState, currentResult) ==
-            MeetPhiStates::pureMeet(currentResult, otherState)) &&
-           "math is wrong: meet does not commute!");
-    currentResult = MeetPhiStates::pureMeet(otherState, currentResult);
+  /// Initializes the currentResult to the TOP state so that it can be met with
+  /// any other state to produce that state.
+  MeetBDVStates() {}
+
+  // Destructively meet the current result with the given BDVState
+  void meetWith(BDVState otherState) {
+    currentResult = meet(otherState, currentResult);
  }

-  PhiState getResult() const { return currentResult; }
+  BDVState getResult() const { return currentResult; }

private:
-  const ConflictStateMapTy &phiStates;
-  PhiState currentResult;
-
-  /// Return a phi state for a base defining value.
We'll generate a new - /// base state for known bases and expect to find a cached state otherwise - PhiState getStateForBDV(Value *baseValue) { - if (isKnownBaseResult(baseValue)) { - return PhiState(baseValue); - } else { - return lookupFromMap(baseValue); - } - } + BDVState currentResult; - PhiState lookupFromMap(Value *V) { - auto I = phiStates.find(V); - assert(I != phiStates.end() && "lookup failed!"); - return I->second; + /// Perform a meet operation on two elements of the BDVState lattice. + static BDVState meet(BDVState LHS, BDVState RHS) { + assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) && + "math is wrong: meet does not commute!"); + BDVState Result = pureMeet(LHS, RHS); + DEBUG(dbgs() << "meet of " << LHS << " with " << RHS + << " produced " << Result << "\n"); + return Result; } - static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) { + static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) { switch (stateA.getStatus()) { - case PhiState::Unknown: + case BDVState::Unknown: return stateB; - case PhiState::Base: + case BDVState::Base: assert(stateA.getBase() && "can't be null"); if (stateB.isUnknown()) return stateA; @@ -681,18 +694,20 @@ private: assert(stateA == stateB && "equality broken!"); return stateA; } - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); } assert(stateB.isConflict() && "only three states!"); - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); - case PhiState::Conflict: + case BDVState::Conflict: return stateA; } llvm_unreachable("only three states!"); } }; } + + /// For a given value or instruction, figure out what base ptr it's derived /// from. For gc objects, this is simply itself. On success, returns a value /// which is the base pointer. (This is reliable and can be used for @@ -723,171 +738,252 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // // Note: A simpler form of this would be to add the conflict form of all // PHIs without running the optimistic algorithm. This would be - // analougous to pessimistic data flow and would likely lead to an + // analogous to pessimistic data flow and would likely lead to an // overall worse solution. - ConflictStateMapTy states; - states[def] = PhiState(); - // Recursively fill in all phis & selects reachable from the initial one - // for which we don't already know a definite base value for - // TODO: This should be rewritten with a worklist - bool done = false; - while (!done) { - done = true; - // Since we're adding elements to 'states' as we run, we can't keep - // iterators into the set. 
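The three states above form a tiny lattice, and the meet just defined is easy to sanity-check in isolation. Below is a minimal, LLVM-free sketch of the same operation; every name in it (Elem, BaseId, meet) is invented for illustration, with an int id standing in for the base Value*:

#include <cassert>
#include <cstdio>

// Lattice element. The int id plays the role of the base Value*; it is only
// meaningful when S == State::Base.
enum class State { Unknown, Base, Conflict };
struct Elem {
  State S;
  int BaseId;
};

// Unknown (TOP) is the identity, Conflict (BOTTOM) absorbs, and two
// different known bases collide into Conflict, mirroring pureMeet above.
static Elem meet(Elem A, Elem B) {
  if (A.S == State::Unknown) return B;
  if (B.S == State::Unknown) return A;
  if (A.S == State::Conflict || B.S == State::Conflict)
    return Elem{State::Conflict, 0};
  return A.BaseId == B.BaseId ? A : Elem{State::Conflict, 0};
}

int main() {
  Elem U{State::Unknown, 0}, X{State::Base, 1}, Y{State::Base, 2};
  assert(meet(X, U).S == State::Base);              // identity
  assert(meet(X, Y).S == State::Conflict);          // distinct bases conflict
  assert(meet(X, Y).S == meet(Y, X).S);             // meet commutes
  assert(meet(meet(X, Y), X).S == State::Conflict); // Conflict absorbs
  std::puts("lattice ok");
}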
-    SmallVector<Value *, 16> Keys;
-    Keys.reserve(states.size());
-    for (auto Pair : states) {
-      Value *V = Pair.first;
-      Keys.push_back(V);
-    }
-    for (Value *v : Keys) {
-      assert(!isKnownBaseResult(v) && "why did it get added?");
-      if (PHINode *phi = dyn_cast<PHINode>(v)) {
-        assert(phi->getNumIncomingValues() > 0 &&
-               "zero input phis are illegal");
-        for (Value *InVal : phi->incoming_values()) {
-          Value *local = findBaseOrBDV(InVal, cache);
-          if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
-            states[local] = PhiState();
-            done = false;
-          }
-        }
-      } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
-        Value *local = findBaseOrBDV(sel->getTrueValue(), cache);
-        if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
-          states[local] = PhiState();
-          done = false;
-        }
-        local = findBaseOrBDV(sel->getFalseValue(), cache);
-        if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
-          states[local] = PhiState();
-          done = false;
-        }
+#ifndef NDEBUG
+  auto isExpectedBDVType = [](Value *BDV) {
+    return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
+           isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV);
+  };
+#endif
+
+  // Once populated, this will contain a mapping from each potentially
+  // non-base BDV to a lattice value (described above) which corresponds to
+  // that BDV.
+  // We use the order of insertion (DFS over the def/use graph) to provide a
+  // stable deterministic ordering for visiting DenseMaps (which are unordered)
+  // below. This is important for deterministic compilation.
+  MapVector<Value *, BDVState> States;
+
+  // Recursively fill in all base defining values reachable from the initial
+  // one for which we don't already know a definite base value.
+  /* scope */ {
+    SmallVector<Value*, 16> Worklist;
+    Worklist.push_back(def);
+    States.insert(std::make_pair(def, BDVState()));
+    while (!Worklist.empty()) {
+      Value *Current = Worklist.pop_back_val();
+      assert(!isKnownBaseResult(Current) && "why did it get added?");
+
+      auto visitIncomingValue = [&](Value *InVal) {
+        Value *Base = findBaseOrBDV(InVal, cache);
+        if (isKnownBaseResult(Base))
+          // Known bases won't need new instructions introduced and can be
+          // ignored safely
+          return;
+        assert(isExpectedBDVType(Base) && "the only non-base values "
+               "we see should be base defining values");
+        if (States.insert(std::make_pair(Base, BDVState())).second)
+          Worklist.push_back(Base);
+      };
+      if (PHINode *Phi = dyn_cast<PHINode>(Current)) {
+        for (Value *InVal : Phi->incoming_values())
+          visitIncomingValue(InVal);
+      } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) {
+        visitIncomingValue(Sel->getTrueValue());
+        visitIncomingValue(Sel->getFalseValue());
+      } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
+        visitIncomingValue(EE->getVectorOperand());
+      } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
+        visitIncomingValue(IE->getOperand(0)); // vector operand
+        visitIncomingValue(IE->getOperand(1)); // scalar operand
+      } else {
+        // There is one known class of instructions we know we don't handle.
+        assert(isa<ShuffleVectorInst>(Current));
+        llvm_unreachable("unimplemented instruction case");
+      }
    }
  }

-  if (TraceLSP) {
-    errs() << "States after initialization:\n";
-    for (auto Pair : states) {
-      Instruction *v = cast<Instruction>(Pair.first);
-      PhiState state = Pair.second;
-      state.dump();
-      v->dump();
-    }
+#ifndef NDEBUG
+  DEBUG(dbgs() << "States after initialization:\n");
+  for (auto Pair : States) {
+    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
  }
+#endif

-  // TODO: come back and revisit the state transitions around inputs which
-  // have reached conflict state. The current version seems too conservative.
+  // Return a phi state for a base defining value. We'll generate a new
+  // base state for known bases and expect to find a cached state otherwise.
+  auto getStateForBDV = [&](Value *baseValue) {
+    if (isKnownBaseResult(baseValue))
+      return BDVState(baseValue);
+    auto I = States.find(baseValue);
+    assert(I != States.end() && "lookup failed!");
+    return I->second;
+  };

  bool progress = true;
  while (progress) {
#ifndef NDEBUG
-    size_t oldSize = states.size();
+    const size_t oldSize = States.size();
#endif
    progress = false;
-    // We're only changing keys in this loop, thus safe to keep iterators
-    for (auto Pair : states) {
-      MeetPhiStates calculateMeet(states);
-      Value *v = Pair.first;
-      assert(!isKnownBaseResult(v) && "why did it get added?");
-      if (SelectInst *select = dyn_cast<SelectInst>(v)) {
-        calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache));
-        calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache));
-      } else
-        for (Value *Val : cast<PHINode>(v)->incoming_values())
-          calculateMeet.meetWith(findBaseOrBDV(Val, cache));
-
-      PhiState oldState = states[v];
-      PhiState newState = calculateMeet.getResult();
+    // We're only changing values in this loop, thus safe to keep iterators.
+    // Since this is computing a fixed point, the order of visit does not
+    // affect the result. TODO: We could use a worklist here and make this run
+    // much faster.
+    for (auto Pair : States) {
+      Value *BDV = Pair.first;
+      assert(!isKnownBaseResult(BDV) && "why did it get added?");
+
+      // Given an input value for the current instruction, return a BDVState
+      // instance which represents the BDV of that value.
+      auto getStateForInput = [&](Value *V) mutable {
+        Value *BDV = findBaseOrBDV(V, cache);
+        return getStateForBDV(BDV);
+      };
+
+      MeetBDVStates calculateMeet;
+      if (SelectInst *select = dyn_cast<SelectInst>(BDV)) {
+        calculateMeet.meetWith(getStateForInput(select->getTrueValue()));
+        calculateMeet.meetWith(getStateForInput(select->getFalseValue()));
+      } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) {
+        for (Value *Val : Phi->incoming_values())
+          calculateMeet.meetWith(getStateForInput(Val));
+      } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
+        // The 'meet' for an extractelement is slightly trivial, but it's still
+        // useful in that it drives us to conflict if our input is.
+        calculateMeet.meetWith(getStateForInput(EE->getVectorOperand()));
+      } else {
+        // Given there's an inherent type mismatch between the operands, this
+        // will *always* produce Conflict.
+        auto *IE = cast<InsertElementInst>(BDV);
+        calculateMeet.meetWith(getStateForInput(IE->getOperand(0)));
+        calculateMeet.meetWith(getStateForInput(IE->getOperand(1)));
+      }
+
+      BDVState oldState = States[BDV];
+      BDVState newState = calculateMeet.getResult();
      if (oldState != newState) {
        progress = true;
-        states[v] = newState;
+        States[BDV] = newState;
      }
    }

-    assert(oldSize <= states.size());
-    assert(oldSize == states.size() || progress);
+    assert(oldSize == States.size() &&
+           "fixed point shouldn't be adding any new nodes to state");
  }

-  if (TraceLSP) {
-    errs() << "States after meet iteration:\n";
-    for (auto Pair : states) {
-      Instruction *v = cast<Instruction>(Pair.first);
-      PhiState state = Pair.second;
-      state.dump();
-      v->dump();
-    }
+#ifndef NDEBUG
+  DEBUG(dbgs() << "States after meet iteration:\n");
+  for (auto Pair : States) {
+    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
  }
-
+#endif
+
  // Insert Phis for all conflicts
-  // We want to keep naming deterministic in the loop that follows, so
-  // sort the keys before iteration. This is useful in allowing us to
-  // write stable tests. Note that there is no invalidation issue here.
-  SmallVector<Value *, 16> Keys;
-  Keys.reserve(states.size());
-  for (auto Pair : states) {
-    Value *V = Pair.first;
-    Keys.push_back(V);
-  }
-  std::sort(Keys.begin(), Keys.end(), order_by_name);
  // TODO: adjust naming patterns to avoid this order of iteration dependency
-  for (Value *V : Keys) {
-    Instruction *v = cast<Instruction>(V);
-    PhiState state = states[V];
-    assert(!isKnownBaseResult(v) && "why did it get added?");
-    assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
-    if (!state.isConflict())
+  for (auto Pair : States) {
+    Instruction *I = cast<Instruction>(Pair.first);
+    BDVState State = Pair.second;
+    assert(!isKnownBaseResult(I) && "why did it get added?");
+    assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+    // extractelement instructions are a bit special in that we may need to
+    // insert an extract even when we know an exact base for the instruction.
+    // The problem is that we need to convert from a vector base to a scalar
+    // base for the particular index we're interested in.
+    if (State.isBase() && isa<ExtractElementInst>(I) &&
+        isa<VectorType>(State.getBase()->getType())) {
+      auto *EE = cast<ExtractElementInst>(I);
+      // TODO: In many cases, the new instruction is just EE itself. We should
+      // exploit this, but can't do it here since it would break the invariant
+      // about the BDV not being known to be a base.
+      auto *BaseInst = ExtractElementInst::Create(State.getBase(),
+                                                  EE->getIndexOperand(),
+                                                  "base_ee", EE);
+      BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+      States[I] = BDVState(BDVState::Base, BaseInst);
+    }
+
+    // Since we're joining a vector and scalar base, they can never be the
+    // same. As a result, we should always see insert element having reached
+    // the conflict state.
+    if (isa<InsertElementInst>(I)) {
+      assert(State.isConflict());
+    }
+
+    if (!State.isConflict())
      continue;

-    if (isa<PHINode>(v)) {
-      int num_preds =
-          std::distance(pred_begin(v->getParent()), pred_end(v->getParent()));
-      assert(num_preds > 0 && "how did we reach here");
-      PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v);
-      // Add metadata marking this as a base value
-      auto *const_1 = ConstantInt::get(
-          Type::getInt32Ty(
-              v->getParent()->getParent()->getParent()->getContext()),
-          1);
-      auto MDConst = ConstantAsMetadata::get(const_1);
-      MDNode *md = MDNode::get(
-          v->getParent()->getParent()->getParent()->getContext(), MDConst);
-      phi->setMetadata("is_base_value", md);
-      states[v] = PhiState(PhiState::Conflict, phi);
+    /// Create and insert a new instruction which will represent the base of
+    /// the given instruction 'I'.
+    auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
+      if (isa<PHINode>(I)) {
+        BasicBlock *BB = I->getParent();
+        int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+        assert(NumPreds > 0 && "how did we reach here");
+        std::string Name = suffixed_name_or(I, ".base", "base_phi");
+        return PHINode::Create(I->getType(), NumPreds, Name, I);
+      } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) {
+        // The undef will be replaced later
+        UndefValue *Undef = UndefValue::get(Sel->getType());
+        std::string Name = suffixed_name_or(I, ".base", "base_select");
+        return SelectInst::Create(Sel->getCondition(), Undef,
+                                  Undef, Name, Sel);
+      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+        UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
+        std::string Name = suffixed_name_or(I, ".base", "base_ee");
+        return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
+                                          EE);
+      } else {
+        auto *IE = cast<InsertElementInst>(I);
+        UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
+        UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
+        std::string Name = suffixed_name_or(I, ".base", "base_ie");
+        return InsertElementInst::Create(VecUndef, ScalarUndef,
+                                         IE->getOperand(2), Name, IE);
+      }
+
+    };
+    Instruction *BaseInst = MakeBaseInstPlaceholder(I);
+    // Add metadata marking this as a base value
+    BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+    States[I] = BDVState(BDVState::Conflict, BaseInst);
+  }
+
+  // Returns an instruction which produces the base pointer for a given
+  // instruction. The instruction is assumed to be an input to one of the BDVs
+  // seen in the inference algorithm above. As such, we must either already
+  // know its base defining value is a base, or have inserted a new
+  // instruction to propagate the base of its BDV and have entered that newly
+  // introduced instruction into the state table. In either case, we are
+  // assured to be able to determine an instruction which produces its base
+  // pointer.
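Stepping back, the 'progress' loop above is a classic optimistic dataflow fixed point: every node starts at the lattice TOP and is repeatedly met with its inputs until nothing changes. A self-contained sketch of that pattern, with hypothetical integer node ids in place of Values and an invented edge table in place of the def/use graph:

#include <cstdio>
#include <map>
#include <vector>

// Lattice encoding: -1 = Unknown (TOP), -2 = Conflict (BOTTOM), any other
// value = "known base with this id".
static int meet(int A, int B) {
  if (A == -1) return B;
  if (B == -1) return A;
  return A == B ? A : -2;
}

int main() {
  // Hypothetical def/use edges (node -> its inputs). Nodes 0 and 1 are
  // distinct known bases; node 2 merges them like a phi; node 3 uses node 2.
  std::map<int, std::vector<int>> Inputs = {{2, {0, 1}}, {3, {2}}};
  std::map<int, int> State = {{0, 0}, {1, 1}, {2, -1}, {3, -1}};

  // Iterate to a fixed point. States only move down the lattice, so the
  // loop terminates, and the visit order does not affect the final answer.
  for (bool Progress = true; Progress;) {
    Progress = false;
    for (auto &P : Inputs) {
      int New = -1; // start at TOP, then meet in every input's state
      for (int In : P.second)
        New = meet(New, State[In]);
      if (State[P.first] != New) {
        State[P.first] = New;
        Progress = true;
      }
    }
  }
  std::printf("node2=%d node3=%d\n", State[2], State[3]); // both -2 (Conflict)
}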
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { + Value *BDV = findBaseOrBDV(Input, cache); + Value *Base = nullptr; + if (isKnownBaseResult(BDV)) { + Base = BDV; } else { - SelectInst *sel = cast<SelectInst>(v); - // The undef will be replaced later - UndefValue *undef = UndefValue::get(sel->getType()); - SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, - undef, "base_select", sel); - // Add metadata marking this as a base value - auto *const_1 = ConstantInt::get( - Type::getInt32Ty( - v->getParent()->getParent()->getParent()->getContext()), - 1); - auto MDConst = ConstantAsMetadata::get(const_1); - MDNode *md = MDNode::get( - v->getParent()->getParent()->getParent()->getContext(), MDConst); - basesel->setMetadata("is_base_value", md); - states[v] = PhiState(PhiState::Conflict, basesel); + // Either conflict or base. + assert(States.count(BDV)); + Base = States[BDV].getBase(); } - } + assert(Base && "can't be null"); + // The cast is needed since base traversal may strip away bitcasts + if (Base->getType() != Input->getType() && + InsertPt) { + Base = new BitCastInst(Base, Input->getType(), "cast", + InsertPt); + } + return Base; + }; - // Fixup all the inputs of the new PHIs - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; + // Fixup all the inputs of the new PHIs. Visit order needs to be + // deterministic and predictable because we're naming newly created + // instructions. + for (auto Pair : States) { + Instruction *BDV = cast<Instruction>(Pair.first); + BDVState State = Pair.second; - assert(!isKnownBaseResult(v) && "why did it get added?"); - assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (!state.isConflict()) + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + if (!State.isConflict()) continue; - if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { - PHINode *phi = cast<PHINode>(v); + if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) { + PHINode *phi = cast<PHINode>(BDV); unsigned NumPHIValues = phi->getNumIncomingValues(); for (unsigned i = 0; i < NumPHIValues; i++) { Value *InVal = phi->getIncomingValue(i); @@ -906,104 +1002,145 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { if (blockIndex != -1) { Value *oldBase = basephi->getIncomingValue(blockIndex); basephi->addIncoming(oldBase, InBB); + #ifndef NDEBUG - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - - // In essense this assert states: the only way two + Value *Base = getBaseForInput(InVal, nullptr); + // In essence this assert states: the only way two // values incoming from the same basic block may be // different is by being different bitcasts of the same // value. A cleanup that remains TODO is changing // findBaseOrBDV to return an llvm::Value of the correct // type (and still remain pure). This will remove the // need to add bitcasts. 
-            assert(base->stripPointerCasts() == oldBase->stripPointerCasts() &&
+            assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() &&
                   "sanity -- findBaseOrBDV should be pure!");
#endif
            continue;
          }

-          // Find either the defining value for the PHI or the normal base for
-          // a non-phi node
-          Value *base = findBaseOrBDV(InVal, cache);
-          if (!isKnownBaseResult(base)) {
-            // Either conflict or base.
-            assert(states.count(base));
-            base = states[base].getBase();
-            assert(base != nullptr && "unknown PhiState!");
-          }
-          assert(base && "can't be null");
-          // Must use original input BB since base may not be Instruction
-          // The cast is needed since base traversal may strip away bitcasts
-          if (base->getType() != basephi->getType()) {
-            base = new BitCastInst(base, basephi->getType(), "cast",
-                                   InBB->getTerminator());
-          }
-          basephi->addIncoming(base, InBB);
+          // Find the instruction which produces the base for each input. We
+          // may need to insert a bitcast in the incoming block.
+          // TODO: Need to split critical edges if insertion is needed
+          Value *Base = getBaseForInput(InVal, InBB->getTerminator());
+          basephi->addIncoming(Base, InBB);
        }
        assert(basephi->getNumIncomingValues() == NumPHIValues);
-    } else {
-      SelectInst *basesel = cast<SelectInst>(state.getBase());
-      SelectInst *sel = cast<SelectInst>(v);
+    } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) {
+      SelectInst *Sel = cast<SelectInst>(BDV);
      // Operand 1 & 2 are true, false path respectively. TODO: refactor to
      // something more safe and less hacky.
      for (int i = 1; i <= 2; i++) {
-        Value *InVal = sel->getOperand(i);
-        // Find either the defining value for the PHI or the normal base for
-        // a non-phi node
-        Value *base = findBaseOrBDV(InVal, cache);
-        if (!isKnownBaseResult(base)) {
-          // Either conflict or base.
-          assert(states.count(base));
-          base = states[base].getBase();
-          assert(base != nullptr && "unknown PhiState!");
-        }
-        assert(base && "can't be null");
-        // Must use original input BB since base may not be Instruction
-        // The cast is needed since base traversal may strip away bitcasts
-        if (base->getType() != basesel->getType()) {
-          base = new BitCastInst(base, basesel->getType(), "cast", basesel);
-        }
-        basesel->setOperand(i, base);
+        Value *InVal = Sel->getOperand(i);
+        // Find the instruction which produces the base for each input. We may
+        // need to insert a bitcast.
+        Value *Base = getBaseForInput(InVal, BaseSel);
+        BaseSel->setOperand(i, Base);
      }
+    } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) {
+      Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
+      // Find the instruction which produces the base for each input. We may
+      // need to insert a bitcast.
+      Value *Base = getBaseForInput(InVal, BaseEE);
+      BaseEE->setOperand(0, Base);
+    } else {
+      auto *BaseIE = cast<InsertElementInst>(State.getBase());
+      auto *BdvIE = cast<InsertElementInst>(BDV);
+      auto UpdateOperand = [&](int OperandIdx) {
+        Value *InVal = BdvIE->getOperand(OperandIdx);
+        Value *Base = getBaseForInput(InVal, BaseIE);
+        BaseIE->setOperand(OperandIdx, Base);
+      };
+      UpdateOperand(0); // vector operand
+      UpdateOperand(1); // scalar operand
+    }
+
+  }
+
+  // Now that we're done with the algorithm, see if we can optimize the
+  // results slightly by reducing the number of new instructions needed.
+  // Arguably, this should be integrated into the algorithm above, but doing
+  // it as a post-process step is easier to reason about for the moment.
+  DenseMap<Value *, Value *> ReverseMap;
+  SmallPtrSet<Instruction *, 16> NewInsts;
+  SmallSetVector<AssertingVH<Instruction>, 16> Worklist;
+  // Note: We need to visit the states in a deterministic order. We use the
+  // Keys we sorted above for this purpose. Note that we are papering over a
+  // bigger problem with the algorithm above - its visit order is not
+  // deterministic. A larger change is needed to fix this.
+  for (auto Pair : States) {
+    auto *BDV = Pair.first;
+    auto State = Pair.second;
+    Value *Base = State.getBase();
+    assert(BDV && Base);
+    assert(!isKnownBaseResult(BDV) && "why did it get added?");
+    assert(isKnownBaseResult(Base) &&
+           "must be something we 'know' is a base pointer");
+    if (!State.isConflict())
+      continue;
+
+    ReverseMap[Base] = BDV;
+    if (auto *BaseI = dyn_cast<Instruction>(Base)) {
+      NewInsts.insert(BaseI);
+      Worklist.insert(BaseI);
+    }
+  }
+  auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI,
+                                 Value *Replacement) {
+    // Add users which are new instructions (excluding self references)
+    for (User *U : BaseI->users())
+      if (auto *UI = dyn_cast<Instruction>(U))
+        if (NewInsts.count(UI) && UI != BaseI)
+          Worklist.insert(UI);
+    // Then do the actual replacement
+    NewInsts.erase(BaseI);
+    ReverseMap.erase(BaseI);
+    BaseI->replaceAllUsesWith(Replacement);
+    assert(States.count(BDV));
+    assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI);
+    States[BDV] = BDVState(BDVState::Conflict, Replacement);
+    BaseI->eraseFromParent();
+  };
+  const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout();
+  while (!Worklist.empty()) {
+    Instruction *BaseI = Worklist.pop_back_val();
+    assert(NewInsts.count(BaseI));
+    Value *Bdv = ReverseMap[BaseI];
+    if (auto *BdvI = dyn_cast<Instruction>(Bdv))
+      if (BaseI->isIdenticalTo(BdvI)) {
+        DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n");
+        ReplaceBaseInstWith(Bdv, BaseI, Bdv);
+        continue;
+      }
+    if (Value *V = SimplifyInstruction(BaseI, DL)) {
+      DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n");
+      ReplaceBaseInstWith(Bdv, BaseI, V);
+      continue;
    }
  }

  // Cache all of our results so we can cheaply reuse them
  // NOTE: This is actually two caches: one of the base defining value
  // relation and one of the base pointer relation! FIXME
-  for (auto item : states) {
-    Value *v = item.first;
-    Value *base = item.second.getBase();
-    assert(v && base);
-    assert(!isKnownBaseResult(v) && "why did it get added?");
-
-    if (TraceLSP) {
-      std::string fromstr =
-          cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "")
-                         : "none";
-      errs() << "Updating base value cache"
-             << " for: " << (v->hasName() ? v->getName() : "")
-             << " from: " << fromstr
-             << " to: " << (base->hasName() ? base->getName() : "") << "\n";
-    }
-
-    assert(isKnownBaseResult(base) &&
-           "must be something we 'know' is a base pointer");
-    if (cache.count(v)) {
+  for (auto Pair : States) {
+    auto *BDV = Pair.first;
+    Value *base = Pair.second.getBase();
+    assert(BDV && base);
+
+    std::string fromstr = cache.count(BDV) ? cache[BDV]->getName() : "none";
+    DEBUG(dbgs() << "Updating base value cache"
+                 << " for: " << BDV->getName()
+                 << " from: " << fromstr
+                 << " to: " << base->getName() << "\n");
+
+    if (cache.count(BDV)) {
      // Once we transition from the BDV relation being store in the cache to
      // the base relation being stored, it must be stable
-      assert((!isKnownBaseResult(cache[v]) || cache[v] == base) &&
+      assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) &&
             "base relation should be stable");
    }
-    cache[v] = base;
+    cache[BDV] = base;
  }
-  assert(cache.find(def) != cache.end());
+  assert(cache.count(def));
  return cache[def];
}

@@ -1024,7 +1161,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// pointer was a base pointer.
static void
findBasePointers(const StatepointLiveSetTy &live,
-                 DenseMap<llvm::Value *, llvm::Value *> &PointerToBase,
+                 DenseMap<Value *, Value *> &PointerToBase,
                 DominatorTree *DT, DefiningValueMapTy &DVCache) {
  // For the naming of values inserted to be deterministic - which makes for
  // much cleaner and more stable tests - we need to assign an order to the
@@ -1043,7 +1180,7 @@ findBasePointers(const StatepointLiveSetTy &live,
  // If you see this trip and like to live really dangerously, the code should
  // be correct, just with idioms the verifier can't handle. You can try
-  // disabling the verifier at your own substaintial risk.
+  // disabling the verifier at your own substantial risk.
  assert(!isa<ConstantPointerNull>(base) &&
         "the relocation code needs adjustment to handle the relocation of "
         "a null pointer constant without causing false positives in the "
@@ -1056,8 +1193,8 @@ findBasePointers(const StatepointLiveSetTy &live,
static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
                             const CallSite &CS,
                             PartiallyConstructedSafepointRecord &result) {
-  DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
-  findBasePointers(result.liveset, PointerToBase, &DT, DVCache);
+  DenseMap<Value *, Value *> PointerToBase;
+  findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);

  if (PrintBasePointers) {
    // Note: Need to print these in a stable order since this is checked in
@@ -1071,8 +1208,11 @@
    std::sort(Temp.begin(), Temp.end(), order_by_name);
    for (Value *Ptr : Temp) {
      Value *Base = PointerToBase[Ptr];
-      errs() << " derived %" << Ptr->getName() << " base %" << Base->getName()
-             << "\n";
+      errs() << " derived ";
+      Ptr->printAsOperand(errs(), false);
+      errs() << " base ";
+      Base->printAsOperand(errs(), false);
+      errs() << "\n";
    }
  }

@@ -1086,10 +1226,10 @@
static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
                                  const CallSite &CS,
                                  PartiallyConstructedSafepointRecord &result);

static void recomputeLiveInValues(
-    Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate,
+    Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
  // TODO-PERF: reuse the original liveness, then simply run the dataflow
-  // again. The old values are still live and will help it stablize quickly.
+  // again. The old values are still live and will help it stabilize quickly.
  GCPtrLivenessData RevisedLivenessData;
  computeLiveInValues(DT, F, RevisedLivenessData);
  for (size_t i = 0; i < records.size(); i++) {
@@ -1099,69 +1239,66 @@ static void recomputeLiveInValues(
  }
}

-// When inserting gc.relocate calls, we need to ensure there are no uses
-// of the original value between the gc.statepoint and the gc.relocate call.
-// One case which can arise is a phi node starting one of the successor blocks.
-// We also need to be able to insert the gc.relocates only on the path which
-// goes through the statepoint. We might need to split an edge to make this
-// possible.
+// When inserting gc.relocate and gc.result calls, we need to ensure there are
+// no uses of the original value / return value between the gc.statepoint and
+// the gc.relocate / gc.result call. One case which can arise is a phi node
+// starting one of the successor blocks. We also need to be able to insert the
+// gc.relocates only on the path which goes through the statepoint. We might
+// need to split an edge to make this possible.
static BasicBlock *
normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
                            DominatorTree &DT) {
  BasicBlock *Ret = BB;
-  if (!BB->getUniquePredecessor()) {
-    Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, &DT);
-  }
+  if (!BB->getUniquePredecessor())
+    Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);

-  // Now that 'ret' has unique predecessor we can safely remove all phi nodes
+  // Now that 'Ret' has a unique predecessor we can safely remove all phi nodes
  // from it
  FoldSingleEntryPHINodes(Ret);
-  assert(!isa<PHINode>(Ret->begin()));
+  assert(!isa<PHINode>(Ret->begin()) &&
+         "All PHI nodes should have been removed!");

-  // At this point, we can safely insert a gc.relocate as the first instruction
-  // in Ret if needed.
+  // At this point, we can safely insert a gc.relocate or gc.result as the first
+  // instruction in Ret if needed.
  return Ret;
}

-static int find_index(ArrayRef<Value *> livevec, Value *val) {
-  auto itr = std::find(livevec.begin(), livevec.end(), val);
-  assert(livevec.end() != itr);
-  size_t index = std::distance(livevec.begin(), itr);
-  assert(index < livevec.size());
-  return index;
-}
-
-// Create new attribute set containing only attributes which can be transfered
+// Create new attribute set containing only attributes which can be transferred
// from original call to the safepoint.
static AttributeSet legalizeCallAttributes(AttributeSet AS) {
-  AttributeSet ret;
+  AttributeSet Ret;

  for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
-    unsigned index = AS.getSlotIndex(Slot);
+    unsigned Index = AS.getSlotIndex(Slot);

-    if (index == AttributeSet::ReturnIndex ||
-        index == AttributeSet::FunctionIndex) {
+    if (Index == AttributeSet::ReturnIndex ||
+        Index == AttributeSet::FunctionIndex) {

-      for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end;
-           ++it) {
-        Attribute attr = *it;
+      for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {

        // Do not allow certain attributes - just skip them
        // Safepoint can not be read only or read none.
-        if (attr.hasAttribute(Attribute::ReadNone) ||
-            attr.hasAttribute(Attribute::ReadOnly))
+        if (Attr.hasAttribute(Attribute::ReadNone) ||
+            Attr.hasAttribute(Attribute::ReadOnly))
+          continue;
+
+        // These attributes control the generation of the gc.statepoint call /
+        // invoke itself; and once the gc.statepoint is in place, they're of no
+        // use.
+ if (Attr.hasAttribute("statepoint-num-patch-bytes") || + Attr.hasAttribute("statepoint-id")) continue; - ret = ret.addAttributes( - AS.getContext(), index, - AttributeSet::get(AS.getContext(), index, AttrBuilder(attr))); + Ret = Ret.addAttributes( + AS.getContext(), Index, + AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr))); } } // Just skip parameter attributes for now } - return ret; + return Ret; } /// Helper function to place all gc relocates necessary for the given @@ -1173,225 +1310,290 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) { /// statepointToken - statepoint instruction to which relocates should be /// bound. /// Builder - Llvm IR builder to be used to construct new calls. -static void CreateGCRelocates(ArrayRef<llvm::Value *> LiveVariables, +static void CreateGCRelocates(ArrayRef<Value *> LiveVariables, const int LiveStart, - ArrayRef<llvm::Value *> BasePtrs, + ArrayRef<Value *> BasePtrs, Instruction *StatepointToken, IRBuilder<> Builder) { - SmallVector<Instruction *, 64> NewDefs; - NewDefs.reserve(LiveVariables.size()); + if (LiveVariables.empty()) + return; - Module *M = StatepointToken->getParent()->getParent()->getParent(); + auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) { + auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val); + assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); + size_t Index = std::distance(LiveVec.begin(), ValIt); + assert(Index < LiveVec.size() && "Bug in std::find?"); + return Index; + }; - for (unsigned i = 0; i < LiveVariables.size(); i++) { - // We generate a (potentially) unique declaration for every pointer type - // combination. This results is some blow up the function declarations in - // the IR, but removes the need for argument bitcasts which shrinks the IR - // greatly and makes it much more readable. - SmallVector<Type *, 1> Types; // one per 'any' type - // All gc_relocate are set to i8 addrspace(1)* type. This could help avoid - // cases where the actual value's type mangling is not supported by llvm. A - // bitcast is added later to convert gc_relocate to the actual value's type. - Types.push_back(Type::getInt8PtrTy(M->getContext(), 1)); - Value *GCRelocateDecl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_gc_relocate, Types); + // All gc_relocate are set to i8 addrspace(1)* type. We originally generated + // unique declarations for each pointer type, but this proved problematic + // because the intrinsic mangling code is incomplete and fragile. Since + // we're moving towards a single unified pointer type anyways, we can just + // cast everything to an i8* of the right address space. A bitcast is added + // later to convert gc_relocate to the actual value's type. 
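The gc.relocate operands are positional: each relocate names its base and derived pointer by index into the statepoint's gc-argument section. A standalone sketch of that index arithmetic, with ints standing in for Value* and an assumed gc-argument start offset (both invented for illustration):

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  // Stand-ins for Value*: the live (derived) pointers and, positionally,
  // the base of each one. A base must itself appear in the live set.
  std::vector<int> LiveVars = {10, 20, 30};
  std::vector<int> BasePtrs = {10, 10, 30}; // BasePtrs[i] is LiveVars[i]'s base
  const int LiveStart = 5; // assumed offset of the gc args in the statepoint

  for (size_t i = 0; i < LiveVars.size(); i++) {
    auto It = std::find(LiveVars.begin(), LiveVars.end(), BasePtrs[i]);
    assert(It != LiveVars.end() && "base must be in the live set");
    int BaseIdx = LiveStart + int(It - LiveVars.begin());
    int LiveIdx = LiveStart + int(i);
    // A gc.relocate(token, BaseIdx, LiveIdx) would be emitted here.
    std::printf("relocate: base operand %d, derived operand %d\n",
                BaseIdx, LiveIdx);
  }
}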
+  Module *M = StatepointToken->getModule();
+  auto AS = cast<PointerType>(LiveVariables[0]->getType())->getAddressSpace();
+  Type *Types[] = {Type::getInt8PtrTy(M->getContext(), AS)};
+  Value *GCRelocateDecl =
+      Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types);

+  for (unsigned i = 0; i < LiveVariables.size(); i++) {
    // Generate the gc.relocate call and save the result
    Value *BaseIdx =
-        ConstantInt::get(Type::getInt32Ty(M->getContext()),
-                         LiveStart + find_index(LiveVariables, BasePtrs[i]));
-    Value *LiveIdx = ConstantInt::get(
-        Type::getInt32Ty(M->getContext()),
-        LiveStart + find_index(LiveVariables, LiveVariables[i]));
+        Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i]));
+    Value *LiveIdx = Builder.getInt32(LiveStart + i);

    // only specify a debug name if we can give a useful one
-    Value *Reloc = Builder.CreateCall(
+    CallInst *Reloc = Builder.CreateCall(
        GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
-        LiveVariables[i]->hasName() ? LiveVariables[i]->getName() + ".relocated"
-                                    : "");
+        suffixed_name_or(LiveVariables[i], ".relocated", ""));
    // Trick CodeGen into thinking there are lots of free registers at this
    // fake call.
-    cast<CallInst>(Reloc)->setCallingConv(CallingConv::Cold);
+    Reloc->setCallingConv(CallingConv::Cold);
+  }
+}

-    NewDefs.push_back(cast<Instruction>(Reloc));
+namespace {
+
+/// This struct is used to defer RAUWs and `eraseFromParent`s. Using this
+/// avoids having to worry about keeping around dangling pointers to Values.
+class DeferredReplacement {
+  AssertingVH<Instruction> Old;
+  AssertingVH<Instruction> New;
+
+public:
+  explicit DeferredReplacement(Instruction *Old, Instruction *New) :
+    Old(Old), New(New) {
+    assert(Old != New && "Not allowed!");
  }
-  assert(NewDefs.size() == LiveVariables.size() &&
-         "missing or extra redefinition at safepoint");
+
+  /// Does the task represented by this instance.
+  void doReplacement() {
+    Instruction *OldI = Old;
+    Instruction *NewI = New;
+
+    assert(OldI != NewI && "Disallowed at construction?!");
+
+    Old = nullptr;
+    New = nullptr;
+
+    if (NewI)
+      OldI->replaceAllUsesWith(NewI);
+    OldI->eraseFromParent();
+  }
+};
}

static void
-makeStatepointExplicitImpl(const CallSite &CS, /* to replace */
-                           const SmallVectorImpl<llvm::Value *> &basePtrs,
-                           const SmallVectorImpl<llvm::Value *> &liveVariables,
-                           Pass *P,
-                           PartiallyConstructedSafepointRecord &result) {
-  assert(basePtrs.size() == liveVariables.size());
-  assert(isStatepoint(CS) &&
+makeStatepointExplicitImpl(const CallSite CS, /* to replace */
+                           const SmallVectorImpl<Value *> &BasePtrs,
+                           const SmallVectorImpl<Value *> &LiveVariables,
+                           PartiallyConstructedSafepointRecord &Result,
+                           std::vector<DeferredReplacement> &Replacements) {
+  assert(BasePtrs.size() == LiveVariables.size());
+  assert((UseDeoptBundles || isStatepoint(CS)) &&
         "This method expects to be rewriting a statepoint");

-  BasicBlock *BB = CS.getInstruction()->getParent();
-  assert(BB);
-  Function *F = BB->getParent();
-  assert(F && "must be set");
-  Module *M = F->getParent();
-  (void)M;
-  assert(M && "must be set");
-
-  // We're not changing the function signature of the statepoint since the gc
-  // arguments go into the var args section.
-  Function *gc_statepoint_decl = CS.getCalledFunction();
-
  // Then go ahead and use the builder do actually do the inserts. We insert
  // immediately before the previous instruction under the assumption that all
  // arguments will be available here. We can't insert afterwards since we may
  // be replacing a terminator.
-  Instruction *insertBefore = CS.getInstruction();
-  IRBuilder<> Builder(insertBefore);
-  // Copy all of the arguments from the original statepoint - this includes the
-  // target, call args, and deopt args
-  SmallVector<llvm::Value *, 64> args;
-  args.insert(args.end(), CS.arg_begin(), CS.arg_end());
-  // TODO: Clear the 'needs rewrite' flag
-
-  // add all the pointers to be relocated (gc arguments)
-  // Capture the start of the live variable list for use in the gc_relocates
-  const int live_start = args.size();
-  args.insert(args.end(), liveVariables.begin(), liveVariables.end());
+  Instruction *InsertBefore = CS.getInstruction();
+  IRBuilder<> Builder(InsertBefore);
+
+  ArrayRef<Value *> GCArgs(LiveVariables);
+  uint64_t StatepointID = 0xABCDEF00;
+  uint32_t NumPatchBytes = 0;
+  uint32_t Flags = uint32_t(StatepointFlags::None);
+
+  ArrayRef<Use> CallArgs;
+  ArrayRef<Use> DeoptArgs;
+  ArrayRef<Use> TransitionArgs;
+
+  Value *CallTarget = nullptr;
+
+  if (UseDeoptBundles) {
+    CallArgs = {CS.arg_begin(), CS.arg_end()};
+    DeoptArgs = GetDeoptBundleOperands(CS);
+    // TODO: we don't fill in TransitionArgs or Flags in this branch, but we
+    // could have an operand bundle for that too.
+    AttributeSet OriginalAttrs = CS.getAttributes();
+
+    Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                                  "statepoint-id");
+    if (AttrID.isStringAttribute())
+      AttrID.getValueAsString().getAsInteger(10, StatepointID);
+
+    Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
+        AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
+    if (AttrNumPatchBytes.isStringAttribute())
+      AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
+
+    CallTarget = CS.getCalledValue();
+  } else {
+    // This branch will be gone soon, and we will soon only support the
+    // UseDeoptBundles == true configuration.
+    Statepoint OldSP(CS);
+    StatepointID = OldSP.getID();
+    NumPatchBytes = OldSP.getNumPatchBytes();
+    Flags = OldSP.getFlags();
+
+    CallArgs = {OldSP.arg_begin(), OldSP.arg_end()};
+    DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()};
+    TransitionArgs = {OldSP.gc_transition_args_begin(),
+                      OldSP.gc_transition_args_end()};
+    CallTarget = OldSP.getCalledValue();
+  }

  // Create the statepoint given all the arguments
-  Instruction *token = nullptr;
-  AttributeSet return_attributes;
+  Instruction *Token = nullptr;
+  AttributeSet ReturnAttrs;
  if (CS.isCall()) {
-    CallInst *toReplace = cast<CallInst>(CS.getInstruction());
-    CallInst *call =
-        Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token");
-    call->setTailCall(toReplace->isTailCall());
-    call->setCallingConv(toReplace->getCallingConv());
+    CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+    CallInst *Call = Builder.CreateGCStatepointCall(
+        StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
+        TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
+
+    Call->setTailCall(ToReplace->isTailCall());
+    Call->setCallingConv(ToReplace->getCallingConv());

    // Currently we will fail on parameter attributes and on certain
    // function attributes.
-    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
-    // In case if we can handle this set of sttributes - set up function attrs
+    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+    // In case we can handle this set of attributes - set up function attrs
    // directly on statepoint and return attrs later for gc_result intrinsic.
-    call->setAttributes(new_attrs.getFnAttributes());
-    return_attributes = new_attrs.getRetAttributes();
+    Call->setAttributes(NewAttrs.getFnAttributes());
+    ReturnAttrs = NewAttrs.getRetAttributes();

-    token = call;
+    Token = Call;

    // Put the following gc_result and gc_relocate calls immediately after the
    // the old call (which we're about to delete)
-    BasicBlock::iterator next(toReplace);
-    assert(BB->end() != next && "not a terminator, must have next");
-    next++;
-    Instruction *IP = &*(next);
-    Builder.SetInsertPoint(IP);
-    Builder.SetCurrentDebugLocation(IP->getDebugLoc());
-
+    assert(ToReplace->getNextNode() && "Not a terminator, must have next!");
+    Builder.SetInsertPoint(ToReplace->getNextNode());
+    Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
  } else {
-    InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+    InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());

    // Insert the new invoke into the old block. We'll remove the old one in a
    // moment at which point this will become the new terminator for the
    // original block.
-    InvokeInst *invoke = InvokeInst::Create(
-        gc_statepoint_decl, toReplace->getNormalDest(),
-        toReplace->getUnwindDest(), args, "", toReplace->getParent());
-    invoke->setCallingConv(toReplace->getCallingConv());
+    InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+        StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(),
+        ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs,
+        GCArgs, "statepoint_token");
+
+    Invoke->setCallingConv(ToReplace->getCallingConv());

    // Currently we will fail on parameter attributes and on certain
    // function attributes.
-    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
-    // In case if we can handle this set of sttributes - set up function attrs
+    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+    // In case we can handle this set of attributes - set up function attrs
    // directly on statepoint and return attrs later for gc_result intrinsic.
-    invoke->setAttributes(new_attrs.getFnAttributes());
-    return_attributes = new_attrs.getRetAttributes();
+    Invoke->setAttributes(NewAttrs.getFnAttributes());
+    ReturnAttrs = NewAttrs.getRetAttributes();

-    token = invoke;
+    Token = Invoke;

    // Generate gc relocates in exceptional path
-    BasicBlock *unwindBlock = toReplace->getUnwindDest();
-    assert(!isa<PHINode>(unwindBlock->begin()) &&
-           unwindBlock->getUniquePredecessor() &&
+    BasicBlock *UnwindBlock = ToReplace->getUnwindDest();
+    assert(!isa<PHINode>(UnwindBlock->begin()) &&
+           UnwindBlock->getUniquePredecessor() &&
           "can't safely insert in this block!");

-    Instruction *IP = &*(unwindBlock->getFirstInsertionPt());
-    Builder.SetInsertPoint(IP);
-    Builder.SetCurrentDebugLocation(toReplace->getDebugLoc());
+    Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());

-    // Extract second element from landingpad return value. We will attach
-    // exceptional gc relocates to it.
-    const unsigned idx = 1;
-    Instruction *exceptional_token =
-        cast<Instruction>(Builder.CreateExtractValue(
-            unwindBlock->getLandingPadInst(), idx, "relocate_token"));
-    result.UnwindToken = exceptional_token;
+    // Attach exceptional gc relocates to the landingpad.
+    Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+    Result.UnwindToken = ExceptionalToken;

-    // Just throw away return value. We will use the one we got for normal
-    // block.
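As an aside, the DeferredReplacement helper introduced earlier boils down to a small, general pattern: record (old, new) pairs while other bookkeeping may still hold raw pointers to the old instruction, and apply the RAUW-and-erase later in one batch. A toy, LLVM-free sketch of the idea (Node, LiveSet, and Deferred are invented names):

#include <cstdio>
#include <utility>
#include <vector>

struct Node { int Id; };

int main() {
  Node Old{1}, New{2};
  std::vector<Node *> LiveSet = {&Old};  // other records still point at Old
  std::vector<std::pair<Node *, Node *>> Deferred;

  // Record the intent now; do not touch Old while LiveSet may be consulted.
  Deferred.emplace_back(&Old, &New);

  // ... later, once nothing holds raw pointers into the old values,
  // perform all replacements in one batch (the doReplacement() step).
  for (auto &P : Deferred)
    for (auto *&Slot : LiveSet)
      if (Slot == P.first)
        Slot = P.second;

  std::printf("live id: %d\n", LiveSet[0]->Id); // 2
}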
- (void)CreateGCRelocates(liveVariables, live_start, basePtrs, - exceptional_token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken, + Builder); // Generate gc relocates and returns for normal block - BasicBlock *normalDest = toReplace->getNormalDest(); - assert(!isa<PHINode>(normalDest->begin()) && - normalDest->getUniquePredecessor() && + BasicBlock *NormalDest = ToReplace->getNormalDest(); + assert(!isa<PHINode>(NormalDest->begin()) && + NormalDest->getUniquePredecessor() && "can't safely insert in this block!"); - IP = &*(normalDest->getFirstInsertionPt()); - Builder.SetInsertPoint(IP); + Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt()); // gc relocates will be generated later as if it were regular call // statepoint } - assert(token); - - // Take the name of the original value call if it had one. - token->takeName(CS.getInstruction()); + assert(Token && "Should be set in one of the above branches!"); + + if (UseDeoptBundles) { + Token->setName("statepoint_token"); + if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { + StringRef Name = + CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; + CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name); + GCResult->setAttributes(CS.getAttributes().getRetAttributes()); + + // We cannot RAUW or delete CS.getInstruction() because it could be in the + // live set of some other safepoint, in which case that safepoint's + // PartiallyConstructedSafepointRecord will hold a raw pointer to this + // llvm::Instruction. Instead, we defer the replacement and deletion to + // after the live sets have been made explicit in the IR, and we no longer + // have raw pointers to worry about. + Replacements.emplace_back(CS.getInstruction(), GCResult); + } else { + Replacements.emplace_back(CS.getInstruction(), nullptr); + } + } else { + assert(!CS.getInstruction()->hasNUsesOrMore(2) && + "only valid use before rewrite is gc.result"); + assert(!CS.getInstruction()->hasOneUse() || + isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin()))); -// The GCResult is already inserted, we just need to find it -#ifndef NDEBUG - Instruction *toReplace = CS.getInstruction(); - assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) && - "only valid use before rewrite is gc.result"); - assert(!toReplace->hasOneUse() || - isGCResult(cast<Instruction>(*toReplace->user_begin()))); -#endif + // Take the name of the original statepoint token if there was one. + Token->takeName(CS.getInstruction()); - // Update the gc.result of the original statepoint (if any) to use the newly - // inserted statepoint. This is safe to do here since the token can't be - // considered a live reference. - CS.getInstruction()->replaceAllUsesWith(token); + // Update the gc.result of the original statepoint (if any) to use the newly + // inserted statepoint. This is safe to do here since the token can't be + // considered a live reference. 
+ CS.getInstruction()->replaceAllUsesWith(Token); + CS.getInstruction()->eraseFromParent(); + } - result.StatepointToken = token; + Result.StatepointToken = Token; // Second, create a gc.relocate for every live variable - CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder); } namespace { -struct name_ordering { - Value *base; - Value *derived; - bool operator()(name_ordering const &a, name_ordering const &b) { - return -1 == a.derived->getName().compare(b.derived->getName()); +struct NameOrdering { + Value *Base; + Value *Derived; + + bool operator()(NameOrdering const &a, NameOrdering const &b) { + return -1 == a.Derived->getName().compare(b.Derived->getName()); } }; } -static void stablize_order(SmallVectorImpl<Value *> &basevec, - SmallVectorImpl<Value *> &livevec) { - assert(basevec.size() == livevec.size()); - - SmallVector<name_ordering, 64> temp; - for (size_t i = 0; i < basevec.size(); i++) { - name_ordering v; - v.base = basevec[i]; - v.derived = livevec[i]; - temp.push_back(v); - } - std::sort(temp.begin(), temp.end(), name_ordering()); - for (size_t i = 0; i < basevec.size(); i++) { - basevec[i] = temp[i].base; - livevec[i] = temp[i].derived; + +static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec, + SmallVectorImpl<Value *> &LiveVec) { + assert(BaseVec.size() == LiveVec.size()); + + SmallVector<NameOrdering, 64> Temp; + for (size_t i = 0; i < BaseVec.size(); i++) { + NameOrdering v; + v.Base = BaseVec[i]; + v.Derived = LiveVec[i]; + Temp.push_back(v); + } + + std::sort(Temp.begin(), Temp.end(), NameOrdering()); + for (size_t i = 0; i < BaseVec.size(); i++) { + BaseVec[i] = Temp[i].Base; + LiveVec[i] = Temp[i].Derived; } } @@ -1401,40 +1603,39 @@ static void stablize_order(SmallVectorImpl<Value *> &basevec, // WARNING: Does not do any fixup to adjust users of the original live // values. That's the callers responsibility. static void -makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P, - PartiallyConstructedSafepointRecord &result) { - auto liveset = result.liveset; - auto PointerToBase = result.PointerToBase; +makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, + PartiallyConstructedSafepointRecord &Result, + std::vector<DeferredReplacement> &Replacements) { + const auto &LiveSet = Result.LiveSet; + const auto &PointerToBase = Result.PointerToBase; // Convert to vector for efficient cross referencing. - SmallVector<Value *, 64> basevec, livevec; - livevec.reserve(liveset.size()); - basevec.reserve(liveset.size()); - for (Value *L : liveset) { - livevec.push_back(L); - - assert(PointerToBase.find(L) != PointerToBase.end()); - Value *base = PointerToBase[L]; - basevec.push_back(base); + SmallVector<Value *, 64> BaseVec, LiveVec; + LiveVec.reserve(LiveSet.size()); + BaseVec.reserve(LiveSet.size()); + for (Value *L : LiveSet) { + LiveVec.push_back(L); + assert(PointerToBase.count(L)); + Value *Base = PointerToBase.find(L)->second; + BaseVec.push_back(Base); } - assert(livevec.size() == basevec.size()); + assert(LiveVec.size() == BaseVec.size()); // To make the output IR slightly more stable (for use in diffs), ensure a // fixed order of the values in the safepoint (by sorting the value name). // The order is otherwise meaningless. 
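The sort used for this stability is nothing more than ordering two parallel (base, derived) vectors by the derived value's name. A standalone sketch with std::string names (all values invented); note that std::string::compare only guarantees a negative value on less-than, so the comparison here is phrased with operator< rather than the == -1 check used with StringRef above:

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

struct Entry { std::string Base, Derived; };

int main() {
  // Parallel vectors, as in makeStatepointExplicit: entry i of BaseVec is
  // the base of entry i of LiveVec. Sort both by the derived name.
  std::vector<std::string> BaseVec = {"b2", "b1"};
  std::vector<std::string> LiveVec = {"d2", "d1"};

  std::vector<Entry> Tmp;
  for (size_t i = 0; i < BaseVec.size(); i++)
    Tmp.push_back({BaseVec[i], LiveVec[i]});
  std::sort(Tmp.begin(), Tmp.end(),
            [](const Entry &A, const Entry &B) { return A.Derived < B.Derived; });
  for (size_t i = 0; i < Tmp.size(); i++) {
    BaseVec[i] = Tmp[i].Base;
    LiveVec[i] = Tmp[i].Derived;
  }
  std::printf("%s:%s %s:%s\n", LiveVec[0].c_str(), BaseVec[0].c_str(),
              LiveVec[1].c_str(), BaseVec[1].c_str()); // d1:b1 d2:b2
}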
- stablize_order(basevec, livevec); + StabilizeOrder(BaseVec, LiveVec); // Do the actual rewriting and delete the old statepoint - makeStatepointExplicitImpl(CS, basevec, livevec, P, result); - CS.getInstruction()->eraseFromParent(); + makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements); } // Helper function for the relocationViaAlloca. -// It receives iterator to the statepoint gc relocates and emits store to the -// assigned -// location (via allocaMap) for the each one of them. -// Add visited values into the visitedLiveValues set we will later use them -// for sanity check. +// +// It receives an iterator to the statepoint gc relocates and emits a store to +// the assigned location (via allocaMap) for each one of them. It adds the +// visited values into the visitedLiveValues set, which we will later use +// for sanity checking. static void insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, DenseMap<Value *, Value *> &AllocaMap, @@ -1459,13 +1660,15 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, Value *Alloca = AllocaMap[OriginalValue]; // Emit store into the related alloca - // All gc_relocate are i8 addrspace(1)* typed, and it must be bitcasted to + // All gc_relocates are i8 addrspace(1)* typed, and each must be bitcasted to // the correct type according to the alloca. - assert(RelocatedValue->getNextNode() && "Should always have one since it's not a terminator"); + assert(RelocatedValue->getNextNode() && + "Should always have one since it's not a terminator"); IRBuilder<> Builder(RelocatedValue->getNextNode()); Value *CastedRelocatedValue = - Builder.CreateBitCast(RelocatedValue, cast<AllocaInst>(Alloca)->getAllocatedType(), - RelocatedValue->hasName() ? RelocatedValue->getName() + ".casted" : ""); + Builder.CreateBitCast(RelocatedValue, + cast<AllocaInst>(Alloca)->getAllocatedType(), + suffixed_name_or(RelocatedValue, ".casted", "")); StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca); Store->insertAfter(cast<Instruction>(CastedRelocatedValue)); @@ -1501,10 +1704,10 @@ insertRematerializationStores( } } -/// do all the relocation update via allocas and mem2reg +/// Do all the relocation update via allocas and mem2reg static void relocationViaAlloca( Function &F, DominatorTree &DT, ArrayRef<Value *> Live, - ArrayRef<struct PartiallyConstructedSafepointRecord> Records) { + ArrayRef<PartiallyConstructedSafepointRecord> Records) { #ifndef NDEBUG // record initial number of (static) allocas; we'll check we have the same // number when we get done. @@ -1531,15 +1734,12 @@ static void relocationViaAlloca( PromotableAllocas.push_back(Alloca); }; - // emit alloca for each live gc pointer - for (unsigned i = 0; i < Live.size(); i++) { - emitAllocaFor(Live[i]); - } - - // emit allocas for rematerialized values - for (size_t i = 0; i < Records.size(); i++) { - const struct PartiallyConstructedSafepointRecord &Info = Records[i]; + // Emit alloca for each live gc pointer + for (Value *V : Live) + emitAllocaFor(V); + // Emit allocas for rematerialized values + for (const auto &Info : Records) for (auto RematerializedValuePair : Info.RematerializedValues) { Value *OriginalValue = RematerializedValuePair.second; if (AllocaMap.count(OriginalValue) != 0) continue; emitAllocaFor(OriginalValue); ++NumRematerializedValues; } - } // The next two loops are part of the same conceptual operation. 
We need to // insert a store to the alloca after the original def and at each // redefinition. We need to insert a load before each use. These are split // into distinct loops for performance reasons. - // update gc pointer after each statepoint - // either store a relocated value or null (if no relocated value found for - // this gc pointer and it is not a gc_result) - // this must happen before we update the statepoint with load of alloca - // otherwise we lose the link between statepoint and old def - for (size_t i = 0; i < Records.size(); i++) { - const struct PartiallyConstructedSafepointRecord &Info = Records[i]; + // Update gc pointer after each statepoint: either store a relocated value or + // null (if no relocated value was found for this gc pointer and it is not a + // gc_result). This must happen before we update the statepoint with a load of + // the alloca, otherwise we lose the link between the statepoint and the old def. + for (const auto &Info : Records) { Value *Statepoint = Info.StatepointToken; // This will be used for consistency check @@ -1582,7 +1779,7 @@ static void relocationViaAlloca( VisitedLiveValues); if (ClobberNonLive) { - // As a debuging aid, pretend that an unrelocated pointer becomes null at + // As a debugging aid, pretend that an unrelocated pointer becomes null at // the gc.statepoint. This will turn some subtle GC problems into // slightly easier to debug SEGVs. Note that on large IR files with // lots of gc.statepoints this is extremely costly both memory and time @@ -1612,23 +1809,22 @@ // Insert the clobbering stores. These may get intermixed with the // gc.results and gc.relocates, but that's fine. if (auto II = dyn_cast<InvokeInst>(Statepoint)) { - InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt()); - InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt()); + InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt()); + InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt()); } else { - BasicBlock::iterator Next(cast<CallInst>(Statepoint)); - Next++; - InsertClobbersAt(Next); + InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode()); } } } - // update use with load allocas and add store for gc_relocated + + // Update use with load allocas and add store for gc_relocated. for (auto Pair : AllocaMap) { Value *Def = Pair.first; Value *Alloca = Pair.second; - // we pre-record the uses of allocas so that we dont have to worry about - // later update - // that change the user information. + // We pre-record the uses of allocas so that we don't have to worry about + // later updates that change the user information. + SmallVector<Instruction *, 20> Uses; // PERF: trade a linear scan for repeated reallocation Uses.reserve(std::distance(Def->user_begin(), Def->user_end())); @@ -1663,9 +1859,9 @@ static void relocationViaAlloca( } } - // emit store for the initial gc value - // store must be inserted after load, otherwise store will be in alloca's - // use list and an extra load will be inserted before it + // Emit store for the initial gc value. Store must be inserted after load, + // otherwise store will be in alloca's use list and an extra load will be + // inserted before it. 
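The demote-then-promote scheme these loops implement can be condensed as follows. This is a sketch, not the pass's code: it assumes the usual headers (IRBuilder.h, PromoteMemToReg.h) are in scope, the helper name is invented, and the store/load placement performed by the surrounding loops is elided:

// Sketch: give each live GC pointer a dedicated stack slot, let the caller
// store relocations after each safepoint and load before each use, then let
// mem2reg rebuild SSA form over the slots.
static void demoteThenPromote(Function &F, DominatorTree &DT,
                              ArrayRef<Value *> Live) {
  SmallVector<AllocaInst *, 16> Slots;
  IRBuilder<> B(&F.getEntryBlock(), F.getEntryBlock().begin());
  for (Value *V : Live)
    Slots.push_back(B.CreateAlloca(V->getType(), nullptr,
                                   V->getName() + ".demoted"));
  // ... stores after defs/relocations and loads before uses go here ...
  PromoteMemToReg(Slots, DT); // mem2reg: slots become SSA values again
}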
StoreInst *Store = new StoreInst(Def, Alloca); if (Instruction *Inst = dyn_cast<Instruction>(Def)) { if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) { @@ -1688,14 +1884,13 @@ assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues && "we must have the same allocas with lives"); if (!PromotableAllocas.empty()) { - // apply mem2reg to promote alloca to SSA + // Apply mem2reg to promote alloca to SSA PromoteMemToReg(PromotableAllocas, DT); } #ifndef NDEBUG - for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E; - I++) - if (isa<AllocaInst>(*I)) + for (auto &I : F.getEntryBlock()) + if (isa<AllocaInst>(I)) InitialAllocaNum--; assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas"); #endif @@ -1719,28 +1914,27 @@ static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values, // No values to hold live, might as well not insert the empty holder return; - Module *M = CS.getInstruction()->getParent()->getParent()->getParent(); + Module *M = CS.getInstruction()->getModule(); // Use a dummy vararg function to actually hold the values live Function *Func = cast<Function>(M->getOrInsertFunction( "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true))); if (CS.isCall()) { // For call safepoints insert dummy calls right after safepoint - BasicBlock::iterator Next(CS.getInstruction()); - Next++; - Holders.push_back(CallInst::Create(Func, Values, "", Next)); + Holders.push_back(CallInst::Create(Func, Values, "", + &*++CS.getInstruction()->getIterator())); return; } // For invoke safepoints insert dummy calls both in normal and // exceptional destination blocks auto *II = cast<InvokeInst>(CS.getInstruction()); Holders.push_back(CallInst::Create( - Func, Values, "", II->getNormalDest()->getFirstInsertionPt())); + Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt())); Holders.push_back(CallInst::Create( - Func, Values, "", II->getUnwindDest()->getFirstInsertionPt())); + Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt())); } static void findLiveReferences( - Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, + Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate, MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { GCPtrLivenessData OriginalLivenessData; computeLiveInValues(DT, F, OriginalLivenessData); @@ -1751,12 +1945,12 @@ static void findLiveReferences( } } -/// Remove any vector of pointers from the liveset by scalarizing them over the -/// statepoint instruction. Adds the scalarized pieces to the liveset. It -/// would be preferrable to include the vector in the statepoint itself, but +/// Remove any vector of pointers from the live set by scalarizing them over the +/// statepoint instruction. Adds the scalarized pieces to the live set. It +/// would be preferable to include the vector in the statepoint itself, but /// the lowering code currently does not handle that. Extending it would be /// slightly non-trivial since it requires a format change. Given how rare -/// such cases are (for the moment?) scalarizing is an acceptable comprimise. +/// such cases are (for the moment?) scalarizing is an acceptable compromise. static void splitVectorValues(Instruction *StatepointInst, StatepointLiveSetTy &LiveSet, DenseMap<Value *, Value *>& PointerToBase, DominatorTree &DT) { @@ -1887,7 +2081,7 @@ static void splitVectorValues(Instruction *StatepointInst, // Helper function for the "rematerializeLiveValues". 
It walks the use chain // starting from the "CurrentValue" until it meets "BaseValue". Only "simple" // values are visited (currently it is GEP's and casts). Returns true if it -// sucessfully reached "BaseValue" and false otherwise. +// successfully reached "BaseValue" and false otherwise. // Fills "ChainToBase" array with all visited values. "BaseValue" is not // recorded. static bool findRematerializableChainToBasePointer( @@ -1907,16 +2101,12 @@ static bool findRematerializableChainToBasePointer( } if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) { - Value *Def = CI->stripPointerCasts(); - - // This two checks are basically similar. First one is here for the - // consistency with findBasePointers logic. - assert(!isa<CastInst>(Def) && "not a pointer cast found"); if (!CI->isNoopCast(CI->getModule()->getDataLayout())) return false; ChainToBase.push_back(CI); - return findRematerializableChainToBasePointer(ChainToBase, Def, BaseValue); + return findRematerializableChainToBasePointer(ChainToBase, + CI->getOperand(0), BaseValue); } // Not supported instruction in the chain @@ -1957,8 +2147,8 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, return Cost; } -// From the statepoint liveset pick values that are cheaper to recompute then to -// relocate. Remove this values from the liveset, rematerialize them after +// From the statepoint live set pick values that are cheaper to recompute than +// to relocate. Remove these values from the live set, rematerialize them after // statepoint and record them in "Info" structure. Note that similar to // relocated values we don't do any user adjustments here. static void rematerializeLiveValues(CallSite CS, @@ -1970,10 +2160,10 @@ static void rematerializeLiveValues(CallSite CS, // We cannot do this in the following loop due to iterator invalidation. 
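The chain walk above can be restated compactly. This sketch mirrors its logic (GEPs plus no-op casts only) but leaves out the constant-index checks and the cost accounting that chainToBasePointerCost layers on top; the helper name is invented:

// Climb from a derived pointer through GEPs and no-op casts until the base
// is reached, recording the instructions that would need re-execution.
static bool chainToBase(Value *Cur, Value *Base,
                        SmallVectorImpl<Instruction *> &Chain) {
  if (Cur == Base)
    return true;
  if (auto *GEP = dyn_cast<GetElementPtrInst>(Cur)) {
    Chain.push_back(GEP);
    return chainToBase(GEP->getPointerOperand(), Base, Chain);
  }
  if (auto *CI = dyn_cast<CastInst>(Cur)) {
    if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
      return false;
    Chain.push_back(CI);
    return chainToBase(CI->getOperand(0), Base, Chain);
  }
  return false; // unsupported instruction in the chain
}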
SmallVector<Value *, 32> LiveValuesToBeDeleted; - for (Value *LiveValue: Info.liveset) { + for (Value *LiveValue: Info.LiveSet) { // For each live pointer find its defining chain SmallVector<Instruction *, 3> ChainToBase; - assert(Info.PointerToBase.find(LiveValue) != Info.PointerToBase.end()); + assert(Info.PointerToBase.count(LiveValue)); bool FoundChain = findRematerializableChainToBasePointer(ChainToBase, LiveValue, @@ -2059,9 +2249,9 @@ static void rematerializeLiveValues(CallSite CS, InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction()); Instruction *NormalInsertBefore = - Invoke->getNormalDest()->getFirstInsertionPt(); + &*Invoke->getNormalDest()->getFirstInsertionPt(); Instruction *UnwindInsertBefore = - Invoke->getUnwindDest()->getFirstInsertionPt(); + &*Invoke->getUnwindDest()->getFirstInsertionPt(); Instruction *NormalRematerializedValue = rematerializeChain(NormalInsertBefore); @@ -2075,22 +2265,23 @@ static void rematerializeLiveValues(CallSite CS, // Remove rematerialized values from the live set for (auto LiveValue: LiveValuesToBeDeleted) { - Info.liveset.erase(LiveValue); + Info.LiveSet.erase(LiveValue); } } -static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, - SmallVectorImpl<CallSite> &toUpdate) { +static bool insertParsePoints(Function &F, DominatorTree &DT, + TargetTransformInfo &TTI, + SmallVectorImpl<CallSite> &ToUpdate) { #ifndef NDEBUG // sanity check the input - std::set<CallSite> uniqued; - uniqued.insert(toUpdate.begin(), toUpdate.end()); - assert(uniqued.size() == toUpdate.size() && "no duplicates please!"); + std::set<CallSite> Uniqued; + Uniqued.insert(ToUpdate.begin(), ToUpdate.end()); + assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!"); - for (size_t i = 0; i < toUpdate.size(); i++) { - CallSite &CS = toUpdate[i]; + for (CallSite CS : ToUpdate) { assert(CS.getInstruction()->getParent()->getParent() == &F); - assert(isStatepoint(CS) && "expected to already be a deopt statepoint"); + assert((UseDeoptBundles || isStatepoint(CS)) && + "expected to already be a deopt statepoint"); } #endif @@ -2098,50 +2289,45 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // the top of the successor blocks. See the comment on // normalizeForInvokeSafepoint for exactly what is needed. Note that this step // may restructure the CFG. - for (CallSite CS : toUpdate) { + for (CallSite CS : ToUpdate) { if (!CS.isInvoke()) continue; - InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction()); - normalizeForInvokeSafepoint(invoke->getNormalDest(), invoke->getParent(), - DT); - normalizeForInvokeSafepoint(invoke->getUnwindDest(), invoke->getParent(), - DT); + auto *II = cast<InvokeInst>(CS.getInstruction()); + normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT); + normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT); } // A list of dummy calls added to the IR to keep various values obviously // live in the IR. We'll remove all of these when done. - SmallVector<CallInst *, 64> holders; + SmallVector<CallInst *, 64> Holders; // Insert a dummy call with all of the arguments to the vm_state we'll need // for the actual safepoint insertion. This ensures reference arguments in // the deopt argument list are considered live through the safepoint (and // thus makes sure they get relocated.) 
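The holder trick this comment describes reduces to a few lines. The sketch below reuses the __tmp_use convention from insertUseHolderAfter above, but collapses the call/invoke insertion-point handling into a single insert-before point; the helper name is invented:

// Pin Values live across liveness recomputation with a dummy vararg call;
// the holder is trivially erased once liveness has been recomputed.
static CallInst *makeUseHolder(Instruction *InsertPt, ArrayRef<Value *> Values) {
  Module *M = InsertPt->getModule();
  Function *Func = cast<Function>(M->getOrInsertFunction(
      "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()),
                                     /*isVarArg=*/true)));
  return CallInst::Create(Func, Values, "", InsertPt);
}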
- for (size_t i = 0; i < toUpdate.size(); i++) { - CallSite &CS = toUpdate[i]; - Statepoint StatepointCS(CS); - + for (CallSite CS : ToUpdate) { SmallVector<Value *, 64> DeoptValues; - for (Use &U : StatepointCS.vm_state_args()) { - Value *Arg = cast<Value>(&U); + + iterator_range<const Use *> DeoptStateRange = + UseDeoptBundles + ? iterator_range<const Use *>(GetDeoptBundleOperands(CS)) + : iterator_range<const Use *>(Statepoint(CS).vm_state_args()); + + for (Value *Arg : DeoptStateRange) { assert(!isUnhandledGCPointerType(Arg->getType()) && "support for FCA unimplemented"); if (isHandledGCPointerType(Arg->getType())) DeoptValues.push_back(Arg); } - insertUseHolderAfter(CS, DeoptValues, holders); - } - SmallVector<struct PartiallyConstructedSafepointRecord, 64> records; - records.reserve(toUpdate.size()); - for (size_t i = 0; i < toUpdate.size(); i++) { - struct PartiallyConstructedSafepointRecord info; - records.push_back(info); + insertUseHolderAfter(CS, DeoptValues, Holders); } - assert(records.size() == toUpdate.size()); - // A) Identify all gc pointers which are staticly live at the given call + SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size()); + + // A) Identify all gc pointers which are statically live at the given call // site. - findLiveReferences(F, DT, P, toUpdate, records); + findLiveReferences(F, DT, ToUpdate, Records); // B) Find the base pointers for each live pointer /* scope for caching */ { @@ -2150,10 +2336,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // large numbers of duplicate base_phis. DefiningValueMapTy DVCache; - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - findBasePointers(DT, DVCache, CS, info); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &info = Records[i]; + findBasePointers(DT, DVCache, ToUpdate[i], info); } } // end of cache scope @@ -2170,63 +2355,75 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // the base pointers which were identified for that safepoint. We'll then // ask liveness for _every_ base inserted to see what is now live. Then we // remove the dummy calls. - holders.reserve(holders.size() + records.size()); - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; + Holders.reserve(Holders.size() + Records.size()); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; SmallVector<Value *, 128> Bases; - for (auto Pair : info.PointerToBase) { + for (auto Pair : Info.PointerToBase) Bases.push_back(Pair.second); - } - insertUseHolderAfter(CS, Bases, holders); + + insertUseHolderAfter(ToUpdate[i], Bases, Holders); } // By selecting base pointers, we've effectively inserted new uses. Thus, we // need to rerun liveness. We may *also* have inserted new defs, but that's // not the key issue. 
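Returning to the deopt-state collection at the top of this hunk: GetDeoptBundleOperands is defined elsewhere in this file, and conceptually it reduces to reading the standard "deopt" operand bundle, roughly as in this sketch (the helper name is invented; the bundle tag and accessor are the stock LLVM ones):

// Pull the deopt state out of a call's "deopt" operand bundle.
static ArrayRef<Use> deoptStateOf(ImmutableCallSite CS) {
  Optional<OperandBundleUse> DeoptBundle =
      CS.getOperandBundle(LLVMContext::OB_deopt);
  assert(DeoptBundle && "expected a call with a deopt operand bundle");
  return DeoptBundle->Inputs;
}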
- recomputeLiveInValues(F, DT, P, toUpdate, records); + recomputeLiveInValues(F, DT, ToUpdate, Records); if (PrintBasePointers) { - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; + for (auto &Info : Records) { errs() << "Base Pairs: (w/Relocation)\n"; - for (auto Pair : info.PointerToBase) { - errs() << " derived %" << Pair.first->getName() << " base %" - << Pair.second->getName() << "\n"; + for (auto Pair : Info.PointerToBase) { + errs() << " derived "; + Pair.first->printAsOperand(errs(), false); + errs() << " base "; + Pair.second->printAsOperand(errs(), false); + errs() << "\n"; } } } - for (size_t i = 0; i < holders.size(); i++) { - holders[i]->eraseFromParent(); - holders[i] = nullptr; - } - holders.clear(); + + // It is possible that non-constant live variables have a constant base. For + // example, a GEP with a variable offset from a global. In this case we can + // remove it from the liveset. We already don't add constants to the liveset + // because we assume they won't move at runtime and the GC doesn't need to be + // informed about them. The same reasoning applies if the base is constant. + // Note that the relocation placement code relies on this filtering for + // correctness as it expects the base to be in the liveset, which isn't true + // if the base is constant. + for (auto &Info : Records) + for (auto &BasePair : Info.PointerToBase) + if (isa<Constant>(BasePair.second)) + Info.LiveSet.erase(BasePair.first); + + for (CallInst *CI : Holders) + CI->eraseFromParent(); + + Holders.clear(); // Do a limited scalarization of any live at safepoint vector values which // contain pointers. This enables this pass to run after vectorization at // the cost of some possible performance loss. TODO: it would be nice to // natively support vectors all the way through the backend so we don't need // to scalarize here. - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - Instruction *statepoint = toUpdate[i].getInstruction(); - splitVectorValues(cast<Instruction>(statepoint), info.liveset, - info.PointerToBase, DT); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; + Instruction *Statepoint = ToUpdate[i].getInstruction(); + splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet, + Info.PointerToBase, DT); } // In order to reduce the live set of a statepoint we might choose to rematerialize - // some values instead of relocating them. This is purelly an optimization and + // some values instead of relocating them. This is purely an optimization and // does not influence correctness. - TargetTransformInfo &TTI = - P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + for (size_t i = 0; i < Records.size(); i++) + rematerializeLiveValues(ToUpdate[i], Records[i], TTI); - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - - rematerializeLiveValues(CS, info, TTI); - } + // We need this to safely RAUW and delete call or invoke return values that + // may themselves be live over a statepoint. For details, please see usage in + // makeStatepointExplicitImpl. + std::vector<DeferredReplacement> Replacements; // Now run through and replace the existing statepoints with new ones with // the live variables listed. 
We do not yet update uses of the values being @@ -2234,61 +2431,77 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // survive to the last iteration of this loop. (By construction, the // previous statepoint cannot be a live variable, thus we can and do remove // the old statepoint calls as we go.) - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - makeStatepointExplicit(DT, CS, P, info); + for (size_t i = 0; i < Records.size(); i++) + makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements); + + ToUpdate.clear(); // prevent accidental use of invalid CallSites + + for (auto &PR : Replacements) + PR.doReplacement(); + + Replacements.clear(); + + for (auto &Info : Records) { + // These live sets may contain stale Value pointers, since we replaced calls + // with operand bundles with calls wrapped in gc.statepoint, and some of + // those calls may have been def'ing live gc pointers. Clear these out to + // avoid accidentally using them. + // + // TODO: We should create a separate data structure that does not contain + // these live sets, and migrate to using that data structure from this point + // onward. + Info.LiveSet.clear(); + Info.PointerToBase.clear(); } - toUpdate.clear(); // prevent accident use of invalid CallSites // Do all the fixups of the original live variables to their relocated selves - SmallVector<Value *, 128> live; - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; + SmallVector<Value *, 128> Live; + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; + // We can't simply save the live set from the original insertion. One of // the live values might be the result of a call which needs a safepoint. // That Value* no longer exists and we need to use the new gc_result. - // Thankfully, the liveset is embedded in the statepoint (and updated), so + // Thankfully, the live set is embedded in the statepoint (and updated), so // we just grab that. - Statepoint statepoint(info.StatepointToken); - live.insert(live.end(), statepoint.gc_args_begin(), - statepoint.gc_args_end()); + Statepoint Statepoint(Info.StatepointToken); + Live.insert(Live.end(), Statepoint.gc_args_begin(), + Statepoint.gc_args_end()); #ifndef NDEBUG // Do some basic sanity checks on our liveness results before performing // relocation. Relocation can and will turn mistakes in liveness results // into nonsensical code which is much harder to debug. 
// TODO: It would be nice to test consistency as well - assert(DT.isReachableFromEntry(info.StatepointToken->getParent()) && + assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) && "statepoint must be reachable or liveness is meaningless"); - for (Value *V : statepoint.gc_args()) { + for (Value *V : Statepoint.gc_args()) { if (!isa<Instruction>(V)) // Non-instruction values trivially dominate all possible uses continue; - auto LiveInst = cast<Instruction>(V); + auto *LiveInst = cast<Instruction>(V); assert(DT.isReachableFromEntry(LiveInst->getParent()) && "unreachable values should never be live"); - assert(DT.dominates(LiveInst, info.StatepointToken) && + assert(DT.dominates(LiveInst, Info.StatepointToken) && "basic SSA liveness expectation violated by liveness analysis"); } #endif } - unique_unsorted(live); + unique_unsorted(Live); #ifndef NDEBUG // sanity check - for (auto ptr : live) { - assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type"); - } + for (auto *Ptr : Live) + assert(isGCPointerType(Ptr->getType()) && "must be a gc pointer type"); #endif - relocationViaAlloca(F, DT, live, records); - return !records.empty(); + relocationViaAlloca(F, DT, Live, Records); + return !Records.empty(); } // Handles both return values and arguments for Functions and CallSites. template <typename AttrHolder> -static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, - unsigned Index) { +static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, + unsigned Index) { AttrBuilder R; if (AH.getDereferenceableBytes(Index)) R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, @@ -2296,6 +2509,8 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, if (AH.getDereferenceableOrNullBytes(Index)) R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, AH.getDereferenceableOrNullBytes(Index))); + if (AH.doesNotAlias(Index)) + R.addAttribute(Attribute::NoAlias); if (!R.empty()) AH.setAttributes(AH.getAttributes().removeAttributes( @@ -2303,25 +2518,25 @@ } void -RewriteStatepointsForGC::stripDereferenceabilityInfoFromPrototype(Function &F) { +RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { LLVMContext &Ctx = F.getContext(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) - RemoveDerefAttrAtIndex(Ctx, F, A.getArgNo() + 1); + RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1); if (isa<PointerType>(F.getReturnType())) - RemoveDerefAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); } -void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2344,9 +2559,9 @@ void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { if (CallSite CS = CallSite(&I)) { for (int i = 0, e = CS.arg_size(); i != e; i++) if (isa<PointerType>(CS.getArgument(i)->getType())) - RemoveDerefAttrAtIndex(Ctx, CS, i + 1); + RemoveNonValidAttrAtIndex(Ctx, CS, i + 1); if (isa<PointerType>(CS.getType())) - RemoveDerefAttrAtIndex(Ctx, CS, 
AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex); } } } @@ -2365,17 +2580,17 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripDereferenceabilityInfo(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { #ifndef NDEBUG assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) && "precondition!"); #endif for (Function &F : M) - stripDereferenceabilityInfoFromPrototype(F); + stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripDereferenceabilityInfoFromBody(F); + stripNonValidAttributesFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { @@ -2389,15 +2604,27 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + auto NeedsRewrite = [](Instruction &I) { + if (UseDeoptBundles) { + if (ImmutableCallSite CS = ImmutableCallSite(&I)) + return !callsGCLeafFunction(CS); + return false; + } + + return isStatepoint(I); + }; // Gather all the statepoints which need to be rewritten. Be careful to only // consider those in reachable code since we need to ask dominance queries // when rewriting. We'll delete the unreachable ones in a moment. SmallVector<CallSite, 64> ParsePointNeeded; bool HasUnreachableStatepoint = false; - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { // TODO: only the ones with the flag set! - if (isStatepoint(I)) { + if (NeedsRewrite(I)) { if (DT.isReachableFromEntry(I.getParent())) ParsePointNeeded.push_back(CallSite(&I)); else @@ -2428,7 +2655,38 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { FoldSingleEntryPHINodes(&BB); } - MadeChange |= insertParsePoints(F, DT, this, ParsePointNeeded); + // Before we start introducing relocations, we want to tweak the IR a bit to + // avoid unfortunate code generation effects. The main example is that we + // want to try to make sure the comparison feeding a branch is after any + // safepoints. Otherwise, we end up with a comparison of pre-relocation + // values feeding a branch after relocation. This is semantically correct, + // but results in extra register pressure since both the pre-relocation and + // post-relocation copies must be available in registers. For code without + // relocations this is handled elsewhere, but teaching the scheduler to + // reverse the transform we're about to do would be slightly complex. + // Note: This may extend the live range of the inputs to the icmp and thus + // increase the liveset of any statepoint we move over. This is profitable + // as long as all statepoints are in rare blocks. If we had in-register + // lowering for live values this would be a much safer transform. + auto getConditionInst = [](TerminatorInst *TI) -> Instruction* { + if (auto *BI = dyn_cast<BranchInst>(TI)) + if (BI->isConditional()) + return dyn_cast<Instruction>(BI->getCondition()); + // TODO: Extend this to handle switches + return nullptr; + }; + for (BasicBlock &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (auto *Cond = getConditionInst(TI)) + // TODO: Handle more than just ICmps here. We should be able to move + // most instructions without side effects or memory access. 
+ if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) { + MadeChange = true; + Cond->moveBefore(TI); + } + } + + MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded); return MadeChange; } @@ -2461,7 +2719,7 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin, "support for FCA unimplemented"); if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) { // The choice to exclude all things constant here is slightly subtle. - // There are two idependent reasons: + // There are two independent reasons: // - We assume that things which are constant (from LLVM's definition) // do not move at runtime. For example, the address of a global // variable is fixed, even though its contents may not be. @@ -2599,7 +2857,7 @@ static void computeLiveInValues(DominatorTree &DT, Function &F, } // while( !worklist.empty() ) #ifndef NDEBUG - // Sanity check our ouput against SSA properties. This helps catch any + // Sanity check our output against SSA properties. This helps catch any // missing kills during the above iteration. for (BasicBlock &BB : F) { checkBasicSSA(DT, Data, BB); } @@ -2620,7 +2878,7 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data, // call result is not live (normal), nor are its arguments // (unless they're used again later). This adjustment is // specifically what we need to relocate - BasicBlock::reverse_iterator rend(Inst); + BasicBlock::reverse_iterator rend(Inst->getIterator()); computeLiveInValues(BB->rbegin(), rend, LiveOut); LiveOut.erase(Inst); Out.insert(LiveOut.begin(), LiveOut.end()); @@ -2669,5 +2927,5 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, assert(Updated.count(KVPair.first) && "record for non-live value"); #endif - Info.liveset = Updated; + Info.LiveSet = Updated; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 4d3a708fa20e..2fca803adde8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" @@ -479,6 +480,13 @@ private: void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } + void visitFuncletPadInst(FuncletPadInst &FPI) { + markAnythingOverdefined(&FPI); + } + void visitCatchSwitchInst(CatchSwitchInst &CPI) { + markAnythingOverdefined(&CPI); + visitTerminatorInst(CPI); + } // Instructions that cannot be folded away. void visitStoreInst (StoreInst &I); @@ -539,9 +547,9 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, return; } - if (isa<InvokeInst>(TI)) { - // Invoke instructions successors are always executable. - Succs[0] = Succs[1] = true; + // Unwinding instructions' successors are always executable. + if (TI.isExceptional()) { + Succs.assign(TI.getNumSuccessors(), true); return; } @@ -605,8 +613,8 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { return BI->getSuccessor(CI->isZero()) == To; } - // Invoke instructions successors are always executable. - if (isa<InvokeInst>(TI)) + // Unwinding instructions' successors are always executable. 
+ if (TI->isExceptional()) return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { @@ -630,7 +638,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif - llvm_unreachable(nullptr); + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } // visit Implementations - Something changed in this instruction, either an @@ -1126,7 +1134,7 @@ CallOverdefined: // entry block executable and merge in the actual arguments to the call into // the formal arguments of the function. if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){ - MarkBlockExecutable(F->begin()); + MarkBlockExecutable(&F->front()); // Propagate information from this call site into the callee. CallSite::arg_iterator CAI = CS.arg_begin(); @@ -1135,17 +1143,17 @@ CallOverdefined: // If this argument is byval, and if the function is not readonly, there // will be an implicit copy formed of the input aggregate. if (AI->hasByValAttr() && !F->onlyReadsMemory()) { - markOverdefined(AI); + markOverdefined(&*AI); continue; } if (StructType *STy = dyn_cast<StructType>(AI->getType())) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal CallArg = getStructValueState(*CAI, i); - mergeInValue(getStructValueState(AI, i), AI, CallArg); + mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg); } } else { - mergeInValue(AI, getValueState(*CAI)); + mergeInValue(&*AI, getValueState(*CAI)); } } } @@ -1246,18 +1254,18 @@ void SCCPSolver::Solve() { /// even if X isn't defined. bool SCCPSolver::ResolvedUndefsIn(Function &F) { for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!BBExecutable.count(BB)) + if (!BBExecutable.count(&*BB)) continue; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Look for instructions which produce undef values. - if (I->getType()->isVoidTy()) continue; + if (I.getType()->isVoidTy()) continue; - if (StructType *STy = dyn_cast<StructType>(I->getType())) { + if (StructType *STy = dyn_cast<StructType>(I.getType())) { // Only a few things that can be structs matter for undef. // Tracked calls must never be marked overdefined in ResolvedUndefsIn. - if (CallSite CS = CallSite(I)) + if (CallSite CS = CallSite(&I)) if (Function *F = CS.getCalledFunction()) if (MRVFunctionsTracked.count(F)) continue; @@ -1270,14 +1278,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Send the results of everything else to overdefined. We could be // more precise than this but it isn't worth bothering. for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); + LatticeVal &LV = getStructValueState(&I, i); if (LV.isUndefined()) - markOverdefined(LV, I); + markOverdefined(LV, &I); } continue; } - LatticeVal &LV = getValueState(I); + LatticeVal &LV = getValueState(&I); if (!LV.isUndefined()) continue; // extractvalue is safe; check here because the argument is a struct. @@ -1287,24 +1295,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Compute the operand LatticeVals, for convenience below. // Anything taking a struct is conservatively assumed to require // overdefined markings. 
- if (I->getOperand(0)->getType()->isStructTy()) { - markOverdefined(I); + if (I.getOperand(0)->getType()->isStructTy()) { + markOverdefined(&I); return true; } - LatticeVal Op0LV = getValueState(I->getOperand(0)); + LatticeVal Op0LV = getValueState(I.getOperand(0)); LatticeVal Op1LV; - if (I->getNumOperands() == 2) { - if (I->getOperand(1)->getType()->isStructTy()) { - markOverdefined(I); + if (I.getNumOperands() == 2) { + if (I.getOperand(1)->getType()->isStructTy()) { + markOverdefined(&I); return true; } - Op1LV = getValueState(I->getOperand(1)); + Op1LV = getValueState(I.getOperand(1)); } // If this is an instruction whose result is defined even if the input is // not fully defined, propagate the information. - Type *ITy = I->getType(); - switch (I->getOpcode()) { + Type *ITy = I.getType(); + switch (I.getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Trunc: @@ -1318,9 +1326,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::FRem: // Floating-point binary operation: be conservative. if (Op0LV.isUndefined() && Op1LV.isUndefined()) - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); else - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::ZExt: case Instruction::SExt: @@ -1332,7 +1340,7 @@ case Instruction::SIToFP: case Instruction::UIToFP: // undef -> 0; some outputs are impossible - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: case Instruction::And: @@ -1341,7 +1349,7 @@ break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Or: @@ -1349,7 +1357,7 @@ if (Op0LV.isUndefined() && Op1LV.isUndefined()) break; // undef | X -> -1. X could be -1. - markForcedConstant(I, Constant::getAllOnesValue(ITy)); + markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; case Instruction::Xor: @@ -1357,7 +1365,7 @@ // necessary, but we try to be nice to people who expect this // behavior in simple cases if (Op0LV.isUndefined() && Op1LV.isUndefined()) { - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; } // undef ^ X -> undef @@ -1373,7 +1381,7 @@ // undef / X -> 0. X could be maxint. // undef % X -> 0. X could be 1. 
- markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::AShr: @@ -1381,7 +1389,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (Op1LV.isUndefined()) break; // undef >>a X -> all ones - markForcedConstant(I, Constant::getAllOnesValue(ITy)); + markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: @@ -1391,17 +1399,17 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef << X -> 0 // undef >> X -> 0 - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Select: - Op1LV = getValueState(I->getOperand(1)); + Op1LV = getValueState(I.getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); } else if (Op1LV.isUndefined()) { // c ? undef : undef -> undef. No change. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); if (Op1LV.isUndefined()) break; // Otherwise, c ? undef : x -> x. @@ -1410,9 +1418,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } if (Op1LV.isConstant()) - markForcedConstant(I, Op1LV.getConstant()); + markForcedConstant(&I, Op1LV.getConstant()); else - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Load: // A load here means one of two things: a load of undef from a global, @@ -1421,9 +1429,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { break; case Instruction::ICmp: // X == undef -> undef. Other comparisons get more complicated. - if (cast<ICmpInst>(I)->isEquality()) + if (cast<ICmpInst>(&I)->isEquality()) break; - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Call: case Instruction::Invoke: { @@ -1432,19 +1440,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // 2. It could be constant-foldable. // Because of the way we solve return values, tracked calls must // never be marked overdefined in ResolvedUndefsIn. - if (Function *F = CallSite(I).getCalledFunction()) + if (Function *F = CallSite(&I).getCalledFunction()) if (TrackedRetVals.count(F)) break; // If the call is constant-foldable, we mark it overdefined because // we do not know what return values are valid. - markOverdefined(I); + markOverdefined(&I); return true; } default: // If we don't know what should happen here, conservatively mark it // overdefined. - markOverdefined(I); + markOverdefined(&I); return true; } } @@ -1462,7 +1470,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // false. if (isa<UndefValue>(BI->getCondition())) { BI->setCondition(ConstantInt::getFalse(BI->getContext())); - markEdgeExecutable(BB, TI->getSuccessor(1)); + markEdgeExecutable(&*BB, TI->getSuccessor(1)); return true; } @@ -1484,7 +1492,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // the first constant. 
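The per-opcode rules spelled out above distill to one principle: force the constant that stays correct for every concrete value the other operand could take. The following is a sketch covering only the forced-constant binary-op cases with undef on the left; the real solver additionally threads lattice state and handles selects, loads, calls, and undef on the right-hand side, and the helper name is invented:

// Given a binary opcode whose first operand is undef, return the constant
// that is safe for any value of the second operand, or null to signal that
// the caller should fall back to overdefined.
static Constant *forcedConstantForUndef(unsigned Opcode, Type *Ty) {
  switch (Opcode) {
  case Instruction::Or:   // undef | X could be -1, so force -1
  case Instruction::AShr: // undef >>a X smears the unknown sign bit
    return Constant::getAllOnesValue(Ty);
  case Instruction::Mul:  // undef * X -> 0, since X could be zero
  case Instruction::And:  // undef & X -> 0, since X could be zero
  case Instruction::Shl:  // undef << X -> 0
  case Instruction::LShr: // undef >>l X -> 0
  case Instruction::UDiv: // undef / X -> 0, since X could be maxint
  case Instruction::SDiv:
  case Instruction::URem: // undef % X -> 0, since X could be 1
  case Instruction::SRem:
    return Constant::getNullValue(Ty);
  default:
    return nullptr; // no safe forced constant for this opcode
  }
}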
if (isa<UndefValue>(SI->getCondition())) { SI->setCondition(SI->case_begin().getCaseValue()); - markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); + markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor()); return true; } @@ -1506,6 +1514,7 @@ namespace { struct SCCP : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } static char ID; // Pass identification, replacement for typeid SCCP() : FunctionPass(ID) { @@ -1541,11 +1550,10 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. while (EndInst != BB->begin()) { // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; + Instruction *Inst = &*--EndInst->getIterator(); if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { + if (Inst->isEHPad()) { EndInst = Inst; continue; } @@ -1568,11 +1576,11 @@ bool SCCP::runOnFunction(Function &F) { SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. - Solver.MarkBlockExecutable(F.begin()); + Solver.MarkBlockExecutable(&F.front()); // Mark all arguments to the function as being overdefined. - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F.args()) + Solver.markAnythingOverdefined(&AI); // Solve for constants. bool ResolvedUndefs = true; @@ -1589,8 +1597,8 @@ bool SCCP::runOnFunction(Function &F) { // as we cannot modify the CFG of the function. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; continue; } @@ -1599,7 +1607,7 @@ bool SCCP::runOnFunction(Function &F) { // constants if we have found them to be of constant values. // for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) continue; @@ -1713,36 +1721,34 @@ bool IPSCCP::runOnModule(Module &M) { // If this is a strong or ODR definition of this function, then we can // propagate information about its result into callsites of it. if (!F->mayBeOverridden()) - Solver.AddTrackedFunction(F); + Solver.AddTrackedFunction(&*F); // If this function only has direct calls that we can see, we can track its // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. if (F->hasLocalLinkage()) { - if (AddressIsTaken(F)) - AddressTakenFunctions.insert(F); + if (AddressIsTaken(&*F)) + AddressTakenFunctions.insert(&*F); else { - Solver.AddArgumentTrackedFunction(F); + Solver.AddArgumentTrackedFunction(&*F); continue; } } // Assume the function is called. - Solver.MarkBlockExecutable(F->begin()); + Solver.MarkBlockExecutable(&F->front()); // Assume nothing about the incoming arguments. - for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); - AI != E; ++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F->args()) + Solver.markAnythingOverdefined(&AI); } // Loop over global variables. We inform the solver about any internal global // variables that do not have their 'addresses taken'. 
If they don't have // their addresses taken, we can propagate constants through them. - for (Module::global_iterator G = M.global_begin(), E = M.global_end(); - G != E; ++G) - if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G)) - Solver.TrackValueOfGlobalVariable(G); + for (GlobalVariable &G : M.globals()) + if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G)) + Solver.TrackValueOfGlobalVariable(&G); // Solve for constants. bool ResolvedUndefs = true; @@ -1763,7 +1769,10 @@ bool IPSCCP::runOnModule(Module &M) { SmallVector<BasicBlock*, 512> BlocksToErase; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (Solver.isBlockExecutable(F->begin())) { + if (F->isDeclaration()) + continue; + + if (Solver.isBlockExecutable(&F->front())) { for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { if (AI->use_empty() || AI->getType()->isStructTy()) continue; @@ -1771,7 +1780,7 @@ bool IPSCCP::runOnModule(Module &M) { // TODO: Could use getStructLatticeValueFor to find out if the entire // result is a constant and replace it entirely if so. - LatticeVal IV = Solver.getLatticeValueFor(AI); + LatticeVal IV = Solver.getLatticeValueFor(&*AI); if (IV.isOverdefined()) continue; Constant *CST = IV.isConstant() ? @@ -1786,28 +1795,27 @@ bool IPSCCP::runOnModule(Module &M) { } for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = TI->getSuccessor(i); + for (BasicBlock *Succ : TI->successors()) { if (!Succ->empty() && isa<PHINode>(Succ->begin())) - TI->getSuccessor(i)->removePredecessor(BB); + Succ->removePredecessor(&*BB); } if (!TI->use_empty()) TI->replaceAllUsesWith(UndefValue::get(TI->getType())); TI->eraseFromParent(); - new UnreachableInst(M.getContext(), BB); + new UnreachableInst(M.getContext(), &*BB); if (&*BB != &F->front()) - BlocksToErase.push_back(BB); + BlocksToErase.push_back(&*BB); continue; } for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy()) continue; diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 947513a36572..a7361b5fe083 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -23,12 +23,12 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SROA.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -37,8 +37,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -53,9 +51,9 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/TimeValue.h" #include 
"llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #if __cplusplus >= 201103L && !defined(NDEBUG) // We only use this for a debug check in C++11 @@ -63,6 +61,7 @@ #endif using namespace llvm; +using namespace llvm::sroa; #define DEBUG_TYPE "sroa" @@ -77,11 +76,6 @@ STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); -/// Hidden option to force the pass to not use DomTree and mem2reg, instead -/// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), - cl::Hidden); - /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", @@ -205,7 +199,6 @@ template <typename T> struct isPodLike; template <> struct isPodLike<Slice> { static const bool value = true; }; } -namespace { /// \brief Representation of the alloca slices. /// /// This class represents the slices of an alloca which are formed by its @@ -213,7 +206,7 @@ namespace { /// for the slices used and we reflect that in this structure. The uses are /// stored, sorted by increasing beginning offset and with unsplittable slices /// starting at a particular offset before splittable slices. -class AllocaSlices { +class llvm::sroa::AllocaSlices { public: /// \brief Construct the slices of a particular alloca. AllocaSlices(const DataLayout &DL, AllocaInst &AI); @@ -253,281 +246,10 @@ public: std::inplace_merge(Slices.begin(), SliceI, Slices.end()); } - // Forward declare an iterator to befriend it. + // Forward declare the iterator and range accessor for walking the + // partitions. class partition_iterator; - - /// \brief A partition of the slices. - /// - /// An ephemeral representation for a range of slices which can be viewed as - /// a partition of the alloca. This range represents a span of the alloca's - /// memory which cannot be split, and provides access to all of the slices - /// overlapping some part of the partition. - /// - /// Objects of this type are produced by traversing the alloca's slices, but - /// are only ephemeral and not persistent. - class Partition { - private: - friend class AllocaSlices; - friend class AllocaSlices::partition_iterator; - - /// \brief The begining and ending offsets of the alloca for this partition. - uint64_t BeginOffset, EndOffset; - - /// \brief The start end end iterators of this partition. - iterator SI, SJ; - - /// \brief A collection of split slice tails overlapping the partition. - SmallVector<Slice *, 4> SplitTails; - - /// \brief Raw constructor builds an empty partition starting and ending at - /// the given iterator. - Partition(iterator SI) : SI(SI), SJ(SI) {} - - public: - /// \brief The start offset of this partition. - /// - /// All of the contained slices start at or after this offset. - uint64_t beginOffset() const { return BeginOffset; } - - /// \brief The end offset of this partition. - /// - /// All of the contained slices end at or before this offset. - uint64_t endOffset() const { return EndOffset; } - - /// \brief The size of the partition. - /// - /// Note that this can never be zero. 
- uint64_t size() const { - assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); - return EndOffset - BeginOffset; - } - - /// \brief Test whether this partition contains no slices, and merely spans - /// a region occupied by split slices. - bool empty() const { return SI == SJ; } - - /// \name Iterate slices that start within the partition. - /// These may be splittable or unsplittable. They have a begin offset >= the - /// partition begin offset. - /// @{ - // FIXME: We should probably define a "concat_iterator" helper and use that - // to stitch together pointee_iterators over the split tails and the - // contiguous iterators of the partition. That would give a much nicer - // interface here. We could then additionally expose filtered iterators for - // split, unsplit, and unsplittable splices based on the usage patterns. - iterator begin() const { return SI; } - iterator end() const { return SJ; } - /// @} - - /// \brief Get the sequence of split slice tails. - /// - /// These tails are of slices which start before this partition but are - /// split and overlap into the partition. We accumulate these while forming - /// partitions. - ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } - }; - - /// \brief An iterator over partitions of the alloca's slices. - /// - /// This iterator implements the core algorithm for partitioning the alloca's - /// slices. It is a forward iterator as we don't support backtracking for - /// efficiency reasons, and re-use a single storage area to maintain the - /// current set of split slices. - /// - /// It is templated on the slice iterator type to use so that it can operate - /// with either const or non-const slice iterators. - class partition_iterator - : public iterator_facade_base<partition_iterator, - std::forward_iterator_tag, Partition> { - friend class AllocaSlices; - - /// \brief Most of the state for walking the partitions is held in a class - /// with a nice interface for examining them. - Partition P; - - /// \brief We need to keep the end of the slices to know when to stop. - AllocaSlices::iterator SE; - - /// \brief We also need to keep track of the maximum split end offset seen. - /// FIXME: Do we really? - uint64_t MaxSplitSliceEndOffset; - - /// \brief Sets the partition to be empty at given iterator, and sets the - /// end iterator. - partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) - : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { - // If not already at the end, advance our state to form the initial - // partition. - if (SI != SE) - advance(); - } - - /// \brief Advance the iterator to the next partition. - /// - /// Requires that the iterator not be at the end of the slices. - void advance() { - assert((P.SI != SE || !P.SplitTails.empty()) && - "Cannot advance past the end of the slices!"); - - // Clear out any split uses which have ended. - if (!P.SplitTails.empty()) { - if (P.EndOffset >= MaxSplitSliceEndOffset) { - // If we've finished all splits, this is easy. - P.SplitTails.clear(); - MaxSplitSliceEndOffset = 0; - } else { - // Remove the uses which have ended in the prior partition. This - // cannot change the max split slice end because we just checked that - // the prior partition ended prior to that max. 
- P.SplitTails.erase( - std::remove_if( - P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), - P.SplitTails.end()); - assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() == MaxSplitSliceEndOffset; - }) && - "Could not find the current max split slice offset!"); - assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() <= MaxSplitSliceEndOffset; - }) && - "Max split slice end offset is not actually the max!"); - } - } - - // If P.SI is already at the end, then we've cleared the split tail and - // now have an end iterator. - if (P.SI == SE) { - assert(P.SplitTails.empty() && "Failed to clear the split slices!"); - return; - } - - // If we had a non-empty partition previously, set up the state for - // subsequent partitions. - if (P.SI != P.SJ) { - // Accumulate all the splittable slices which started in the old - // partition into the split list. - for (Slice &S : P) - if (S.isSplittable() && S.endOffset() > P.EndOffset) { - P.SplitTails.push_back(&S); - MaxSplitSliceEndOffset = - std::max(S.endOffset(), MaxSplitSliceEndOffset); - } - - // Start from the end of the previous partition. - P.SI = P.SJ; - - // If P.SI is now at the end, we at most have a tail of split slices. - if (P.SI == SE) { - P.BeginOffset = P.EndOffset; - P.EndOffset = MaxSplitSliceEndOffset; - return; - } - - // If the we have split slices and the next slice is after a gap and is - // not splittable immediately form an empty partition for the split - // slices up until the next slice begins. - if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && - !P.SI->isSplittable()) { - P.BeginOffset = P.EndOffset; - P.EndOffset = P.SI->beginOffset(); - return; - } - } - - // OK, we need to consume new slices. Set the end offset based on the - // current slice, and step SJ past it. The beginning offset of the - // parttion is the beginning offset of the next slice unless we have - // pre-existing split slices that are continuing, in which case we begin - // at the prior end offset. - P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; - P.EndOffset = P.SI->endOffset(); - ++P.SJ; - - // There are two strategies to form a partition based on whether the - // partition starts with an unsplittable slice or a splittable slice. - if (!P.SI->isSplittable()) { - // When we're forming an unsplittable region, it must always start at - // the first slice and will extend through its end. - assert(P.BeginOffset == P.SI->beginOffset()); - - // Form a partition including all of the overlapping slices with this - // unsplittable slice. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - if (!P.SJ->isSplittable()) - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // We have a partition across a set of overlapping unsplittable - // partitions. - return; - } - - // If we're starting with a splittable slice, then we need to form - // a synthetic partition spanning it and any other overlapping splittable - // splices. - assert(P.SI->isSplittable() && "Forming a splittable partition!"); - - // Collect all of the overlapping splittable slices. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && - P.SJ->isSplittable()) { - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // Back upiP.EndOffset if we ended the span early when encountering an - // unsplittable slice. 
This synthesizes the early end offset of - // a partition spanning only splittable slices. - if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - assert(!P.SJ->isSplittable()); - P.EndOffset = P.SJ->beginOffset(); - } - } - - public: - bool operator==(const partition_iterator &RHS) const { - assert(SE == RHS.SE && - "End iterators don't match between compared partition iterators!"); - - // The observed positions of partitions is marked by the P.SI iterator and - // the emptyness of the split slices. The latter is only relevant when - // P.SI == SE, as the end iterator will additionally have an empty split - // slices list, but the prior may have the same P.SI and a tail of split - // slices. - if (P.SI == RHS.P.SI && - P.SplitTails.empty() == RHS.P.SplitTails.empty()) { - assert(P.SJ == RHS.P.SJ && - "Same set of slices formed two different sized partitions!"); - assert(P.SplitTails.size() == RHS.P.SplitTails.size() && - "Same slice position with differently sized non-empty split " - "slice tails!"); - return true; - } - return false; - } - - partition_iterator &operator++() { - advance(); - return *this; - } - - Partition &operator*() { return P; } - }; - - /// \brief A forward range over the partitions of the alloca's slices. - /// - /// This accesses an iterator range over the partitions of the alloca's - /// slices. It computes these partitions on the fly based on the overlapping - /// offsets of the slices and the ability to split them. It will visit "empty" - /// partitions to cover regions of the alloca only accessed via split - /// slices. - iterator_range<partition_iterator> partitions() { - return make_range(partition_iterator(begin(), end()), - partition_iterator(end(), end())); - } + iterator_range<partition_iterator> partitions(); /// \brief Access the dead users for this alloca. ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } @@ -595,6 +317,280 @@ private: /// the alloca. SmallVector<Use *, 8> DeadOperands; }; + +/// \brief A partition of the slices. +/// +/// An ephemeral representation for a range of slices which can be viewed as +/// a partition of the alloca. This range represents a span of the alloca's +/// memory which cannot be split, and provides access to all of the slices +/// overlapping some part of the partition. +/// +/// Objects of this type are produced by traversing the alloca's slices, but +/// are only ephemeral and not persistent. +class llvm::sroa::Partition { +private: + friend class AllocaSlices; + friend class AllocaSlices::partition_iterator; + + typedef AllocaSlices::iterator iterator; + + /// \brief The beginning and ending offsets of the alloca for this + /// partition. + uint64_t BeginOffset, EndOffset; + + /// \brief The start end end iterators of this partition. + iterator SI, SJ; + + /// \brief A collection of split slice tails overlapping the partition. + SmallVector<Slice *, 4> SplitTails; + + /// \brief Raw constructor builds an empty partition starting and ending at + /// the given iterator. + Partition(iterator SI) : SI(SI), SJ(SI) {} + +public: + /// \brief The start offset of this partition. + /// + /// All of the contained slices start at or after this offset. + uint64_t beginOffset() const { return BeginOffset; } + + /// \brief The end offset of this partition. + /// + /// All of the contained slices end at or before this offset. + uint64_t endOffset() const { return EndOffset; } + + /// \brief The size of the partition. + /// + /// Note that this can never be zero. 
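For orientation before the member definitions continue: a caller-side sketch of how this Partition interface is consumed, using only the accessors declared in this class (the helper name walkPartitions is hypothetical):

void walkPartitions(AllocaSlices &AS) {
  for (Partition &P : AS.partitions()) {
    uint64_t Bytes = P.size(); // always non-zero; see the assert just below
    (void)Bytes;
    for (Slice &S : P)         // slices that *begin* inside [begin, end)
      (void)S;
    for (Slice *Tail : P.splitSliceTails()) // split slices carried over
      (void)Tail;
    if (P.empty()) {
      // No slice begins here; the range is covered only by split tails.
    }
  }
}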
+ uint64_t size() const { + assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); + return EndOffset - BeginOffset; + } + + /// \brief Test whether this partition contains no slices, and merely spans + /// a region occupied by split slices. + bool empty() const { return SI == SJ; } + + /// \name Iterate slices that start within the partition. + /// These may be splittable or unsplittable. They have a begin offset >= the + /// partition begin offset. + /// @{ + // FIXME: We should probably define a "concat_iterator" helper and use that + // to stitch together pointee_iterators over the split tails and the + // contiguous iterators of the partition. That would give a much nicer + // interface here. We could then additionally expose filtered iterators for + // split, unsplit, and unsplittable splices based on the usage patterns. + iterator begin() const { return SI; } + iterator end() const { return SJ; } + /// @} + + /// \brief Get the sequence of split slice tails. + /// + /// These tails are of slices which start before this partition but are + /// split and overlap into the partition. We accumulate these while forming + /// partitions. + ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } +}; + +/// \brief An iterator over partitions of the alloca's slices. +/// +/// This iterator implements the core algorithm for partitioning the alloca's +/// slices. It is a forward iterator as we don't support backtracking for +/// efficiency reasons, and re-use a single storage area to maintain the +/// current set of split slices. +/// +/// It is templated on the slice iterator type to use so that it can operate +/// with either const or non-const slice iterators. +class AllocaSlices::partition_iterator + : public iterator_facade_base<partition_iterator, std::forward_iterator_tag, + Partition> { + friend class AllocaSlices; + + /// \brief Most of the state for walking the partitions is held in a class + /// with a nice interface for examining them. + Partition P; + + /// \brief We need to keep the end of the slices to know when to stop. + AllocaSlices::iterator SE; + + /// \brief We also need to keep track of the maximum split end offset seen. + /// FIXME: Do we really? + uint64_t MaxSplitSliceEndOffset; + + /// \brief Sets the partition to be empty at given iterator, and sets the + /// end iterator. + partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) + : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + // If not already at the end, advance our state to form the initial + // partition. + if (SI != SE) + advance(); + } + + /// \brief Advance the iterator to the next partition. + /// + /// Requires that the iterator not be at the end of the slices. + void advance() { + assert((P.SI != SE || !P.SplitTails.empty()) && + "Cannot advance past the end of the slices!"); + + // Clear out any split uses which have ended. + if (!P.SplitTails.empty()) { + if (P.EndOffset >= MaxSplitSliceEndOffset) { + // If we've finished all splits, this is easy. + P.SplitTails.clear(); + MaxSplitSliceEndOffset = 0; + } else { + // Remove the uses which have ended in the prior partition. This + // cannot change the max split slice end because we just checked that + // the prior partition ended prior to that max. 
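The pruning step just below is the standard C++ erase/remove idiom; the same operation in isolation, on plain containers rather than the pass's data structures:

#include <algorithm>
#include <cstdint>
#include <vector>

// Drop every tail whose end offset falls at or before the partition's end.
void pruneEndedTails(std::vector<uint64_t> &TailEnds, uint64_t PartitionEnd) {
  TailEnds.erase(std::remove_if(TailEnds.begin(), TailEnds.end(),
                                [&](uint64_t End) { return End <= PartitionEnd; }),
                 TailEnds.end());
}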
+ P.SplitTails.erase( + std::remove_if( + P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), + P.SplitTails.end()); + assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && + "Could not find the current max split slice offset!"); + assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && + "Max split slice end offset is not actually the max!"); + } + } + + // If P.SI is already at the end, then we've cleared the split tail and + // now have an end iterator. + if (P.SI == SE) { + assert(P.SplitTails.empty() && "Failed to clear the split slices!"); + return; + } + + // If we had a non-empty partition previously, set up the state for + // subsequent partitions. + if (P.SI != P.SJ) { + // Accumulate all the splittable slices which started in the old + // partition into the split list. + for (Slice &S : P) + if (S.isSplittable() && S.endOffset() > P.EndOffset) { + P.SplitTails.push_back(&S); + MaxSplitSliceEndOffset = + std::max(S.endOffset(), MaxSplitSliceEndOffset); + } + + // Start from the end of the previous partition. + P.SI = P.SJ; + + // If P.SI is now at the end, we at most have a tail of split slices. + if (P.SI == SE) { + P.BeginOffset = P.EndOffset; + P.EndOffset = MaxSplitSliceEndOffset; + return; + } + + // If the we have split slices and the next slice is after a gap and is + // not splittable immediately form an empty partition for the split + // slices up until the next slice begins. + if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && + !P.SI->isSplittable()) { + P.BeginOffset = P.EndOffset; + P.EndOffset = P.SI->beginOffset(); + return; + } + } + + // OK, we need to consume new slices. Set the end offset based on the + // current slice, and step SJ past it. The beginning offset of the + // partition is the beginning offset of the next slice unless we have + // pre-existing split slices that are continuing, in which case we begin + // at the prior end offset. + P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; + P.EndOffset = P.SI->endOffset(); + ++P.SJ; + + // There are two strategies to form a partition based on whether the + // partition starts with an unsplittable slice or a splittable slice. + if (!P.SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at + // the first slice and will extend through its end. + assert(P.BeginOffset == P.SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + if (!P.SJ->isSplittable()) + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // We have a partition across a set of overlapping unsplittable + // partitions. + return; + } + + // If we're starting with a splittable slice, then we need to form + // a synthetic partition spanning it and any other overlapping splittable + // splices. + assert(P.SI->isSplittable() && "Forming a splittable partition!"); + + // Collect all of the overlapping splittable slices. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && + P.SJ->isSplittable()) { + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // Back upiP.EndOffset if we ended the span early when encountering an + // unsplittable slice. 
This synthesizes the early end offset of + // a partition spanning only splittable slices. + if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + assert(!P.SJ->isSplittable()); + P.EndOffset = P.SJ->beginOffset(); + } + } + +public: + bool operator==(const partition_iterator &RHS) const { + assert(SE == RHS.SE && + "End iterators don't match between compared partition iterators!"); + + // The observed positions of partitions is marked by the P.SI iterator and + // the emptiness of the split slices. The latter is only relevant when + // P.SI == SE, as the end iterator will additionally have an empty split + // slices list, but the prior may have the same P.SI and a tail of split + // slices. + if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) { + assert(P.SJ == RHS.P.SJ && + "Same set of slices formed two different sized partitions!"); + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && + "Same slice position with differently sized non-empty split " + "slice tails!"); + return true; + } + return false; + } + + partition_iterator &operator++() { + advance(); + return *this; + } + + Partition &operator*() { return P; } +}; + +/// \brief A forward range over the partitions of the alloca's slices. +/// +/// This accesses an iterator range over the partitions of the alloca's +/// slices. It computes these partitions on the fly based on the overlapping +/// offsets of the slices and the ability to split them. It will visit "empty" +/// partitions to cover regions of the alloca only accessed via split +/// slices. +iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() { + return make_range(partition_iterator(begin(), end()), + partition_iterator(end(), end())); } static Value *foldSelectInst(SelectInst &SI) { @@ -1072,217 +1068,6 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -namespace { -/// \brief Implementation of LoadAndStorePromoter for promoting allocas. -/// -/// This subclass of LoadAndStorePromoter adds overrides to handle promoting -/// the loads and stores of an alloca instruction, as well as updating its -/// debug information. This is used when a domtree is unavailable and thus -/// mem2reg in its full form can't be used to handle promotion of allocas to -/// scalar values. -class AllocaPromoter : public LoadAndStorePromoter { - AllocaInst &AI; - DIBuilder &DIB; - - SmallVector<DbgDeclareInst *, 4> DDIs; - SmallVector<DbgValueInst *, 4> DVIs; - -public: - AllocaPromoter(ArrayRef<const Instruction *> Insts, - SSAUpdater &S, - AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - - void run(const SmallVectorImpl<Instruction *> &Insts) { - // Retain the debug information attached to the alloca for use when - // rewriting loads and stores. - if (auto *L = LocalAsMetadata::getIfExists(&AI)) { - if (auto *DINode = MetadataAsValue::getIfExists(AI.getContext(), L)) { - for (User *U : DINode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); - } - } - - LoadAndStorePromoter::run(Insts); - - // While we have the debug information, clear it off of the alloca. The - // caller takes care of deleting the alloca. 
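Deeper in this deleted helper, the isInstInList override walks a pointer operand back through no-op casts and GEPs to decide whether it reaches the alloca; the same pattern extracted into a standalone, hypothetical helper:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instructions.h"

// Follow Ptr up through value-preserving instructions; the visited set only
// guards against pathological cycles in the def chain.
static bool reachesAlloca(llvm::Value *Ptr, llvm::AllocaInst *AI) {
  llvm::SmallPtrSet<llvm::Value *, 4> Visited;
  do {
    if (Ptr == AI)
      return true;
    if (auto *BCI = llvm::dyn_cast<llvm::BitCastInst>(Ptr))
      Ptr = BCI->getOperand(0);
    else if (auto *GEPI = llvm::dyn_cast<llvm::GetElementPtrInst>(Ptr))
      Ptr = GEPI->getPointerOperand();
    else
      return false;
  } while (Visited.insert(Ptr).second);
  return false; // hit a cycle without reaching AI
}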
- while (!DDIs.empty()) - DDIs.pop_back_val()->eraseFromParent(); - while (!DVIs.empty()) - DVIs.pop_back_val()->eraseFromParent(); - } - - bool - isInstInList(Instruction *I, - const SmallVectorImpl<Instruction *> &Insts) const override { - Value *Ptr; - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - Ptr = LI->getOperand(0); - else - Ptr = cast<StoreInst>(I)->getPointerOperand(); - - // Only used to detect cycles, which will be rare and quickly found as - // we're walking up a chain of defs rather than down through uses. - SmallPtrSet<Value *, 4> Visited; - - do { - if (Ptr == &AI) - return true; - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) - Ptr = BCI->getOperand(0); - else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) - Ptr = GEPI->getPointerOperand(); - else - return false; - - } while (Visited.insert(Ptr).second); - - return false; - } - - void updateDebugInfo(Instruction *Inst) const override { - for (DbgDeclareInst *DDI : DDIs) - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - for (DbgValueInst *DVI : DVIs) { - Value *Arg = nullptr; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // If an argument is zero extended then use argument directly. The ZExt - // may be zapped by an optimization pass in future. - if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(ZExt->getOperand(0)); - else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(SExt->getOperand(0)); - if (!Arg) - Arg = SI->getValueOperand(); - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Arg = LI->getPointerOperand(); - } else { - continue; - } - DIB.insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), - DVI->getExpression(), DVI->getDebugLoc(), - Inst); - } - } -}; -} // end anon namespace - -namespace { -/// \brief An optimization pass providing Scalar Replacement of Aggregates. -/// -/// This pass takes allocations which can be completely analyzed (that is, they -/// don't escape) and tries to turn them into scalar SSA values. There are -/// a few steps to this process. -/// -/// 1) It takes allocations of aggregates and analyzes the ways in which they -/// are used to try to split them into smaller allocations, ideally of -/// a single scalar data type. It will split up memcpy and memset accesses -/// as necessary and try to isolate individual scalar accesses. -/// 2) It will transform accesses into forms which are suitable for SSA value -/// promotion. This can be replacing a memset with a scalar store of an -/// integer value, or it can involve speculating operations on a PHI or -/// select to be a PHI or select of the results. -/// 3) Finally, this will try to detect a pattern of accesses which map cleanly -/// onto insert and extract operations on a vector value, and convert them to -/// this form. By doing so, it will enable promotion of vector aggregates to -/// SSA vector values. -class SROA : public FunctionPass { - const bool RequiresDomTree; - - LLVMContext *C; - DominatorTree *DT; - AssumptionCache *AC; - - /// \brief Worklist of alloca instructions to simplify. - /// - /// Each alloca in the function is added to this. Each new alloca formed gets - /// added to it as well to recursively simplify unless that alloca can be - /// directly promoted. 
Finally, each time we rewrite a use of an alloca other - /// the one being actively rewritten, we add it back onto the list if not - /// already present to ensure it is re-visited. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist; - - /// \brief A collection of instructions to delete. - /// We try to batch deletions to simplify code and make things a bit more - /// efficient. - SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts; - - /// \brief Post-promotion worklist. - /// - /// Sometimes we discover an alloca which has a high probability of becoming - /// viable for SROA after a round of promotion takes place. In those cases, - /// the alloca is enqueued here for re-processing. - /// - /// Note that we have to be very careful to clear allocas out of this list in - /// the event they are deleted. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist; - - /// \brief A collection of alloca instructions we can directly promote. - std::vector<AllocaInst *> PromotableAllocas; - - /// \brief A worklist of PHIs to speculate prior to promoting allocas. - /// - /// All of these PHIs have been checked for the safety of speculation and by - /// being speculated will allow promoting allocas currently in the promotable - /// queue. - SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs; - - /// \brief A worklist of select instructions to speculate prior to promoting - /// allocas. - /// - /// All of these select instructions have been checked for the safety of - /// speculation and by being speculated will allow promoting allocas - /// currently in the promotable queue. - SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; - -public: - SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), - DT(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - - const char *getPassName() const override { return "SROA"; } - static char ID; - -private: - friend class PHIOrSelectSpeculator; - friend class AllocaSliceRewriter; - - bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); - AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P); - bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); - bool runOnAlloca(AllocaInst &AI); - void clobberUse(Use &U); - void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); - bool promoteAllocas(Function &F); -}; -} - -char SROA::ID = 0; - -FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { - return new SROA(RequiresDomTree); -} - -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) - /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. static Type *findCommonType(AllocaSlices::const_iterator B, @@ -1373,7 +1158,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
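The hunk below swaps an implicit Instruction*-to-iterator conversion for an explicit one (part of this commit's broader iterator cleanup); the check it sits in, sketched as a free function with a hypothetical name, assuming PN and LI share a block with PN first:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

// Any intervening instruction that may write memory makes speculating the
// load across the PHI unsafe.
static bool noClobberBetween(llvm::PHINode &PN, llvm::LoadInst *LI) {
  for (llvm::BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
    if (BBI->mayWriteToMemory())
      return false;
  return true;
}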
- for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; @@ -1934,10 +1719,10 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// \brief Test whether the given slice use can be promoted to a vector. /// -/// This function is called to test each entry in a partioning which is slated +/// This function is called to test each entry in a partition which is slated /// for a single slice. -static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, - const Slice &S, VectorType *Ty, +static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, + VectorType *Ty, uint64_t ElementSize, const DataLayout &DL) { // First validate the slice offsets. @@ -2012,8 +1797,7 @@ static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, - const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector<VectorType *, 4> CandidateTys; @@ -2130,7 +1914,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t RelEnd = S.endOffset() - AllocBeginOffset; // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. + // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; @@ -2199,7 +1983,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, +static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. @@ -2368,14 +2152,14 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, return V; } -namespace { /// \brief Visitor to rewrite instructions using p particular slice of an alloca /// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. -class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { +class llvm::sroa::AllocaSliceRewriter + : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. 
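For readers new to the CRTP visitor used here: the base class dispatches on the instruction's opcode and calls back into the derived class's visit methods, which is why those methods can stay private once the base is befriended. A minimal, hypothetical visitor showing the same shape:

#include "llvm/IR/InstVisitor.h"

class LoadCounter : public llvm::InstVisitor<LoadCounter, bool> {
  friend class llvm::InstVisitor<LoadCounter, bool>;
  unsigned NumLoads = 0;

  bool visitLoadInst(llvm::LoadInst &LI) { ++NumLoads; return true; }
  bool visitInstruction(llvm::Instruction &I) { return false; } // fallback

public:
  unsigned count(llvm::Function &F) {
    visit(F); // per-instruction bool results are discarded by the range visit
    return NumLoads;
  }
};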
friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; @@ -2583,9 +2367,19 @@ private: V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) - V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, - "extract"); + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { + IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); + V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); + } + // It is possible that the extracted type is not the load type. This + // happens if there is a load past the end of the alloca, and as + // a consequence the slice is narrower but still a candidate for integer + // lowering. To handle this case, we just zero extend the extracted + // integer. + assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 && + "Can only handle an extract for an overly wide load"); + if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8) + V = IRB.CreateZExt(V, LI.getType()); return V; } @@ -2648,7 +2442,7 @@ private: DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); + IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving @@ -3126,7 +2920,7 @@ private: // dominate the PHI. IRBuilderTy PtrBuilder(IRB); if (isa<PHINode>(OldPtr)) - PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt()); + PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt()); else PtrBuilder.SetInsertPoint(OldPtr); PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); @@ -3169,7 +2963,6 @@ private: return true; } }; -} namespace { /// \brief Visitor to rewrite aggregate loads and stores as scalar. @@ -3181,8 +2974,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; - const DataLayout &DL; - /// Queue of pointer uses to analyze and potentially rewrite. SmallVector<Use *, 8> Queue; @@ -3194,8 +2985,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { Use *U; public: - AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} - /// Rewrite loads and stores through a pointer and all pointers derived from /// it. bool rewrite(Instruction &I) { @@ -3711,7 +3500,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { return true; }), Stores.end()); - // Now we have to go *back* through all te stores, because a later store may + // Now we have to go *back* through all the stores, because a later store may // have caused an earlier store's load to become unsplittable and if it is // unsplittable for the later store, then we can't rely on it being split in // the earlier store either. 
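Schematically, that re-scan is a filter iterated to a fixed point, since each erased store can invalidate a decision already made for an earlier store. A simplified standalone model (the predicate is a stand-in, not the pass's actual logic):

#include <vector>

// Iterate until a whole sweep removes nothing: erasing one store can mark its
// load unsplittable and thereby disqualify a previously accepted store.
template <typename StoreT, typename Pred>
bool filterToFixpoint(std::vector<StoreT> &Stores, Pred LoadBecameUnsplittable) {
  bool Changed = false, LocalChange = true;
  while (LocalChange) {
    LocalChange = false;
    for (auto It = Stores.begin(); It != Stores.end();) {
      if (LoadBecameUnsplittable(*It)) {
        It = Stores.erase(It);
        Changed = LocalChange = true;
      } else {
        ++It;
      }
    }
  }
  return Changed;
}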
@@ -3773,7 +3562,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { "Cannot represent alloca access size using 64-bit integers!"); Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); @@ -3825,7 +3614,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } Value *StoreBasePtr = SI->getPointerOperand(); - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); @@ -3914,7 +3703,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (SplitLoads) { PLoad = (*SplitLoads)[Idx]; } else { - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3924,7 +3713,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } // And store this partition. - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); StoreInst *PStore = IRB.CreateAlignedStore( PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3972,7 +3761,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Mark the original store as dead now that we've split it up and kill its // slice. Note that we leave the original load in place unless this store - // was its ownly use. It may in turn be split up if it is an alloca load + // was its only use. It may in turn be split up if it is an alloca load // for some other alloca, but it may be a normal load. This may introduce // redundant loads, but where those can be merged the rest of the optimizer // should handle the merging, and this uncovers SSA splits which is more @@ -4024,7 +3813,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { /// at enabling promotion and if it was successful queues the alloca to be /// promoted. AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P) { + Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. @@ -4230,12 +4019,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); // Migrate debug information from the old alloca to the new alloca(s) - // and the individial partitions. + // and the individual partitions. if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) { auto *Var = DbgDecl->getVariable(); auto *Expr = DbgDecl->getExpression(); - DIBuilder DIB(*AI.getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); bool IsSplit = Pieces.size() > 1; for (auto Piece : Pieces) { // Create a piece expression describing the new partition or reuse AI's @@ -4308,7 +4096,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(DL); + AggLoadStoreRewriter AggRewriter; Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. 
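Stripped of its offset bookkeeping, such a builder is a worklist walk over the alloca's transitive uses, visiting each use once. A reduced sketch (hypothetical helper; the real builder also accumulates byte offsets and classifies each access into a slice):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"

// Visit every instruction transitively using AI exactly once; casts and GEPs
// are transparent, so their own uses are queued in turn.
static void walkUses(llvm::AllocaInst &AI) {
  llvm::SmallVector<llvm::Use *, 8> Worklist;
  llvm::SmallPtrSet<llvm::Use *, 8> Visited;
  auto Enqueue = [&](llvm::Instruction &I) {
    for (llvm::Use &U : I.uses())
      if (Visited.insert(&U).second)
        Worklist.push_back(&U);
  };
  Enqueue(AI);
  while (!Worklist.empty()) {
    llvm::Use *U = Worklist.pop_back_val();
    auto *I = llvm::cast<llvm::Instruction>(U->getUser());
    if (llvm::isa<llvm::BitCastInst>(I) || llvm::isa<llvm::GetElementPtrInst>(I))
      Enqueue(*I);
    // Loads, stores, memory intrinsics etc. would be classified here.
  }
}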
@@ -4388,107 +4176,29 @@ void SROA::deleteDeadInstructions( } } -static void enqueueUsersInWorklist(Instruction &I, - SmallVectorImpl<Instruction *> &Worklist, - SmallPtrSetImpl<Instruction *> &Visited) { - for (User *U : I.users()) - if (Visited.insert(cast<Instruction>(U)).second) - Worklist.push_back(cast<Instruction>(U)); -} - /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in /// the PromotableAllocas list. If that list is empty, there is nothing to do. -/// If there is a domtree available, we attempt to promote using the full power -/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is -/// based on the SSAUpdater utilities. This function returns whether any -/// promotion occurred. +/// This function returns whether any promotion occurred. bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; NumPromoted += PromotableAllocas.size(); - if (DT && !ForceSSAUpdater) { - DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); - PromotableAllocas.clear(); - return true; - } - - DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); - SSAUpdater SSA; - DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); - SmallVector<Instruction *, 64> Insts; - - // We need a worklist to walk the uses of each alloca. - SmallVector<Instruction *, 8> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - SmallVector<Instruction *, 32> DeadInsts; - - for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { - AllocaInst *AI = PromotableAllocas[Idx]; - Insts.clear(); - Worklist.clear(); - Visited.clear(); - - enqueueUsersInWorklist(*AI, Worklist, Visited); - - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - // FIXME: Currently the SSAUpdater infrastructure doesn't reason about - // lifetime intrinsics and so we strip them (and the bitcasts+GEPs - // leading to them) here. Eventually it should use them to optimize the - // scalar values produced. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - assert(II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end); - II->eraseFromParent(); - continue; - } - - // Push the loads and stores we find onto the list. SROA will already - // have validated that all loads and stores are viable candidates for - // promotion. - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - assert(LI->getType() == AI->getAllocatedType()); - Insts.push_back(LI); - continue; - } - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); - Insts.push_back(SI); - continue; - } - - // For everything else, we know that only no-op bitcasts and GEPs will - // make it this far, just recurse through them and recall them for later - // removal. 
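All of this removed SSAUpdater machinery collapses into the single mem2reg call that survives below; schematically (the helper name is hypothetical, the PromoteMemToReg signature is the one this commit calls):

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>

// Promote a batch of allocas already known to be promotable; the third
// argument is the (unused here) AliasSetTracker slot.
static void promoteBatch(std::vector<llvm::AllocaInst *> &Allocas,
                         llvm::DominatorTree &DT, llvm::AssumptionCache *AC) {
  if (Allocas.empty())
    return;
  llvm::PromoteMemToReg(Allocas, DT, /*AST=*/nullptr, AC);
  Allocas.clear();
}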
- DeadInsts.push_back(I); - enqueueUsersInWorklist(*I, Worklist, Visited); - } - AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - while (!DeadInsts.empty()) - DeadInsts.pop_back_val()->eraseFromParent(); - AI->eraseFromParent(); - } - + DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } -bool SROA::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - +PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, + AssumptionCache &RunAC) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DT = &RunDT; + AC = &RunAC; BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -4527,12 +4237,55 @@ bool SROA::runOnFunction(Function &F) { PostPromotionWorklist.clear(); } while (!Worklist.empty()); - return Changed; + // FIXME: Even when promoting allocas we should preserve some abstract set of + // CFG-specific analyses. + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -void SROA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionCacheTracker>(); - if (RequiresDomTree) - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); +PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) { + return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F)); } + +/// A legacy pass for the legacy pass manager that wraps the \c SROA pass. +/// +/// This is in the llvm namespace purely to allow it to be a friend of the \c +/// SROA pass. +class llvm::sroa::SROALegacyPass : public FunctionPass { + /// The SROA implementation. 
+ SROA Impl; + +public: + SROALegacyPass() : FunctionPass(ID) { + initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + auto PA = Impl.runImpl( + F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); + return !PA.areAllPreserved(); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.setPreservesCFG(); + } + + const char *getPassName() const override { return "SROA"; } + static char ID; +}; + +char SROALegacyPass::ID = 0; + +FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); } + +INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", + "Scalar Replacement Of Aggregates", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", + false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index d5d360571f88..52d477cc9573 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -16,7 +16,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" @@ -27,10 +30,9 @@ using namespace llvm; /// initializeScalarOptsPasses - Initialize all passes linked into the /// ScalarOpts library. 
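The SROALegacyPass definition above is the standard recipe for exposing a new-style pass through the legacy pass manager; in outline, for a hypothetical pass Foo assumed to expose PreservedAnalyses runImpl(Function &, DominatorTree &):

#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"

class FooLegacyPass : public llvm::FunctionPass {
  Foo Impl; // the new-pass-manager implementation, held by value

public:
  static char ID;
  FooLegacyPass() : llvm::FunctionPass(ID) {}

  bool runOnFunction(llvm::Function &F) override {
    auto PA = Impl.runImpl(
        F, getAnalysis<llvm::DominatorTreeWrapperPass>().getDomTree());
    return !PA.areAllPreserved(); // the legacy PM only wants "changed?"
  }
  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    AU.addRequired<llvm::DominatorTreeWrapperPass>();
  }
};
char FooLegacyPass::ID = 0;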
void llvm::initializeScalarOpts(PassRegistry &Registry) { - initializeADCEPass(Registry); + initializeADCELegacyPassPass(Registry); initializeBDCEPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); - initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -66,7 +68,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRewriteStatepointsForGCPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); - initializeSROAPass(Registry); + initializeSROALegacyPassPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); @@ -81,6 +83,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializePlaceSafepointsPass(Registry); initializeFloat2IntPass(Registry); initializeLoopDistributePass(Registry); + initializeLoopLoadEliminationPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -225,15 +228,15 @@ void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) { } void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); + unwrap(PM)->add(createTypeBasedAAWrapperPass()); } void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createScopedNoAliasAAPass()); + unwrap(PM)->add(createScopedNoAliasAAWrapperPass()); } void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createBasicAliasAnalysisPass()); + unwrap(PM)->add(createBasicAAWrapperPass()); } void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index d955da7ce75d..114d22ddf2e4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -60,6 +60,7 @@ STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); STATISTIC(NumConverted, "Number of aggregates converted to scalar"); namespace { +#define SROA SROA_ struct SROA : public FunctionPass { SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { @@ -382,8 +383,8 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } - AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", - AI->getParent()->begin()); + AllocaInst *NewAI = + new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front()); ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } @@ -1195,7 +1196,7 @@ static bool isSafePHIToSpeculate(PHINode *PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
- for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 049300350857..054bacdc706b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -253,10 +253,10 @@ bool Scalarizer::doInitialization(Module &M) { } bool Scalarizer::runOnFunction(Function &F) { - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { - Instruction *I = II; + assert(Gathered.empty() && Scattered.empty()); + for (BasicBlock &BB : F) { + for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { + Instruction *I = &*II; bool Done = visit(I); ++II; if (Done && I->getType()->isVoidTy()) @@ -285,7 +285,7 @@ Scatterer Scalarizer::scatter(Instruction *Point, Value *V) { } // In the fallback case, just put the scattered before Point and // keep the result local to Point. - return Scatterer(Point->getParent(), Point, V); + return Scatterer(Point->getParent(), Point->getIterator(), V); } // Replace Op with the gathered form of the components in CV. Defer the @@ -377,7 +377,7 @@ bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(I.getParent(), &I); + IRBuilder<> Builder(&I); Scatterer Op0 = scatter(&I, I.getOperand(0)); Scatterer Op1 = scatter(&I, I.getOperand(1)); assert(Op0.size() == NumElems && "Mismatched binary operation"); @@ -397,7 +397,7 @@ bool Scalarizer::visitSelectInst(SelectInst &SI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Op1 = scatter(&SI, SI.getOperand(1)); Scatterer Op2 = scatter(&SI, SI.getOperand(2)); assert(Op1.size() == NumElems && "Mismatched select"); @@ -438,7 +438,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { if (!VT) return false; - IRBuilder<> Builder(GEPI.getParent(), &GEPI); + IRBuilder<> Builder(&GEPI); unsigned NumElems = VT->getNumElements(); unsigned NumIndices = GEPI.getNumIndices(); @@ -472,7 +472,7 @@ bool Scalarizer::visitCastInst(CastInst &CI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(CI.getParent(), &CI); + IRBuilder<> Builder(&CI); Scatterer Op0 = scatter(&CI, CI.getOperand(0)); assert(Op0.size() == NumElems && "Mismatched cast"); ValueVector Res; @@ -492,7 +492,7 @@ bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { unsigned DstNumElems = DstVT->getNumElements(); unsigned SrcNumElems = SrcVT->getNumElements(); - IRBuilder<> Builder(BCI.getParent(), &BCI); + IRBuilder<> Builder(&BCI); Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); ValueVector Res; Res.resize(DstNumElems); @@ -569,7 +569,7 @@ bool Scalarizer::visitPHINode(PHINode &PHI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(PHI.getParent(), &PHI); + IRBuilder<> Builder(&PHI); ValueVector Res; Res.resize(NumElems); @@ -600,7 +600,7 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(LI.getParent(), &LI); + IRBuilder<> Builder(&LI); Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); ValueVector Res; Res.resize(NumElems); @@ -625,7 +625,7 
@@ bool Scalarizer::visitStoreInst(StoreInst &SI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Ptr = scatter(&SI, SI.getPointerOperand()); Scatterer Val = scatter(&SI, FullValue); @@ -642,7 +642,9 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) { // Delete the instructions that we scalarized. If a full vector result // is still needed, recreate it using InsertElements. bool Scalarizer::finish() { - if (Gathered.empty()) + // The presence of data in Gathered or Scattered indicates changes + // made to the Function. + if (Gathered.empty() && Scattered.empty()) return false; for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); GMI != GME; ++GMI) { @@ -655,7 +657,7 @@ bool Scalarizer::finish() { Value *Res = UndefValue::get(Ty); BasicBlock *BB = Op->getParent(); unsigned Count = Ty->getVectorNumElements(); - IRBuilder<> Builder(BB, Op); + IRBuilder<> Builder(Op); if (isa<PHINode>(Op)) Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); for (unsigned I = 0; I < Count; ++I) diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 4a875311881a..86a10d2a1612 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,6 +156,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -164,6 +168,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Operator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -174,6 +179,7 @@ #include "llvm/IR/IRBuilder.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<bool> DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), @@ -319,8 +325,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } bool doInitialization(Module &M) override { @@ -373,15 +382,42 @@ private: /// /// Verified in @i32_add in split-gep.ll bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); + /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow. + /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting + /// the constant offset. After extraction, it becomes desirable to reunion the + /// distributed sexts. For example, + /// + /// &a[sext(i +nsw (j +nsw 5)] + /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)] + /// => constant extraction &a[sext(i) + sext(j)] + 5 + /// => reunion &a[sext(i +nsw j)] + 5 + bool reuniteExts(Function &F); + /// A helper that reunites sexts in an instruction. + bool reuniteExts(Instruction *I); + /// Find the closest dominator of <Dominatee> that is equivalent to <Key>. 
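The reunion described above hinges on first recognizing the distributed form; a minimal sketch of that recognition step using PatternMatch (hypothetical helper, mirroring the matchers reuniteExts uses):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"

// Recognize sext(a) op sext(b); if a matching narrow a op b that cannot sign
// overflow dominates I, the whole expression can fold to sext(a op b).
static bool matchesDistributedSExt(llvm::Instruction *I, llvm::Value *&LHS,
                                   llvm::Value *&RHS) {
  using namespace llvm::PatternMatch;
  return (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
          match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) &&
         LHS->getType() == RHS->getType();
}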
+ Instruction *findClosestMatchingDominator(const SCEV *Key, + Instruction *Dominatee); /// Verify F is free of dead code. void verifyNoDeadCode(Function &F); + bool hasMoreThanOneUseInLoop(Value *v, Loop *L); + // Swap the index operand of two GEP. + void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second); + // Check if it is safe to swap operand of two GEP. + bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second, + Loop *CurLoop); + const DataLayout *DL; - const DominatorTree *DT; + DominatorTree *DT; + ScalarEvolution *SE; const TargetMachine *TM; + + LoopInfo *LI; + TargetLibraryInfo *TLI; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. bool LowerGEP; + DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs; }; } // anonymous namespace @@ -391,7 +427,10 @@ INITIALIZE_PASS_BEGIN( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, @@ -734,6 +773,13 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Type *I8PtrTy = Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); Value *ResultPtr = Variadic->getOperand(0); + Loop *L = LI->getLoopFor(Variadic->getParent()); + // Check if the base is not loop invariant or used more than once. + bool isSwapCandidate = + L && L->isLoopInvariant(ResultPtr) && + !hasMoreThanOneUseInLoop(ResultPtr, L); + Value *FirstResult = nullptr; + if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); @@ -762,6 +808,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( // Create an ugly GEP with a single index for each index. ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); + if (FirstResult == nullptr) + FirstResult = ResultPtr; } } @@ -770,7 +818,17 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); - } + } else + isSwapCandidate = false; + + // If we created a GEP with constant index, and the base is loop invariant, + // then we swap the first one with it, so LICM can move constant GEP out + // later. + GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult); + GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr); + if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L)) + swapGEPOperand(FirstGEP, SecondGEP); + if (ResultPtr->getType() != Variadic->getType()) ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); @@ -891,13 +949,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Clear the inbounds attribute because the new index may be off-bound. 
// e.g., // - // b = add i64 a, 5 - // addr = gep inbounds float* p, i64 b + // b = add i64 a, 5 + // addr = gep inbounds float, float* p, i64 b // // is transformed to: // - // addr2 = gep float* p, i64 a - // addr = gep float* addr2, i64 5 + // addr2 = gep float, float* p, i64 a ; inbounds removed + // addr = gep inbounds float, float* addr2, i64 5 // // If a is -4, although the old index b is in bounds, the new index a is // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the @@ -907,6 +965,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // // TODO(jingyue): do some range analysis to keep as many inbounds as // possible. GEPs with inbounds are more friendly to alias analysis. + bool GEPWasInBounds = GEP->isInBounds(); GEP->setIsInBounds(false); // Lowers a GEP to either GEPs with a single index or arithmetic operations. @@ -968,6 +1027,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); } else { // Unlikely but possible. For example, // #pragma pack(1) @@ -990,6 +1051,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { Type::getInt8Ty(GEP->getContext()), NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); if (GEP->getType() != I8PtrTy) NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); } @@ -1008,24 +1071,96 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { return false; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) Changed |= splitGEP(GEP); - } - // No need to split GEP ConstantExprs because all its indices are constant - // already. - } + // No need to split GEP ConstantExprs because all its indices are constant + // already. } + Changed |= reuniteExts(F); + if (VerifyNoDeadCode) verifyNoDeadCode(F); return Changed; } +Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator( + const SCEV *Key, Instruction *Dominatee) { + auto Pos = DominatingExprs.find(Key); + if (Pos == DominatingExprs.end()) + return nullptr; + + auto &Candidates = Pos->second; + // Because we process the basic blocks in pre-order of the dominator tree, a + // candidate that doesn't dominate the current instruction won't dominate any + // future instruction either. Therefore, we pop it out of the stack. This + // optimization makes the algorithm O(n). 
+ while (!Candidates.empty()) { + Instruction *Candidate = Candidates.back(); + if (DT->dominates(Candidate, Dominatee)) + return Candidate; + Candidates.pop_back(); + } + return nullptr; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { + if (!SE->isSCEVable(I->getType())) + return false; + + // Dom: LHS+RHS + // I: sext(LHS)+sext(RHS) + // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom). + // TODO: handle zext + Value *LHS = nullptr, *RHS = nullptr; + if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) || + match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) { + if (LHS->getType() == RHS->getType()) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + if (auto *Dom = findClosestMatchingDominator(Key, I)) { + Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); + NewSExt->takeName(I); + I->replaceAllUsesWith(NewSExt); + RecursivelyDeleteTriviallyDeadInstructions(I); + return true; + } + } + } + + // Add I to DominatingExprs if it's an add/sub that can't sign overflow. + if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) || + match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) { + if (isKnownNotFullPoison(I)) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + DominatingExprs[Key].push_back(I); + } + } + return false; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { + bool Changed = false; + DominatingExprs.clear(); + for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT); + Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { + BasicBlock *BB = Node->getBlock(); + for (auto I = BB->begin(); I != BB->end(); ) { + Instruction *Cur = &*I++; + Changed |= reuniteExts(Cur); + } + } + return Changed; +} + void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { for (auto &B : F) { for (auto &I : B) { @@ -1038,3 +1173,93 @@ } } } + +bool SeparateConstOffsetFromGEP::isLegalToSwapOperand( + GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) { + if (!FirstGEP || !FirstGEP->hasOneUse()) + return false; + + if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent()) + return false; + + if (FirstGEP == SecondGEP) + return false; + + unsigned FirstNum = FirstGEP->getNumOperands(); + unsigned SecondNum = SecondGEP->getNumOperands(); + // Give up if the number of operands is not 2. + if (FirstNum != SecondNum || FirstNum != 2) + return false; + + Value *FirstBase = FirstGEP->getOperand(0); + Value *SecondBase = SecondGEP->getOperand(0); + Value *FirstOffset = FirstGEP->getOperand(1); + // Give up if the index of the first GEP is loop invariant. + if (CurLoop->isLoopInvariant(FirstOffset)) + return false; + + // Give up if the bases don't have the same type. + if (FirstBase->getType() != SecondBase->getType()) + return false; + + Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset); + + // Check if the second operand of the first GEP has a constant coefficient. + // For example, in the following code, we won't gain anything by + // hoisting the second GEP out because the second GEP can be folded away. + // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256 + // %67 = shl i64 %scevgep.sum.ur159, 2 + // %uglygep160 = getelementptr i8* %65, i64 %67 + // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024 + + // Skip a constant shift instruction, which may have been generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() && + isa<ConstantInt>(FirstOffsetDef->getOperand(1))) + FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0)); + + // Give up if FirstOffsetDef is an Add or Sub with a constant, + // because the swap may not be profitable at all due to constant folding. + if (FirstOffsetDef) + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) { + unsigned opc = BO->getOpcode(); + if ((opc == Instruction::Add || opc == Instruction::Sub) && + (isa<ConstantInt>(BO->getOperand(0)) || + isa<ConstantInt>(BO->getOperand(1)))) + return false; + } + return true; +} + +bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) { + int UsesInLoop = 0; + for (User *U : V->users()) { + if (Instruction *User = dyn_cast<Instruction>(U)) + if (L->contains(User)) + if (++UsesInLoop > 1) + return true; + } + return false; +} + +void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, + GetElementPtrInst *Second) { + Value *Offset1 = First->getOperand(1); + Value *Offset2 = Second->getOperand(1); + First->setOperand(1, Offset2); + Second->setOperand(1, Offset1); + + // We changed p+o+c to p+c+o; p+c may not be inbounds anymore. + const DataLayout &DAL = First->getModule()->getDataLayout(); + APInt Offset(DAL.getPointerSizeInBits( + cast<PointerType>(First->getType())->getAddressSpace()), + 0); + Value *NewBase = + First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset); + uint64_t ObjectSize; + if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) || + Offset.ugt(ObjectSize)) { + First->setIsInBounds(false); + Second->setIsInBounds(false); + } else + First->setIsInBounds(true); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 231411a16c05..63c8836bf381 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" @@ -67,15 +68,14 @@ static bool mergeEmptyReturnBlocks(Function &F) { // single PHI node that is the operand to the return. if (Ret != &BB.front()) { // Check for something else in the block. - BasicBlock::iterator I = Ret; + BasicBlock::iterator I(Ret); --I; // Skip over debug info. while (isa<DbgInfoIntrinsic>(I) && I != BB.begin()) --I; if (!isa<DbgInfoIntrinsic>(I) && - (!isa<PHINode>(I) || I != BB.begin() || - Ret->getNumOperands() == 0 || - Ret->getOperand(0) != I)) + (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 || + Ret->getOperand(0) != &*I)) continue; } @@ -136,7 +136,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded.
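// For example (a minimal sketch), a block ending in //   br i1 true, label %live, label %dead // is rewritten to an unconditional branch, after which %dead may become // unreachable and be deleted.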
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) { + if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) { LocalChange = true; ++NumSimpl; } @@ -217,6 +217,7 @@ struct CFGSimplifyPass : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index f49f4eaaedcb..64109b2df117 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -48,7 +48,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); @@ -66,7 +66,7 @@ char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) FunctionPass *llvm::createSinkingPass() { return new Sinking(); } @@ -99,7 +99,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, bool Sinking::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool MadeChange, EverMadeChange = false; @@ -119,7 +119,7 @@ bool Sinking::runOnFunction(Function &F) { bool Sinking::ProcessBlock(BasicBlock &BB) { // Can't sink anything out of a block that has less than two successors. - if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; + if (BB.getTerminator()->getNumSuccessors() <= 1) return false; // Don't bother sinking code out of unreachable blocks. In addition to being // unprofitable, it can also lead to infinite looping, because in an @@ -134,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { bool ProcessedBegin = false; SmallPtrSet<Instruction *, 8> Stores; do { - Instruction *Inst = I; // The instruction to sink. + Instruction *Inst = &*I; // The instruction to sink. // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. @@ -165,14 +165,16 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { MemoryLocation Loc = MemoryLocation::get(L); for (Instruction *S : Stores) - if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod) + if (AA->getModRefInfo(S, Loc) & MRI_Mod) return false; } - if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) + if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() || + Inst->mayThrow()) return false; - // Convergent operations can only be moved to control equivalent blocks. + // Convergent operations cannot be made control-dependent on additional + // values. 
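+ // E.g. (hypothetical intrinsic), sinking + //   %v = call i32 @llvm.my.convergent.op(i32 %x) ; convergent + // past a conditional branch into only one successor would make it + // control-dependent on the branch condition, which is exactly what the + // attribute forbids.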
if (auto CS = CallSite(Inst)) { if (CS.hasFnAttr(Attribute::Convergent)) return false; @@ -193,6 +195,11 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, if (Inst->getParent() == SuccToSinkTo) return false; + // It's never legal to sink an instruction into a block which terminates in an + // EH-pad. + if (SuccToSinkTo->getTerminator()->isExceptional()) + return false; + // If the block has multiple predecessors, this would introduce computation // on different code paths. We could split the critical edge, but for now we // just punt. @@ -278,6 +285,6 @@ bool Sinking::SinkInstruction(Instruction *Inst, dbgs() << ")\n"); // Move the instruction. - Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); + Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt()); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ff3f00a2e2f8..147d615488ff 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -227,7 +227,7 @@ bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock, // changes the list that I is iterating through. auto Current = I; ++I; - if (!NotHoisted.count(Current)) { + if (!NotHoisted.count(&*Current)) { Current->moveBefore(ToBlock.getTerminator()); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 6d9d417ef943..1faa65eb3417 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -131,7 +131,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); // We do not modify the shape of the CFG. AU.setPreservesCFG(); @@ -212,7 +212,7 @@ char StraightLineStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) @@ -234,6 +234,7 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, Basis.CandidateKind == C.CandidateKind); } +// TODO: use TTI->getGEPCost. static bool isGEPFoldable(GetElementPtrInst *GEP, const TargetTransformInfo *TTI, const DataLayout *DL) { @@ -523,7 +524,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( continue; const SCEV *OrigIndexExpr = IndexExprs[I - 1]; - IndexExprs[I - 1] = SE->getConstant(OrigIndexExpr->getType(), 0); + IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType()); // The base of this candidate is GEP's base plus the offsets of all // indices except this current one. @@ -689,7 +690,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) { TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Traverse the dominator tree in the depth-first order. 
This order makes sure // all bases of a candidate are in Candidates when we process it. for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT); diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 4f23e20d251d..662513c7d8ae 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -358,13 +358,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { BasicBlock *BB = N->getNodeAs<BasicBlock>(); BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = Term->getSuccessor(i); - - if (Visited.count(Succ)) { + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) Loops[Succ] = BB; - } - } } } @@ -903,14 +899,14 @@ void StructurizeCFG::rebuildSSA() { continue; } - if (DT->dominates(II, User)) + if (DT->dominates(&*II, User)) continue; if (!Initialized) { Value *Undef = UndefValue::get(II->getType()); Updater.Initialize(II->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); - Updater.AddAvailableValue(BB, II); + Updater.AddAvailableValue(BB, &*II); Initialized = true; } Updater.RewriteUseAfterInsertions(U); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index c7de2e2965c7..0e0b00df85bb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" @@ -136,6 +137,7 @@ FunctionPass *llvm::createTailCallEliminationPass() { void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } /// \brief Scan the specified function for alloca instructions. @@ -195,8 +197,8 @@ struct AllocaDerivedValueTracker { case Instruction::Call: case Instruction::Invoke: { CallSite CS(I); - bool IsNocapture = !CS.isCallee(U) && - CS.doesNotCapture(CS.getArgumentNo(U)); + bool IsNocapture = + CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U)); callUsesLocalStack(CS, IsNocapture); if (IsNocapture) { // If the alloca-derived argument is passed in as nocapture, then it @@ -302,7 +304,9 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { if (!CI || CI->isTailCall()) continue; - if (CI->doesNotAccessMemory()) { + bool IsNoTail = CI->isNoTailCall(); + + if (!IsNoTail && CI->doesNotAccessMemory()) { // A call to a readnone function whose arguments are all things computed // outside this function can be marked tail. Even if you stored the // alloca address into a global, a readnone function can't load the @@ -330,7 +334,7 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { } } - if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { DeferredTails.push_back(CI); } else { AllCallsAreTailCalls = false; @@ -404,7 +408,7 @@ bool TailCallElim::runTRE(Function &F) { // Until this is resolved, disable this transformation if that would ever // happen. This bug is PR962. 
for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { - BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB. + BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB. if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, !CanTRETailMarkedCall); @@ -574,7 +578,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI = nullptr; - BasicBlock::iterator BBI = TI; + BasicBlock::iterator BBI(TI); while (true) { CI = dyn_cast<CallInst>(BBI); if (CI && CI->getCalledFunction() == F) @@ -595,9 +599,8 @@ TailCallElim::FindTRECandidate(Instruction *TI, // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && - FirstNonDbg(BB->front()) == CI && - FirstNonDbg(std::next(BB->begin())) == TI && - CI->getCalledFunction() && + FirstNonDbg(BB->front().getIterator()) == CI && + FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. @@ -636,19 +639,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // tail call if all of the instructions between the call and the return are // movable to above the call itself, leaving the call next to the return. // Check that this is the case now. - BasicBlock::iterator BBI = CI; + BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (CanMoveAboveCall(BBI, CI)) continue; + if (CanMoveAboveCall(&*BBI, CI)) continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the // case, and if so, remember the initial accumulator value for later. if ((AccumulatorRecursionEliminationInitVal = - CanTransformAccumulatorRecursion(BBI, CI))) { + CanTransformAccumulatorRecursion(&*BBI, CI))) { // Yes, this is accumulator recursion. Remember which instruction // accumulates. - AccumulatorRecursionInstr = BBI; + AccumulatorRecursionInstr = &*BBI; } else { return false; // Otherwise, we cannot eliminate the tail recursion! } @@ -698,19 +701,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, NEBI = NewEntry->begin(); OEBI != E; ) if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) if (isa<ConstantInt>(AI->getArraySize())) - AI->moveBefore(NEBI); + AI->moveBefore(&*NEBI); // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. // For now, we initialize each PHI to only have the real arguments // which are passed in. - Instruction *InsertPos = OldEntry->begin(); + Instruction *InsertPos = &OldEntry->front(); for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos); I->replaceAllUsesWith(PN); // Everyone use the PHI node now! 
- PN->addIncoming(I, NewEntry); + PN->addIncoming(&*I, NewEntry); ArgumentPHIs.push_back(PN); } } @@ -739,10 +742,9 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, Instruction *AccRecInstr = AccumulatorRecursionInstr; // Start by inserting a new PHI node for the accumulator. pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry); - PHINode *AccPN = - PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), - std::distance(PB, PE) + 1, - "accumulator.tr", OldEntry->begin()); + PHINode *AccPN = PHINode::Create( + AccumulatorRecursionEliminationInitVal->getType(), + std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front()); // Loop over all of the predecessors of the tail recursion block. For the // real entry into the function we seed the PHI with the initial value, diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp index 03c3a80170a3..409326eba401 100644 --- a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include <algorithm> namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index e9f62391a44f..0262358fa3d5 100644 --- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -52,32 +52,34 @@ // http://wiki.dwarfstd.org/index.php?title=Path_Discriminators //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "add-discriminators" namespace { - struct AddDiscriminators : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - AddDiscriminators() : FunctionPass(ID) { - initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); - } +struct AddDiscriminators : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + AddDiscriminators() : FunctionPass(ID) { + initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; - }; + bool runOnFunction(Function &F) override; +}; } char AddDiscriminators::ID = 0; @@ -89,17 +91,17 @@ INITIALIZE_PASS_END(AddDiscriminators, "add-discriminators", // Command line option to disable discriminator generation even in the // presence of debug information. This is only needed when debugging // debug info generation issues. 
-static cl::opt<bool> -NoDiscriminators("no-discriminators", cl::init(false), - cl::desc("Disable generation of discriminator information.")); +static cl::opt<bool> NoDiscriminators( + "no-discriminators", cl::init(false), + cl::desc("Disable generation of discriminator information.")); FunctionPass *llvm::createAddDiscriminatorsPass() { return new AddDiscriminators(); } static bool hasDebugInfo(const Function &F) { - NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu"); - return CUNodes != nullptr; + DISubprogram *S = getDISubprogram(&F); + return S != nullptr; } /// \brief Assign DWARF discriminators. @@ -159,8 +161,7 @@ bool AddDiscriminators::runOnFunction(Function &F) { // Similarly, if the function has no debug info, do nothing. // Finally, if this module is built with dwarf versions earlier than 4, // do nothing (discriminator support is a DWARF 4 feature). - if (NoDiscriminators || - !hasDebugInfo(F) || + if (NoDiscriminators || !hasDebugInfo(F) || F.getParent()->getDwarfVersion() < 4) return false; @@ -169,59 +170,77 @@ bool AddDiscriminators::runOnFunction(Function &F) { LLVMContext &Ctx = M->getContext(); DIBuilder Builder(*M, /*AllowUnresolved*/ false); - // Traverse all the blocks looking for instructions in different - // blocks that are at the same file:line location. - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *B = I; - TerminatorInst *Last = B->getTerminator(); - const DILocation *LastDIL = Last->getDebugLoc(); - if (!LastDIL) - continue; - - for (unsigned I = 0; I < Last->getNumSuccessors(); ++I) { - BasicBlock *Succ = Last->getSuccessor(I); - Instruction *First = Succ->getFirstNonPHIOrDbgOrLifetime(); - const DILocation *FirstDIL = First->getDebugLoc(); - if (!FirstDIL) + typedef std::pair<StringRef, unsigned> Location; + typedef DenseMap<const BasicBlock *, Metadata *> BBScopeMap; + typedef DenseMap<Location, BBScopeMap> LocationBBMap; + + LocationBBMap LBM; + + // Traverse all instructions in the function. If the source line location + // of the instruction also appears in another basic block, assign a new + // discriminator to this instruction. + for (BasicBlock &B : F) { + for (auto &I : B.getInstList()) { + if (isa<DbgInfoIntrinsic>(&I)) + continue; + const DILocation *DIL = I.getDebugLoc(); + if (!DIL) + continue; + Location L = std::make_pair(DIL->getFilename(), DIL->getLine()); + auto &BBMap = LBM[L]; + auto R = BBMap.insert(std::make_pair(&B, (Metadata *)nullptr)); + if (BBMap.size() == 1) + continue; + bool InsertSuccess = R.second; + Metadata *&NewScope = R.first->second; + // If we just inserted this block for a location that already has + // another block, a discriminator is needed to distinguish the two. + if (InsertSuccess) { + auto *Scope = DIL->getScope(); + auto *File = + Builder.createFile(DIL->getFilename(), Scope->getDirectory()); + NewScope = Builder.createLexicalBlockFile( + Scope, File, DIL->computeNewDiscriminator()); + } + I.setDebugLoc(DILocation::get(Ctx, DIL->getLine(), DIL->getColumn(), + NewScope, DIL->getInlinedAt())); + DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" + << DIL->getColumn() << ":" + << dyn_cast<DILexicalBlockFile>(NewScope)->getDiscriminator() + << I << "\n"); + Changed = true; + } + } + + // Traverse all instructions and assign new discriminators to call + // instructions that share a line number within the same basic block. + // Sample-based profiling needs to distinguish different function calls + // within the same source line for correct profile annotation.
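+ // For example (hypothetical source), with two calls on one line, + //   x = f(); y = f();   // both attributed to foo.c:7 + // the second call gets its own discriminator so that the two call sites + // keep separate sample counts.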
+ for (BasicBlock &B : F) { + const DILocation *FirstDIL = NULL; + for (auto &I : B.getInstList()) { + CallInst *Current = dyn_cast<CallInst>(&I); + if (!Current || isa<DbgInfoIntrinsic>(&I)) continue; - // If the first instruction (First) of Succ is at the same file - // location as B's last instruction (Last), add a new - // discriminator for First's location and all the instructions - // in Succ that share the same location with First. - if (!FirstDIL->canDiscriminate(*LastDIL)) { - // Create a new lexical scope and compute a new discriminator - // number for it. - StringRef Filename = FirstDIL->getFilename(); - auto *Scope = FirstDIL->getScope(); - auto *File = Builder.createFile(Filename, Scope->getDirectory()); - - // FIXME: Calculate the discriminator here, based on local information, - // and delete DILocation::computeNewDiscriminator(). The current - // solution gives different results depending on other modules in the - // same context. All we really need is to discriminate between - // FirstDIL and LastDIL -- a local map would suffice. - unsigned Discriminator = FirstDIL->computeNewDiscriminator(); - auto *NewScope = - Builder.createLexicalBlockFile(Scope, File, Discriminator); - auto *NewDIL = - DILocation::get(Ctx, FirstDIL->getLine(), FirstDIL->getColumn(), - NewScope, FirstDIL->getInlinedAt()); - DebugLoc newDebugLoc = NewDIL; - - // Attach this new debug location to First and every - // instruction following First that shares the same location. - for (BasicBlock::iterator I1(*First), E1 = Succ->end(); I1 != E1; - ++I1) { - if (I1->getDebugLoc().get() != FirstDIL) - break; - I1->setDebugLoc(newDebugLoc); - DEBUG(dbgs() << NewDIL->getFilename() << ":" << NewDIL->getLine() - << ":" << NewDIL->getColumn() << ":" - << NewDIL->getDiscriminator() << *I1 << "\n"); + DILocation *CurrentDIL = Current->getDebugLoc(); + if (FirstDIL) { + if (CurrentDIL && CurrentDIL->getLine() == FirstDIL->getLine() && + CurrentDIL->getFilename() == FirstDIL->getFilename()) { + auto *Scope = FirstDIL->getScope(); + auto *File = Builder.createFile(FirstDIL->getFilename(), + Scope->getDirectory()); + auto *NewScope = Builder.createLexicalBlockFile( + Scope, File, FirstDIL->computeNewDiscriminator()); + Current->setDebugLoc(DILocation::get( + Ctx, CurrentDIL->getLine(), CurrentDIL->getColumn(), NewScope, + CurrentDIL->getInlinedAt())); + Changed = true; + } else { + FirstDIL = CurrentDIL; } - DEBUG(dbgs() << "\n"); - Changed = true; + } else { + FirstDIL = CurrentDIL; } } } diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index ef7dacac79cb..a5137e933e83 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -41,8 +41,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { // Loop through all of our successors and make sure they know that one // of their predecessors is going away. - for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) - BBTerm->getSuccessor(i)->removePredecessor(BB); + for (BasicBlock *Succ : BBTerm->successors()) + Succ->removePredecessor(BB); // Zap all the instructions in the block. while (!BB->empty()) { @@ -65,7 +65,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { /// any single-entry PHI nodes in it, fold them away. This handles the case /// when all entries to the PHI nodes in a block are guaranteed equal, such as /// when the block has exactly one predecessor. 
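/// For example (a minimal sketch), with a single predecessor %pred, ///   %p = phi i32 [ %v, %pred ] /// is trivially %v, so its uses are replaced with %v and the PHI is erased.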
-void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, +void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, MemoryDependenceAnalysis *MemDep) { if (!isa<PHINode>(BB->begin())) return; @@ -77,8 +77,6 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, if (MemDep) MemDep->removeInstruction(PN); // Memdep updates AA itself. - else if (AA && isa<PointerType>(PN->getType())) - AA->deleteValue(PN); PN->eraseFromParent(); } @@ -108,7 +106,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor, /// if possible. The return value indicates success or failure. bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, - LoopInfo *LI, AliasAnalysis *AA, + LoopInfo *LI, MemoryDependenceAnalysis *MemDep) { // Don't merge away blocks who have their address taken. if (BB->hasAddressTaken()) return false; @@ -119,8 +117,9 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, // Don't break self-loops. if (PredBB == BB) return false; - // Don't break invokes. - if (isa<InvokeInst>(PredBB->getTerminator())) return false; + // Don't break unwinding instructions. + if (PredBB->getTerminator()->isExceptional()) + return false; succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB)); BasicBlock *OnlySucc = BB; @@ -145,7 +144,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, // Begin by getting rid of unneeded PHIs. if (isa<PHINode>(BB->front())) - FoldSingleEntryPHINodes(BB, AA, MemDep); + FoldSingleEntryPHINodes(BB, MemDep); // Delete the unconditional branch from the predecessor... PredBB->getInstList().pop_back(); @@ -253,7 +252,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, // block. assert(SP == BB && "CFG broken"); SP = nullptr; - return SplitBlock(Succ, Succ->begin(), DT, LI); + return SplitBlock(Succ, &Succ->front(), DT, LI); } // Otherwise, if BB has a single successor, split it at the bottom of the @@ -284,8 +283,8 @@ llvm::SplitAllCriticalEdges(Function &F, /// BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI) { - BasicBlock::iterator SplitIt = SplitPt; - while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt)) + BasicBlock::iterator SplitIt = SplitPt->getIterator(); + while (isa<PHINode>(SplitIt) || SplitIt->isEHPad()) ++SplitIt; BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); @@ -393,7 +392,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, /// from NewBB. This also updates AliasAnalysis, if available. static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, ArrayRef<BasicBlock *> Preds, BranchInst *BI, - AliasAnalysis *AA, bool HasLoopExit) { + bool HasLoopExit) { // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB. SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end()); for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) { @@ -474,17 +473,20 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, /// BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, - const char *Suffix, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA) { + const char *Suffix, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { + // Do not attempt to split that which cannot be split. 
+ if (!BB->canSplitPredecessors()) + return nullptr; + // For the landingpads we need to act a bit differently. // Delegate this work to the SplitLandingPadPredecessors. if (BB->isLandingPad()) { SmallVector<BasicBlock*, 2> NewBBs; std::string NewName = std::string(Suffix) + ".split-lp"; - SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), - NewBBs, AA, DT, LI, PreserveLCSSA); + SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs, DT, + LI, PreserveLCSSA); return NewBBs[0]; } @@ -523,7 +525,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, HasLoopExit); // Update the PHI nodes in BB with the values coming from NewBB. - UpdatePHINodes(BB, NewBB, Preds, BI, AA, HasLoopExit); + UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit); return NewBB; } @@ -544,8 +546,8 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef<BasicBlock *> Preds, const char *Suffix1, const char *Suffix2, SmallVectorImpl<BasicBlock *> &NewBBs, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA) { + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!"); // Create a new basic block for OrigBB's predecessors listed in Preds. Insert @@ -574,7 +576,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB1. - UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, AA, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit); // Move the remaining edges from OrigBB to point to NewBB2. SmallVector<BasicBlock*, 8> NewBB2Preds; @@ -611,7 +613,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, PreserveLCSSA, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB2. - UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, AA, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit); } LandingPadInst *LPad = OrigBB->getLandingPadInst(); @@ -661,7 +663,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, // return instruction. 
V = BCI->getOperand(0); NewBC = BCI->clone(); - Pred->getInstList().insert(NewRet, NewBC); + Pred->getInstList().insert(NewRet->getIterator(), NewBC); *i = NewBC; } if (PHINode *PN = dyn_cast<PHINode>(V)) { @@ -707,7 +709,7 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond, MDNode *BranchWeights, DominatorTree *DT) { BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); TerminatorInst *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); @@ -757,7 +759,7 @@ void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore, TerminatorInst **ElseTerm, MDNode *BranchWeights) { BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); TerminatorInst *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 7e83c9eeceb7..95825991cee9 100644 --- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -101,10 +101,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, continue; // Otherwise a new PHI is needed. Create one and populate it. - PHINode *NewPN = - PHINode::Create(PN->getType(), Preds.size(), "split", - SplitBB->isLandingPad() ? - SplitBB->begin() : SplitBB->getTerminator()); + PHINode *NewPN = PHINode::Create( + PN->getType(), Preds.size(), "split", + SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator()); for (unsigned i = 0, e = Preds.size(); i != e; ++i) NewPN->addIncoming(V, Preds[i]); @@ -141,9 +140,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); - // Splitting the critical edge to a landing pad block is non-trivial. Don't do + // Splitting the critical edge to a pad block is non-trivial. Don't do // it in this generic function. - if (DestBB->isLandingPad()) return nullptr; + if (DestBB->isEHPad()) return nullptr; // Create a new basic block, linking it into the CFG. BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), @@ -157,7 +156,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // Insert the block into the function... right after the block TI lives in. Function &F = *TIBB->getParent(); - Function::iterator FBBI = TIBB; + Function::iterator FBBI = TIBB->getIterator(); F.getBasicBlockList().insert(++FBBI, NewBB); // If there are any PHI nodes in DestBB, we need to update them so that they @@ -197,7 +196,6 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } // If we have nothing to update, just return. 
- auto *AA = Options.AA; auto *DT = Options.DT; auto *LI = Options.LI; if (!DT && !LI) @@ -319,10 +317,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, LoopPreds.push_back(P); } if (!LoopPreds.empty()) { - assert(!DestBB->isLandingPad() && - "We don't split edges to landing pads!"); + assert(!DestBB->isEHPad() && "We don't split edges to EH pads!"); BasicBlock *NewExitBB = SplitBlockPredecessors( - DestBB, LoopPreds, "split", AA, DT, LI, Options.PreserveLCSSA); + DestBB, LoopPreds, "split", DT, LI, Options.PreserveLCSSA); if (Options.PreserveLCSSA) createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB); } diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 8aa7b2a65ba9..64b44a6b7919 100644 --- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -21,7 +22,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; @@ -55,32 +55,6 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, return CI; } -/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the -/// specified pointer. Ptr is required to be some pointer type, MaxLen must -/// be of size_t type, and the return value has 'intptr_t' type. -Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, - const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc::strnlen)) - return nullptr; - - Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeSet AS[2]; - AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); - Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); - - LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrNLen = - M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), AS), - DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), nullptr); - CallInst *CI = B.CreateCall(StrNLen, {CastToCStr(Ptr, B), MaxLen}, "strnlen"); - if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; -} - /// EmitStrChr - Emit a call to the strchr function to the builder, for the /// specified pointer and character. Ptr is required to be some pointer type, /// and the return value has 'i8*' type. 
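/// For example (a hypothetical call site), with an IRBuilder B and a /// TargetLibraryInfo TLI in scope, ///   Value *Pos = EmitStrChr(Str, 'a', B, TLI); /// emits something like "%strchr = call i8* @strchr(i8* %str, i32 97)", or /// returns null if strchr is not available in this TLI.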
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index f2d5e0745035..0914699a2e38 100644 --- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -82,7 +82,7 @@ static bool insertFastDiv(Function &F, bool UseSignedOp, DivCacheTy &PerBBDivCache) { // Get instruction operands - Instruction *Instr = J; + Instruction *Instr = &*J; Value *Dividend = Instr->getOperand(0); Value *Divisor = Instr->getOperand(1); @@ -94,7 +94,7 @@ static bool insertFastDiv(Function &F, } // Basic Block is split before divide - BasicBlock *MainBB = I; + BasicBlock *MainBB = &*I; BasicBlock *SuccessorBB = I->splitBasicBlock(J); ++I; //advance iterator I to successorBB @@ -190,7 +190,7 @@ static bool reuseOrInsertFastDiv(Function &F, bool UseSignedOp, DivCacheTy &PerBBDivCache) { // Get instruction operands - Instruction *Instr = J; + Instruction *Instr = &*J; DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1)); DivCacheTy::iterator CacheI = PerBBDivCache.find(Key); diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index cc4d6c6fb192..854a3b855f54 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -52,8 +52,8 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); - VMap[II] = NewInst; // Add instruction map to value. - + VMap[&*II] = NewInst; // Add instruction map to value. + hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) @@ -85,9 +85,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG - for (Function::const_arg_iterator I = OldFunc->arg_begin(), - E = OldFunc->arg_end(); I != E; ++I) - assert(VMap.count(I) && "No mapping from source argument specified!"); + for (const Argument &I : OldFunc->args()) + assert(VMap.count(&I) && "No mapping from source argument specified!"); #endif // Copy all attributes other than those stored in the AttributeSet. We need @@ -96,6 +95,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); + // Fix up the personality function that got copied over. + if (OldFunc->hasPersonalityFn()) + NewFunc->setPersonalityFn( + MapValue(OldFunc->getPersonalityFn(), VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + AttributeSet OldAttrs = OldFunc->getAttributes(); // Clone any argument attributes that are present in the VMap. for (const Argument &OldArg : OldFunc->args()) @@ -136,7 +142,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, if (BB.hasAddressTaken()) { Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc), const_cast<BasicBlock*>(&BB)); - VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); } // Note return instructions for the caller. @@ -146,11 +152,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // Loop over all of the instructions in the function, fixing up operand // references as we go. 
This uses VMap to do all the hard work. - for (Function::iterator BB = cast<BasicBlock>(VMap[OldFunc->begin()]), - BE = NewFunc->end(); BB != BE; ++BB) + for (Function::iterator BB = + cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(), + BE = NewFunc->end(); + BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... - for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) - RemapInstruction(II, VMap, + for (Instruction &II : *BB) + RemapInstruction(&II, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); } @@ -187,11 +195,9 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, const DISubprogram *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder); if (!OldSubprogramMDNode) return; - // Ensure that OldFunc appears in the map. - // (if it's already there it must point to NewFunc anyway) - VMap[OldFunc] = NewFunc; auto *NewSubprogram = cast<DISubprogram>(MapMetadata(OldSubprogramMDNode, VMap)); + NewFunc->setSubprogram(NewSubprogram); for (auto *CU : Finder.compile_units()) { auto Subprograms = CU->getSubprograms(); @@ -222,10 +228,9 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, // The user might be deleting arguments to the function by specifying them in // the VMap. If so, we need to not add the arguments to the arg ty vector // - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - if (VMap.count(I) == 0) // Haven't mapped the argument to anything yet? - ArgTypes.push_back(I->getType()); + for (const Argument &I : F->args()) + if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet? + ArgTypes.push_back(I.getType()); // Create a new function type... FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(), @@ -236,11 +241,10 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, // Loop over the arguments, copying the names of the mapped arguments over... Function::arg_iterator DestI = NewF->arg_begin(); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - if (VMap.count(I) == 0) { // Is this argument preserved? - DestI->setName(I->getName()); // Copy the name over... - VMap[I] = DestI++; // Add mapping to VMap + for (const Argument & I : F->args()) + if (VMap.count(&I) == 0) { // Is this argument preserved? + DestI->setName(I.getName()); // Copy the name over... + VMap[&I] = &*DestI++; // Add mapping to VMap } if (ModuleLevelChanges) @@ -330,8 +334,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, II != IE; ++II) { // If the "Director" remaps the instruction, don't clone it. if (Director) { - CloningDirector::CloningAction Action - = Director->handleInstruction(VMap, II, NewBB); + CloningDirector::CloningAction Action = + Director->handleInstruction(VMap, &*II, NewBB); // If the cloning director says stop, we want to stop everything, not // just break out of the loop (which would cause the terminator to be // cloned). The cloning director is responsible for inserting a proper @@ -365,7 +369,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (Value *MappedV = VMap.lookup(V)) V = MappedV; - VMap[II] = V; + VMap[&*II] = V; delete NewInst; continue; } @@ -373,9 +377,15 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); - VMap[II] = NewInst; // Add instruction map to value. 
+ VMap[&*II] = NewInst; // Add instruction map to value. NewBB->getInstList().push_back(NewInst); hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); + + if (CodeInfo) + if (auto CS = ImmutableCallSite(&*II)) + if (CS.hasOperandBundles()) + CodeInfo->OperandBundleCallSites.push_back(NewInst); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) hasStaticAllocas = true; @@ -400,8 +410,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // If the director says to skip with a terminate instruction, we still // need to clone this block's successors. const TerminatorInst *TI = NewBB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - ToClone.push_back(TI->getSuccessor(i)); + for (const BasicBlock *Succ : TI->successors()) + ToClone.push_back(Succ); return; } assert(Action != CloningDirector::SkipInstruction && @@ -447,11 +457,16 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, NewInst->setName(OldTI->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); VMap[OldTI] = NewInst; // Add instruction map to value. - + + if (CodeInfo) + if (auto CS = ImmutableCallSite(OldTI)) + if (CS.hasOperandBundles()) + CodeInfo->OperandBundleCallSites.push_back(NewInst); + // Recursively clone any reachable successor blocks. const TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - ToClone.push_back(TI->getSuccessor(i)); + for (const BasicBlock *Succ : TI->successors()) + ToClone.push_back(Succ); } if (CodeInfo) { @@ -484,12 +499,11 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, } #ifndef NDEBUG - // If the cloning starts at the begining of the function, verify that + // If the cloning starts at the beginning of the function, verify that // the function arguments are mapped. if (!StartingInst) - for (Function::const_arg_iterator II = OldFunc->arg_begin(), - E = OldFunc->arg_end(); II != E; ++II) - assert(VMap.count(II) && "No mapping from source argument specified!"); + for (const Argument &II : OldFunc->args()) + assert(VMap.count(&II) && "No mapping from source argument specified!"); #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, @@ -499,12 +513,12 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, StartingBB = StartingInst->getParent(); else { StartingBB = &OldFunc->getEntryBlock(); - StartingInst = StartingBB->begin(); + StartingInst = &StartingBB->front(); } // Clone the entry block, and anything recursively reachable from it. std::vector<const BasicBlock*> CloneWorklist; - PFC.CloneBlock(StartingBB, StartingInst, CloneWorklist); + PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist); while (!CloneWorklist.empty()) { const BasicBlock *BB = CloneWorklist.back(); CloneWorklist.pop_back(); @@ -517,9 +531,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // // Defer PHI resolution until rest of function is resolved. SmallVector<const PHINode*, 16> PHIToResolve; - for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end(); - BI != BE; ++BI) { - Value *V = VMap[BI]; + for (const BasicBlock &BI : *OldFunc) { + Value *V = VMap[&BI]; BasicBlock *NewBB = cast_or_null<BasicBlock>(V); if (!NewBB) continue; // Dead block. 
@@ -528,7 +541,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Handle PHI nodes specially, as we have to remove references to dead // blocks. - for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) { + for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) { // PHI nodes may have been remapped to non-PHI nodes by the caller or // during the cloning process. if (const PHINode *PN = dyn_cast<PHINode>(I)) { @@ -621,8 +634,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, while ((PN = dyn_cast<PHINode>(I++))) { Value *NV = UndefValue::get(PN->getType()); PN->replaceAllUsesWith(NV); - assert(VMap[OldI] == PN && "VMap mismatch"); - VMap[OldI] = NV; + assert(VMap[&*OldI] == PN && "VMap mismatch"); + VMap[&*OldI] = NV; PN->eraseFromParent(); ++OldI; } @@ -644,15 +657,15 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // and zap unconditional fall-through branches. This happens all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]); + Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator(); Function::iterator I = Begin; while (I != NewFunc->end()) { // Check if this block has become dead during inlining or other // simplifications. Note that the first block will appear dead, as it has // not yet been wired up properly. - if (I != Begin && (pred_begin(I) == pred_end(I) || - I->getSinglePredecessor() == I)) { - BasicBlock *DeadBB = I++; + if (I != Begin && (pred_begin(&*I) == pred_end(&*I) || + I->getSinglePredecessor() == &*I)) { + BasicBlock *DeadBB = &*I++; DeleteDeadBlock(DeadBB); continue; } @@ -662,7 +675,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // simplification required looking through PHI nodes, those are only // available after forming the full basic block. That may leave some here, // and we still want to prune the dead code as early as possible. - ConstantFoldTerminator(I); + ConstantFoldTerminator(&*I); BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator()); if (!BI || BI->isConditional()) { ++I; continue; } @@ -681,7 +694,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, BI->eraseFromParent(); // Make all PHI nodes that referred to Dest now refer to I as their source. - Dest->replaceAllUsesWith(I); + Dest->replaceAllUsesWith(&*I); // Move all the instructions in the succ to the pred. I->getInstList().splice(I->end(), Dest->getInstList()); @@ -695,7 +708,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Make a final pass over the basic blocks from the old function to gather // any return instructions which survived folding. We have to do this here // because we can iteratively remove and merge returns above. 
- for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]), + for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(), E = NewFunc->end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) @@ -717,7 +730,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall) { - CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(), VMap, + CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap, ModuleLevelChanges, Returns, NameSuffix, CodeInfo, nullptr); } @@ -780,9 +793,10 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, } // Move them physically from the end of the block list. - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH); - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), - NewLoop->getHeader(), F->end()); + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewPH); + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewLoop->getHeader()->getIterator(), F->end()); return NewLoop; } diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp index 61f1811e7b4a..ab083353ece6 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -20,21 +20,28 @@ #include "llvm-c/Core.h" using namespace llvm; -/// CloneModule - Return an exact copy of the specified module. This is not as -/// easy as it might seem because we have to worry about making copies of global -/// variables and functions, and making their (initializers and references, -/// respectively) refer to the right globals. +/// This is not as easy as it might seem because we have to worry about making +/// copies of global variables and functions, and making their (initializers and +/// references, respectively) refer to the right globals. /// -Module *llvm::CloneModule(const Module *M) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M) { // Create the value map that maps things from the old module over to the new // module. ValueToValueMapTy VMap; return CloneModule(M, VMap); } -Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M, + ValueToValueMapTy &VMap) { + return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; }); +} + +std::unique_ptr<Module> llvm::CloneModule( + const Module *M, ValueToValueMapTy &VMap, + std::function<bool(const GlobalValue *)> ShouldCloneDefinition) { // First off, we need to create the new module. 
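// A typical use (hypothetical) clones the whole module: //   ValueToValueMapTy VMap; //   std::unique_ptr<Module> Copy = CloneModule(M, VMap); // The ShouldCloneDefinition predicate lets callers keep selected // definitions as external references instead.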
- Module *New = new Module(M->getModuleIdentifier(), M->getContext()); + std::unique_ptr<Module> New = + llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext()); New->setDataLayout(M->getDataLayout()); New->setTargetTriple(M->getTargetTriple()); New->setModuleInlineAsm(M->getModuleInlineAsm()); @@ -52,26 +59,48 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { (GlobalVariable*) nullptr, I->getThreadLocalMode(), I->getType()->getAddressSpace()); - GV->copyAttributesFrom(I); - VMap[I] = GV; + GV->copyAttributesFrom(&*I); + VMap[&*I] = GV; } // Loop over the functions in the module, making external functions as before for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { Function *NF = - Function::Create(cast<FunctionType>(I->getType()->getElementType()), - I->getLinkage(), I->getName(), New); - NF->copyAttributesFrom(I); - VMap[I] = NF; + Function::Create(cast<FunctionType>(I->getType()->getElementType()), + I->getLinkage(), I->getName(), New.get()); + NF->copyAttributesFrom(&*I); + VMap[&*I] = NF; } // Loop over the aliases in the module for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - auto *PTy = cast<PointerType>(I->getType()); - auto *GA = GlobalAlias::create(PTy, I->getLinkage(), I->getName(), New); - GA->copyAttributesFrom(I); - VMap[I] = GA; + if (!ShouldCloneDefinition(&*I)) { + // An alias cannot act as an external reference, so we need to create + // either a function or a global variable depending on the value type. + // FIXME: Once pointee types are gone we can probably pick one or the + // other. + GlobalValue *GV; + if (I->getValueType()->isFunctionTy()) + GV = Function::Create(cast<FunctionType>(I->getValueType()), + GlobalValue::ExternalLinkage, I->getName(), + New.get()); + else + GV = new GlobalVariable( + *New, I->getValueType(), false, GlobalValue::ExternalLinkage, + (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr, + I->getThreadLocalMode(), I->getType()->getAddressSpace()); + VMap[&*I] = GV; + // We do not copy attributes (mainly because copying between different + // kinds of globals is forbidden), but this is generally not required for + // correctness. + continue; + } + auto *GA = GlobalAlias::create(I->getValueType(), + I->getType()->getPointerAddressSpace(), + I->getLinkage(), I->getName(), New.get()); + GA->copyAttributesFrom(&*I); + VMap[&*I] = GA; } // Now that all of the things that global variable initializer can refer to @@ -80,7 +109,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); I != E; ++I) { - GlobalVariable *GV = cast<GlobalVariable>(VMap[I]); + GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]); + if (!ShouldCloneDefinition(&*I)) { + // Skip after setting the correct linkage for an external reference. + GV->setLinkage(GlobalValue::ExternalLinkage); + continue; + } if (I->hasInitializer()) GV->setInitializer(MapValue(I->getInitializer(), VMap)); } @@ -88,18 +122,22 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // Similarly, copy over function bodies now... // for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { - Function *F = cast<Function>(VMap[I]); + Function *F = cast<Function>(VMap[&*I]); + if (!ShouldCloneDefinition(&*I)) { + // Skip after setting the correct linkage for an external reference. 
+ F->setLinkage(GlobalValue::ExternalLinkage); + continue; + } if (!I->isDeclaration()) { Function::arg_iterator DestI = F->arg_begin(); for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end(); ++J) { DestI->setName(J->getName()); - VMap[J] = DestI++; + VMap[&*J] = &*DestI++; } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns); - + CloneFunctionInto(F, &*I, VMap, /*ModuleLevelChanges=*/true, Returns); } if (I->hasPersonalityFn()) @@ -109,7 +147,10 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // And aliases for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); + // We already dealt with undefined aliases above. + if (!ShouldCloneDefinition(&*I)) + continue; + GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]); if (const Constant *C = I->getAliasee()) GA->setAliasee(MapValue(C, VMap)); } @@ -129,7 +170,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { extern "C" { LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) { - return wrap(CloneModule(unwrap(M))); + return wrap(CloneModule(unwrap(M)).release()); } } diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp index ab89b41f6788..823696d88e65 100644 --- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -51,7 +51,7 @@ AggregateArgsOpt("aggregate-extracted-args", cl::Hidden, /// \brief Test whether a block is valid for extraction. static bool isBlockValidForExtraction(const BasicBlock &BB) { // Landing pads must be in the function where they were inserted for cleanup. - if (BB.isLandingPad()) + if (BB.isEHPad()) return false; // Don't hoist code containing allocas, invokes, or vastarts. @@ -175,7 +175,7 @@ void CodeExtractor::findInputsOutputs(ValueSet &Inputs, for (User *U : II->users()) if (!definedInRegion(Blocks, U)) { - Outputs.insert(II); + Outputs.insert(&*II); break; } } @@ -211,7 +211,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // containing PHI nodes merging values from outside of the region, and a // second that contains all of the code for the block and merges back any // incoming values from inside of the region. - BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI(); + BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI()->getIterator(); BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs, Header->getName()+".ce"); @@ -246,7 +246,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Create a new PHI node in the new region, which has an incoming value // from OldPred of PN. PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion, - PN->getName()+".ce", NewBB->begin()); + PN->getName() + ".ce", &NewBB->front()); NewPN->addIncoming(PN, OldPred); // Loop over all of the incoming value in PN, moving them to NewPN if they @@ -266,7 +266,8 @@ void CodeExtractor::splitReturnBlocks() { for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { - BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret"); + BasicBlock *New = + (*I)->splitBasicBlock(RI->getIterator(), (*I)->getName() + ".ret"); if (DT) { // Old dominates New. New node dominates all other nodes dominated // by Old. 
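For orientation, a sketch of how a client might drive the new ShouldCloneDefinition overload of CloneModule added above; the predicate shown (clone only non-local function definitions) is an invented example, not something this commit does:

    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/Cloning.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"
    using namespace llvm;

    // Definitions failing the predicate stay behind as external references
    // in the clone, per the linkage fixups in the hunks above.
    std::unique_ptr<Module> cloneInterfaceOnly(const Module &M) {
      ValueToValueMapTy VMap;
      return CloneModule(&M, VMap, [](const GlobalValue *GV) {
        return isa<Function>(GV) && !GV->hasLocalLinkage();
      });
    }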
@@ -365,10 +366,10 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); TerminatorInst *TI = newFunction->begin()->getTerminator(); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructTy, AI, Idx, "gep_" + inputs[i]->getName(), TI); + StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI); RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI); } else - RewriteVal = AI++; + RewriteVal = &*AI++; std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end()); for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); @@ -440,8 +441,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, StructValues.push_back(*i); } else { AllocaInst *alloca = - new AllocaInst((*i)->getType(), nullptr, (*i)->getName()+".loc", - codeReplacer->getParent()->begin()->begin()); + new AllocaInst((*i)->getType(), nullptr, (*i)->getName() + ".loc", + &codeReplacer->getParent()->front().front()); ReloadOutputs.push_back(alloca); params.push_back(alloca); } @@ -457,9 +458,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // Allocate a struct at the beginning of this function StructArgTy = StructType::get(newFunction->getContext(), ArgTypes); - Struct = - new AllocaInst(StructArgTy, nullptr, "structArg", - codeReplacer->getParent()->begin()->begin()); + Struct = new AllocaInst(StructArgTy, nullptr, "structArg", + &codeReplacer->getParent()->front().front()); params.push_back(Struct); for (unsigned i = 0, e = inputs.size(); i != e; ++i) { @@ -566,8 +566,12 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, bool DominatesDef = true; - if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) { - DefBlock = Invoke->getNormalDest(); + BasicBlock *NormalDest = nullptr; + if (auto *Invoke = dyn_cast<InvokeInst>(outputs[out])) + NormalDest = Invoke->getNormalDest(); + + if (NormalDest) { + DefBlock = NormalDest; // Make sure we are looking at the original successor block, not // at a newly inserted exit block, which won't be in the dominator @@ -606,11 +610,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut+out); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, OAI, Idx, "gep_" + outputs[out]->getName(), + StructArgTy, &*OAI, Idx, "gep_" + outputs[out]->getName(), NTRet); new StoreInst(outputs[out], GEP, NTRet); } else { - new StoreInst(outputs[out], OAI, NTRet); + new StoreInst(outputs[out], &*OAI, NTRet); } } // Advance output iterator even if we don't emit a store diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp index dc95089cd2ca..b56ff684e8a8 100644 --- a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -50,7 +50,7 @@ void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) { GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", GCL->getThreadLocalMode()); - GCL->getParent()->getGlobalList().insert(GCL, NGV); + GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV); NGV->takeName(GCL); // Nuke the old list, replacing any uses with the new one. 
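The CodeExtractor hunks above are again iterator/pointer bridging; the utility's entry points are unchanged. Roughly how it is driven (a sketch under those assumptions; the wrapper function is hypothetical):

    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/CodeExtractor.h"
    using namespace llvm;

    // Outline a single-entry region of blocks into a new function; callers of
    // the region are rewritten in place to call the outlined function.
    Function *outlineRegion(ArrayRef<BasicBlock *> Region, DominatorTree &DT) {
      CodeExtractor CE(Region, &DT, /*AggregateArgs=*/false);
      return CE.isEligible() ? CE.extractCodeRegion() : nullptr;
    }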
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 003da58ee798..75a1dde57c4c 100644 --- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -35,8 +35,8 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, I.getName()+".reg2mem", AllocaPoint); } else { Function *F = I.getParent()->getParent(); - Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", - F->getEntryBlock().begin()); + Slot = new AllocaInst(I.getType(), nullptr, I.getName() + ".reg2mem", + &F->getEntryBlock().front()); } // We cannot demote invoke instructions to the stack if their normal edge @@ -89,16 +89,15 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, // AFTER the terminator instruction. BasicBlock::iterator InsertPt; if (!isa<TerminatorInst>(I)) { - InsertPt = &I; - ++InsertPt; - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + InsertPt = ++I.getIterator(); + for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt) /* empty */; // Don't insert before PHI nodes or landingpad instrs. } else { InvokeInst &II = cast<InvokeInst>(I); InsertPt = II.getNormalDest()->getFirstInsertionPt(); } - new StoreInst(&I, Slot, InsertPt); + new StoreInst(&I, Slot, &*InsertPt); return Slot; } @@ -118,8 +117,8 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { P->getName()+".reg2mem", AllocaPoint); } else { Function *F = P->getParent()->getParent(); - Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", - F->getEntryBlock().begin()); + Slot = new AllocaInst(P->getType(), nullptr, P->getName() + ".reg2mem", + &F->getEntryBlock().front()); } // Iterate over each operand inserting a store in each predecessor. @@ -133,12 +132,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { } // Insert a load in place of the PHI and replace all uses. - BasicBlock::iterator InsertPt = P; + BasicBlock::iterator InsertPt = P->getIterator(); - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt) /* empty */; // Don't insert before PHI nodes or landingpad instrs. - Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt); + Value *V = new LoadInst(Slot, P->getName() + ".reload", &*InsertPt); P->replaceAllUsesWith(V); // Delete PHI. diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 4eb3e3dd17d2..492ae9f69a65 100644 --- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -28,12 +28,11 @@ class FlattenCFGOpt { AliasAnalysis *AA; /// \brief Use parallel-and or parallel-or to generate conditions for /// conditional branches. - bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P = nullptr); + bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder); /// \brief If \param BB is the merge block of an if-region, attempt to merge /// the if-region with an adjacent if-region upstream if two if-regions /// contain identical instructions. 
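The DemoteRegToStack.cpp changes above keep the existing behavior (note the landingpad checks generalizing to isEHPad()); the helpers themselves are typically driven reg2mem-style. A sketch of such a driver (the wrapper is ours, not part of the commit):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Demote every PHI node in F to a stack slot; each call erases the PHI
    // and replaces its uses with a reload from the new alloca.
    void demoteAllPHIs(Function &F) {
      for (BasicBlock &BB : F)
        while (PHINode *PN = dyn_cast<PHINode>(&BB.front()))
          DemotePHIToStack(PN);
    }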
- bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = nullptr); + bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder); /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which /// are from two if-regions whose entry blocks are \p Head1 and \p /// Head2. \returns true if \p Block1 and \p Block2 contain identical @@ -122,8 +121,7 @@ public: /// its predecessor. In Case 2, \param BB (BB3) only has conditional branches /// as its predecessors. /// -bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P) { +bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { PHINode *PHI = dyn_cast<PHINode>(BB->begin()); if (PHI) return false; // For simplicity, avoid cases containing PHI nodes. @@ -177,8 +175,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, // Instructions in the internal condition blocks should be safe // to hoist up. - for (BasicBlock::iterator BI = Pred->begin(), BE = PBI; BI != BE;) { - Instruction *CI = BI++; + for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator(); + BI != BE;) { + Instruction *CI = &*BI++; if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI)) return false; } @@ -315,7 +314,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, BasicBlock *Block1, BasicBlock *Block2) { TerminatorInst *PTI2 = Head2->getTerminator(); - Instruction *PBI2 = Head2->begin(); + Instruction *PBI2 = &Head2->front(); bool eq1 = (Block1 == Head1); bool eq2 = (Block2 == Head2); @@ -327,9 +326,9 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, // Check whether instructions in Block1 and Block2 are identical // and do not alias with instructions in Head2. BasicBlock::iterator iter1 = Block1->begin(); - BasicBlock::iterator end1 = Block1->getTerminator(); + BasicBlock::iterator end1 = Block1->getTerminator()->getIterator(); BasicBlock::iterator iter2 = Block2->begin(); - BasicBlock::iterator end2 = Block2->getTerminator(); + BasicBlock::iterator end2 = Block2->getTerminator()->getIterator(); while (1) { if (iter1 == end1) { @@ -338,7 +337,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, break; } - if (!iter1->isIdenticalTo(iter2)) + if (!iter1->isIdenticalTo(&*iter2)) return false; // Illegal to remove instructions with side effects except @@ -356,10 +355,10 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, return false; if (iter1->mayWriteToMemory()) { - for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { + for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) { // Check alias with Head2. 
- if (!AA || AA->alias(iter1, BI)) + if (!AA || AA->alias(&*iter1, &*BI)) return false; } } @@ -386,8 +385,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, /// if (a || b) /// statement; /// -bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P) { +bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { BasicBlock *IfTrue2, *IfFalse2; Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2); Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2); @@ -413,7 +411,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, return false; TerminatorInst *PTI2 = SecondEntryBlock->getTerminator(); - Instruction *PBI2 = SecondEntryBlock->begin(); + Instruction *PBI2 = &SecondEntryBlock->front(); if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1, IfTrue2)) @@ -425,8 +423,8 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, // Check whether \param SecondEntryBlock has side-effect and is safe to // speculate. - for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { - Instruction *CI = BI; + for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { + Instruction *CI = &*BI; if (isa<PHINode>(CI) || CI->mayHaveSideEffects() || !isSafeToSpeculativelyExecute(CI)) return false; diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 44b7d25d519a..3893a752503b 100644 --- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -49,6 +49,10 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) { static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, SmallPtrSetImpl<const PHINode *> &PhiUsers) { + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (GV->isExternallyInitialized()) + GS.StoredType = GlobalStatus::StoredOnce; + for (const Use &U : V->uses()) { const User *UR = U.getUser(); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) { diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index d2d60d7cd9f6..14574119b9a8 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -13,14 +13,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Attributes.h" @@ -41,6 +42,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CommandLine.h" #include <algorithm> + using namespace llvm; static cl::opt<bool> @@ -54,17 +56,17 @@ PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining", cl::desc("Convert align attributes to assumptions during inlining.")); bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, - bool InsertLifetime) { - return InlineFunction(CallSite(CI), IFI, InsertLifetime); + AAResults *CalleeAAR, bool InsertLifetime) { + return InlineFunction(CallSite(CI), IFI, 
CalleeAAR, InsertLifetime); } bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, - bool InsertLifetime) { - return InlineFunction(CallSite(II), IFI, InsertLifetime); + AAResults *CalleeAAR, bool InsertLifetime) { + return InlineFunction(CallSite(II), IFI, CalleeAAR, InsertLifetime); } namespace { - /// A class for recording information about inlining through an invoke. - class InvokeInliningInfo { + /// A class for recording information about inlining a landing pad. + class LandingPadInliningInfo { BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind. BasicBlock *InnerResumeDest; ///< Destination for the callee's resume. LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke. @@ -72,7 +74,7 @@ namespace { SmallVector<Value*, 8> UnwindDestPHIValues; public: - InvokeInliningInfo(InvokeInst *II) + LandingPadInliningInfo(InvokeInst *II) : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr), CallerLPad(nullptr), InnerEHValuesPHI(nullptr) { // If there are PHI nodes in the unwind destination block, we need to keep @@ -121,14 +123,14 @@ namespace { } } }; -} +} // anonymous namespace /// Get or create a target for the branch from ResumeInsts. -BasicBlock *InvokeInliningInfo::getInnerResumeDest() { +BasicBlock *LandingPadInliningInfo::getInnerResumeDest() { if (InnerResumeDest) return InnerResumeDest; // Split the landing pad. - BasicBlock::iterator SplitPoint = CallerLPad; ++SplitPoint; + BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator(); InnerResumeDest = OuterResumeDest->splitBasicBlock(SplitPoint, OuterResumeDest->getName() + ".body"); @@ -137,7 +139,7 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { const unsigned PHICapacity = 2; // Create corresponding new PHIs for all the PHIs in the outer landing pad. - BasicBlock::iterator InsertPoint = InnerResumeDest->begin(); + Instruction *InsertPoint = &InnerResumeDest->front(); BasicBlock::iterator I = OuterResumeDest->begin(); for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { PHINode *OuterPHI = cast<PHINode>(I); @@ -162,8 +164,8 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { /// When the landing pad block has only one predecessor, this is a simple /// branch. When there is more than one predecessor, we need to split the /// landing pad block after the landingpad instruction and jump to there. -void InvokeInliningInfo::forwardResume(ResumeInst *RI, - SmallPtrSetImpl<LandingPadInst*> &InlinedLPads) { +void LandingPadInliningInfo::forwardResume( + ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) { BasicBlock *Dest = getInnerResumeDest(); BasicBlock *Src = RI->getParent(); @@ -182,33 +184,39 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI, /// This function analyzes BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. -static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, - InvokeInliningInfo &Invoke) { +static BasicBlock * +HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) { for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { - Instruction *I = BBI++; + Instruction *I = &*BBI++; // We only need to check for function calls: inlined invoke // instructions require no special handling. CallInst *CI = dyn_cast<CallInst>(I); - // If this call cannot unwind, don't convert it to an invoke.
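One idiom worth calling out from getInnerResumeDest above: ++CallerLPad->getIterator() is the new spelling of "the point just past this instruction", used to split the landing pad block right after the landingpad. Generalized as a sketch (splitAfter is our name for it):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // Split I's block immediately after I; I stays in the original block and
    // everything following it moves to the returned successor.
    // Precondition: I is not the block's terminator.
    BasicBlock *splitAfter(Instruction *I) {
      BasicBlock::iterator SplitPoint = ++I->getIterator();
      return I->getParent()->splitBasicBlock(SplitPoint,
                                             I->getParent()->getName() + ".split");
    }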
- // Inline asm calls cannot throw. if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue())) continue; // Convert this function call into an invoke instruction. First, split the // basic block. - BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc"); + BasicBlock *Split = + BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc"); // Delete the unconditional branch inserted by splitBasicBlock BB->getInstList().pop_back(); // Create the new invoke instruction. - ImmutableCallSite CS(CI); - SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); - InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, - Invoke.getOuterResumeDest(), - InvokeArgs, CI->getName(), BB); + SmallVector<Value*, 8> InvokeArgs(CI->arg_begin(), CI->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + + CI->getOperandBundlesAsDefs(OpBundles); + + // Note: we're round tripping operand bundles through memory here, and that + // can potentially be avoided with a cleverer API design that we do not have + // as of this time. + + InvokeInst *II = + InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge, InvokeArgs, + OpBundles, CI->getName(), BB); II->setDebugLoc(CI->getDebugLoc()); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); @@ -219,12 +227,9 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // Delete the original call Split->getInstList().pop_front(); - - // Update any PHI nodes in the exceptional block to indicate that there is - // now a new entry in them. - Invoke.addIncomingPHIValuesFor(BB); - return; + return BB; } + return nullptr; } /// If we inlined an invoke site, we need to convert calls @@ -233,8 +238,8 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, /// II is the invoke instruction being inlined. FirstNewBlock is the first /// block of the inlined code (the last block is the end of the function), /// and InlineCodeInfo is information about the code that got inlined. -static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, - ClonedCodeInfo &InlinedCodeInfo) { +static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock, + ClonedCodeInfo &InlinedCodeInfo) { BasicBlock *InvokeDest = II->getUnwindDest(); Function *Caller = FirstNewBlock->getParent(); @@ -242,11 +247,12 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, // The inlined code is currently at the end of the function, scan from the // start of the inlined code to its end, checking for stuff we need to // rewrite. - InvokeInliningInfo Invoke(II); + LandingPadInliningInfo Invoke(II); // Get all of the inlined landing pad instructions. 
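The hunk above is where operand bundles round-trip through OperandBundleDef when a call becomes an invoke. Distilled to its core (the helper signature is assumed for illustration; the real code also splits the block and erases the original call):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Re-issue CI as an invoke, preserving its arguments and operand bundles.
    InvokeInst *reissueAsInvoke(CallInst *CI, BasicBlock *NormalDest,
                                BasicBlock *UnwindEdge, BasicBlock *AtEndOf) {
      SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles); // copy bundles out, then back in
      return InvokeInst::Create(CI->getCalledValue(), NormalDest, UnwindEdge,
                                Args, OpBundles, CI->getName(), AtEndOf);
    }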
SmallPtrSet<LandingPadInst*, 16> InlinedLPads; - for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I) + for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end(); + I != E; ++I) if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) InlinedLPads.insert(II->getLandingPadInst()); @@ -262,9 +268,14 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InlinedLPad->setCleanup(true); } - for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ + for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); + BB != E; ++BB) { if (InlinedCodeInfo.ContainsCalls) - HandleCallsInBlockInlinedThroughInvoke(BB, Invoke); + if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( + &*BB, Invoke.getOuterResumeDest())) + // Update any PHI nodes in the exceptional block to indicate that there + // is now a new entry in them. + Invoke.addIncomingPHIValuesFor(NewBB); // Forward any resumes that are remaining here. if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) @@ -278,6 +289,99 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InvokeDest->removePredecessor(II->getParent()); } +/// If we inlined an invoke site, we need to convert calls +/// in the body of the inlined function into invokes. +/// +/// II is the invoke instruction being inlined. FirstNewBlock is the first +/// block of the inlined code (the last block is the end of the function), +/// and InlineCodeInfo is information about the code that got inlined. +static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, + ClonedCodeInfo &InlinedCodeInfo) { + BasicBlock *UnwindDest = II->getUnwindDest(); + Function *Caller = FirstNewBlock->getParent(); + + assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!"); + + // If there are PHI nodes in the unwind destination block, we need to keep + // track of which values came into them from the invoke before removing the + // edge from this block. + SmallVector<Value *, 8> UnwindDestPHIValues; + llvm::BasicBlock *InvokeBB = II->getParent(); + for (Instruction &I : *UnwindDest) { + // Save the value to use for this edge. + PHINode *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); + } + + // Add incoming-PHI values to the unwind destination block for the given basic + // block, using the values for the original invoke's source block. + auto UpdatePHINodes = [&](BasicBlock *Src) { + BasicBlock::iterator I = UnwindDest->begin(); + for (Value *V : UnwindDestPHIValues) { + PHINode *PHI = cast<PHINode>(I); + PHI->addIncoming(V, Src); + ++I; + } + }; + + // This connects all the instructions which 'unwind to caller' to the invoke + // destination. 
+ for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); + BB != E; ++BB) { + if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) { + if (CRI->unwindsToCaller()) { + CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI); + CRI->eraseFromParent(); + UpdatePHINodes(&*BB); + } + } + + Instruction *I = BB->getFirstNonPHI(); + if (!I->isEHPad()) + continue; + + Instruction *Replacement = nullptr; + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { + if (CatchSwitch->unwindsToCaller()) { + auto *NewCatchSwitch = CatchSwitchInst::Create( + CatchSwitch->getParentPad(), UnwindDest, + CatchSwitch->getNumHandlers(), CatchSwitch->getName(), + CatchSwitch); + for (BasicBlock *PadBB : CatchSwitch->handlers()) + NewCatchSwitch->addHandler(PadBB); + Replacement = NewCatchSwitch; + } + } else if (!isa<FuncletPadInst>(I)) { + llvm_unreachable("unexpected EHPad!"); + } + + if (Replacement) { + Replacement->takeName(I); + I->replaceAllUsesWith(Replacement); + I->eraseFromParent(); + UpdatePHINodes(&*BB); + } + } + + if (InlinedCodeInfo.ContainsCalls) + for (Function::iterator BB = FirstNewBlock->getIterator(), + E = Caller->end(); + BB != E; ++BB) + if (BasicBlock *NewBB = + HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest)) + // Update any PHI nodes in the exceptional block to indicate that there + // is now a new entry in them. + UpdatePHINodes(NewBB); + + // Now that everything is happy, we have one final detail. The PHI nodes in + // the exception destination block still have entries due to the original + // invoke instruction. Eliminate these entries (which might even delete the + // PHI node) now. + UnwindDest->removePredecessor(InvokeBB); +} + /// When inlining a function that contains noalias scope metadata, /// this metadata needs to be cloned so that the inlined blocks /// have different "unique scopes" at every call site. Were this not done, then @@ -395,17 +499,16 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { /// parameters with noalias metadata specifying the new scope, and tag all /// non-derived loads, stores and memory intrinsics with the new alias scopes.
static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, - const DataLayout &DL, AliasAnalysis *AA) { + const DataLayout &DL, AAResults *CalleeAAR) { if (!EnableNoAliasConversion) return; const Function *CalledFunc = CS.getCalledFunction(); SmallVector<const Argument *, 4> NoAliasArgs; - for (Function::const_arg_iterator I = CalledFunc->arg_begin(), - E = CalledFunc->arg_end(); I != E; ++I) { - if (I->hasNoAliasAttr() && !I->hasNUses(0)) - NoAliasArgs.push_back(I); + for (const Argument &I : CalledFunc->args()) { + if (I.hasNoAliasAttr() && !I.hasNUses(0)) + NoAliasArgs.push_back(&I); } if (NoAliasArgs.empty()) @@ -480,10 +583,10 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, continue; IsFuncCall = true; - if (AA) { - AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(ICS); - if (MRB == AliasAnalysis::OnlyAccessesArgumentPointees || - MRB == AliasAnalysis::OnlyReadsArgumentPointees) + if (CalleeAAR) { + FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(ICS); + if (MRB == FMRB_OnlyAccessesArgumentPointees || + MRB == FMRB_OnlyReadsArgumentPointees) IsArgMemOnlyCall = true; } @@ -518,7 +621,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) { SmallVector<Value *, 4> Objects; GetUnderlyingObjects(const_cast<Value*>(PtrArgs[i]), - Objects, DL, /* MaxLookup = */ 0); + Objects, DL, /* LI = */ nullptr); for (Value *O : Objects) ObjSet.insert(O); @@ -646,7 +749,7 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { // caller, then don't bother inserting the assumption. Value *Arg = CS.getArgument(I->getArgNo()); if (getKnownAlignment(Arg, DL, CS.getInstruction(), - &IFI.ACT->getAssumptionCache(*CalledFunc), + &IFI.ACT->getAssumptionCache(*CS.getCaller()), &DT) >= Align) continue; @@ -731,7 +834,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, BasicBlock *InsertBlock, InlineFunctionInfo &IFI) { Type *AggTy = cast<PointerType>(Src->getType())->getElementType(); - IRBuilder<> Builder(InsertBlock->begin()); + IRBuilder<> Builder(InsertBlock, InsertBlock->begin()); Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy)); @@ -851,9 +954,8 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, // Starting from the top, rebuild the nodes to point to the new inlined-at // location (then rebuilding the rest of the chain behind it) and update the // map of already-constructed inlined-at nodes. - for (auto I = InlinedAtLocations.rbegin(), E = InlinedAtLocations.rend(); - I != E; ++I) { - const DILocation *MD = *I; + for (const DILocation *MD : make_range(InlinedAtLocations.rbegin(), + InlinedAtLocations.rend())) { Last = IANodes[MD] = DILocation::getDistinct( Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); } @@ -917,7 +1019,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, /// exists in the instruction stream. Similarly this will inline a recursive /// function by one level. bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, - bool InsertLifetime) { + AAResults *CalleeAAR, bool InsertLifetime) { Instruction *TheCall = CS.getInstruction(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); @@ -930,6 +1032,22 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CalledFunc->isDeclaration() || // call, or call to a vararg function! 
CalledFunc->getFunctionType()->isVarArg()) return false; + // The inliner does not know how to inline through calls with operand bundles + // in general ... + if (CS.hasOperandBundles()) { + for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) { + uint32_t Tag = CS.getOperandBundleAt(i).getTagID(); + // ... but it knows how to inline through "deopt" operand bundles ... + if (Tag == LLVMContext::OB_deopt) + continue; + // ... and "funclet" operand bundles. + if (Tag == LLVMContext::OB_funclet) + continue; + + return false; + } + } + // If the call to the callee cannot throw, set the 'nounwind' flag on any // calls that we inline. bool MarkNoUnwind = CS.doesNotThrow(); @@ -950,13 +1068,17 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Get the personality function from the callee if it contains a landing pad. Constant *CalledPersonality = - CalledFunc->hasPersonalityFn() ? CalledFunc->getPersonalityFn() : nullptr; + CalledFunc->hasPersonalityFn() + ? CalledFunc->getPersonalityFn()->stripPointerCasts() + : nullptr; // Find the personality function used by the landing pads of the caller. If it // exists, then check to see that it matches the personality function used in // the callee. Constant *CallerPersonality = - Caller->hasPersonalityFn() ? Caller->getPersonalityFn() : nullptr; + Caller->hasPersonalityFn() + ? Caller->getPersonalityFn()->stripPointerCasts() + : nullptr; if (CalledPersonality) { if (!CallerPersonality) Caller->setPersonalityFn(CalledPersonality); @@ -968,9 +1090,46 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, return false; } + // We need to figure out which funclet the callsite was in so that we may + // properly nest the callee. + Instruction *CallSiteEHPad = nullptr; + if (CallerPersonality) { + EHPersonality Personality = classifyEHPersonality(CallerPersonality); + if (isFuncletEHPersonality(Personality)) { + Optional<OperandBundleUse> ParentFunclet = + CS.getOperandBundle(LLVMContext::OB_funclet); + if (ParentFunclet) + CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); + + // OK, the inlining site is legal. What about the target function? + + if (CallSiteEHPad) { + if (Personality == EHPersonality::MSVC_CXX) { + // The MSVC personality cannot tolerate catches getting inlined into + // cleanup funclets. + if (isa<CleanupPadInst>(CallSiteEHPad)) { + // Ok, the call site is within a cleanuppad. Let's check the callee + // for catchpads. + for (const BasicBlock &CalledBB : *CalledFunc) { + if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI())) + return false; + } + } + } else if (isAsynchronousEHPersonality(Personality)) { + // SEH is even less tolerant, there may not be any sort of exceptional + // funclet in the callee. + for (const BasicBlock &CalledBB : *CalledFunc) { + if (CalledBB.isEHPad()) + return false; + } + } + } + } + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. - Function::iterator LastBlock = &Caller->back(); + Function::iterator LastBlock = --Caller->end(); // Make sure to capture all of the return instructions from the cloned // function. @@ -1007,7 +1166,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI)); } - VMap[I] = ActualArg; + VMap[&*I] = ActualArg; } // Add alignment assumptions if necessary. 
We do this before the inlined @@ -1029,7 +1188,61 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Inject byval arguments initialization. for (std::pair<Value*, Value*> &Init : ByValInit) HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), - FirstNewBlock, IFI); + &*FirstNewBlock, IFI); + + Optional<OperandBundleUse> ParentDeopt = + CS.getOperandBundle(LLVMContext::OB_deopt); + if (ParentDeopt) { + SmallVector<OperandBundleDef, 2> OpDefs; + + for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) { + Instruction *I = dyn_cast_or_null<Instruction>(VH); + if (!I) continue; // instruction was DCE'd or RAUW'ed to undef + + OpDefs.clear(); + + CallSite ICS(I); + OpDefs.reserve(ICS.getNumOperandBundles()); + + for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) { + auto ChildOB = ICS.getOperandBundleAt(i); + if (ChildOB.getTagID() != LLVMContext::OB_deopt) { + // If the inlined call has other operand bundles, let them be + OpDefs.emplace_back(ChildOB); + continue; + } + + // It may be useful to separate this logic (of handling operand + // bundles) out to a separate "policy" component if this gets crowded. + // Prepend the parent's deoptimization continuation to the newly + // inlined call's deoptimization continuation. + std::vector<Value *> MergedDeoptArgs; + MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() + + ChildOB.Inputs.size()); + + MergedDeoptArgs.insert(MergedDeoptArgs.end(), + ParentDeopt->Inputs.begin(), + ParentDeopt->Inputs.end()); + MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(), + ChildOB.Inputs.end()); + + OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); + } + + Instruction *NewI = nullptr; + if (isa<CallInst>(I)) + NewI = CallInst::Create(cast<CallInst>(I), OpDefs, I); + else + NewI = InvokeInst::Create(cast<InvokeInst>(I), OpDefs, I); + + // Note: the RAUW does the appropriate fixup in VMap, so we need to do + // this even if the call returns void. + I->replaceAllUsesWith(NewI); + + VH = nullptr; + I->eraseFromParent(); + } + } // Update the callgraph if requested. if (IFI.CG) @@ -1042,7 +1255,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CloneAliasScopeMetadata(CS, VMap); // Add noalias metadata if necessary. - AddAliasScopeMetadata(CS, VMap, DL, IFI.AA); + AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -1085,9 +1298,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Transfer all of the allocas over in a block. Using splice means // that the instructions aren't removed from the symbol table, then // reinserted. - Caller->getEntryBlock().getInstList().splice(InsertPoint, - FirstNewBlock->getInstList(), - AI, I); + Caller->getEntryBlock().getInstList().splice( + InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); } // Move any dbg.declares describing the allocas into the entry basic block. DIBuilder DIB(*Caller->getParent()); @@ -1137,7 +1349,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Leave lifetime markers for the static alloca's, scoping them to the // function we just inlined. 
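To make the merge order in the MergedDeoptArgs logic above concrete: the parent call site's deoptimization state is prepended because its frame is logically outermost. A self-contained restatement (the helper name is ours):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/InstrTypes.h"
    #include <vector>
    using namespace llvm;

    // E.g. a parent "deopt"(a, b) merged with an inlined call's "deopt"(c)
    // yields "deopt"(a, b, c): outer frame state first, then the inlinee's.
    OperandBundleDef mergeDeoptStates(ArrayRef<Value *> ParentInputs,
                                      ArrayRef<Value *> ChildInputs) {
      std::vector<Value *> Merged;
      Merged.reserve(ParentInputs.size() + ChildInputs.size());
      Merged.insert(Merged.end(), ParentInputs.begin(), ParentInputs.end());
      Merged.insert(Merged.end(), ChildInputs.begin(), ChildInputs.end());
      return OperandBundleDef("deopt", std::move(Merged));
    }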
if (InsertLifetime && !IFI.StaticAllocas.empty()) { - IRBuilder<> builder(FirstNewBlock->begin()); + IRBuilder<> builder(&FirstNewBlock->front()); for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) { AllocaInst *AI = IFI.StaticAllocas[ai]; @@ -1189,7 +1401,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore); // Insert the llvm.stacksave. - CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin()) + CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin()) .CreateCall(StackSave, {}, "savedstack"); // Insert a call to llvm.stackrestore before any return instructions in the @@ -1203,10 +1415,74 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } + // Update the lexical scopes of the new funclets and callsites. + // Anything that had 'none' as its parent is now nested inside the callsite's + // EHPad. + + if (CallSiteEHPad) { + for (Function::iterator BB = FirstNewBlock->getIterator(), + E = Caller->end(); + BB != E; ++BB) { + // Add bundle operands to any top-level call sites. + SmallVector<OperandBundleDef, 1> OpBundles; + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { + Instruction *I = &*BBI++; + CallSite CS(I); + if (!CS) + continue; + + // Skip call sites which are nounwind intrinsics. + auto *CalledFn = + dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) + continue; + + // Skip call sites which already have a "funclet" bundle. + if (CS.getOperandBundle(LLVMContext::OB_funclet)) + continue; + + CS.getOperandBundlesAsDefs(OpBundles); + OpBundles.emplace_back("funclet", CallSiteEHPad); + + Instruction *NewInst; + if (CS.isCall()) + NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I); + else + NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I); + NewInst->setDebugLoc(I->getDebugLoc()); + NewInst->takeName(I); + I->replaceAllUsesWith(NewInst); + I->eraseFromParent(); + + OpBundles.clear(); + } + + Instruction *I = BB->getFirstNonPHI(); + if (!I->isEHPad()) + continue; + + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { + if (isa<ConstantTokenNone>(CatchSwitch->getParentPad())) + CatchSwitch->setParentPad(CallSiteEHPad); + } else { + auto *FPI = cast<FuncletPadInst>(I); + if (isa<ConstantTokenNone>(FPI->getParentPad())) + FPI->setParentPad(CallSiteEHPad); + } + } + } + // If we are inlining for an invoke instruction, we must make sure to rewrite // any call instructions into invoke instructions. - if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) - HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo); + if (auto *II = dyn_cast<InvokeInst>(TheCall)) { + BasicBlock *UnwindDest = II->getUnwindDest(); + Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); + if (isa<LandingPadInst>(FirstNonPHI)) { + HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } else { + HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } + } // Handle any inlined musttail call sites. In order for a new call site to be // musttail, the source of the clone and the inlined call site must have been @@ -1250,7 +1526,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // the calling basic block. if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) { // Move all of the instructions right before the call. 
- OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(), + OrigBB->getInstList().splice(TheCall->getIterator(), + FirstNewBlock->getInstList(), FirstNewBlock->begin(), FirstNewBlock->end()); // Remove the cloned basic block. Caller->getBasicBlockList().pop_back(); @@ -1297,15 +1574,16 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more // symmetric to the call case. - AfterCallBB = OrigBB->splitBasicBlock(CreatedBranchToNormalDest, - CalledFunc->getName()+".exit"); + AfterCallBB = + OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(), + CalledFunc->getName() + ".exit"); } else { // It's a call // If this is a call instruction, we need to split the basic block that // the call lives in. // - AfterCallBB = OrigBB->splitBasicBlock(TheCall, - CalledFunc->getName()+".exit"); + AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(), + CalledFunc->getName() + ".exit"); } // Change the branch that used to go to AfterCallBB to branch to the first @@ -1314,14 +1592,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, TerminatorInst *Br = OrigBB->getTerminator(); assert(Br && Br->getOpcode() == Instruction::Br && "splitBasicBlock broken!"); - Br->setOperand(0, FirstNewBlock); - + Br->setOperand(0, &*FirstNewBlock); // Now that the function is correct, make it a little bit nicer. In // particular, move the basic blocks inserted from the end of the function // into the space made by splitting the source basic block. - Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(), - FirstNewBlock, Caller->end()); + Caller->getBasicBlockList().splice(AfterCallBB->getIterator(), + Caller->getBasicBlockList(), FirstNewBlock, + Caller->end()); // Handle all of the return instructions that we just cloned in, and eliminate // any users of the original call/invoke instruction. @@ -1333,7 +1611,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // possible incoming values. if (!TheCall->use_empty()) { PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(), - AfterCallBB->begin()); + &AfterCallBB->front()); // Anything that used the result of the function call should now use the // PHI node as their operand. TheCall->replaceAllUsesWith(PHI); @@ -1350,7 +1628,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } - // Add a branch to the merge points and remove return instructions. DebugLoc Loc; for (unsigned i = 0, e = Returns.size(); i != e; ++i) { @@ -1413,7 +1690,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Splice the code entry block into calling block, right before the // unconditional branch. CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes - OrigBB->getInstList().splice(Br, CalleeEntry->getInstList()); + OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList()); // Remove the unconditional branch. 
OrigBB->getInstList().erase(Br); diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 30edf3b7aae4..5687afa61e2a 100644 --- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -380,14 +380,10 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { IRBuilder<> Builder(Rem); - Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); - - unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - - if (RemTyBitWidth != 32 && RemTyBitWidth != 64) - llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported"); + assert((Rem->getType()->getIntegerBitWidth() == 32 || + Rem->getType()->getIntegerBitWidth() == 64) && + "Div of bitwidth other than 32 or 64 not supported"); // First prepare the sign if it's a signed remainder if (Rem->getOpcode() == Instruction::SRem) { @@ -401,7 +397,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { // If we didn't actually generate an urem instruction, we're done // This happens for example if the input were constant. In this case the // Builder insertion point was unchanged - if (Rem == Builder.GetInsertPoint()) + if (Rem == Builder.GetInsertPoint().getNodePtrUnchecked()) return true; BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -440,14 +436,10 @@ bool llvm::expandDivision(BinaryOperator *Div) { IRBuilder<> Builder(Div); - Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); - - unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - - if (DivTyBitWidth != 32 && DivTyBitWidth != 64) - llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + assert(!Div->getType()->isVectorTy() && "Div over vectors not supported"); + assert((Div->getType()->getIntegerBitWidth() == 32 || + Div->getType()->getIntegerBitWidth() == 64) && + "Div of bitwidth other than 32 or 64 not supported"); // First prepare the sign if it's a signed division if (Div->getOpcode() == Instruction::SDiv) { @@ -461,7 +453,7 @@ bool llvm::expandDivision(BinaryOperator *Div) { // If we didn't actually generate an udiv instruction, we're done // This happens for example if the input were constant. 
In this case the // Builder insertion point was unchanged - if (Div == Builder.GetInsertPoint()) + if (Div == Builder.GetInsertPoint().getNodePtrUnchecked()) return true; BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -492,15 +484,14 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) { "Trying to expand remainder from a non-remainder function"); Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!RemTy->isVectorTy() && "Div over vectors not supported"); unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - if (RemTyBitWidth > 32) - llvm_unreachable("Div of bitwidth greater than 32 not supported"); + assert(RemTyBitWidth <= 32 && + "Div of bitwidth greater than 32 not supported"); - if (RemTyBitWidth == 32) + if (RemTyBitWidth == 32) return expandRemainder(Rem); // If bitwidth smaller than 32 extend inputs, extend output and proceed @@ -542,15 +533,13 @@ bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) { "Trying to expand remainder from a non-remainder function"); Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!RemTy->isVectorTy() && "Div over vectors not supported"); unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - if (RemTyBitWidth > 64) - llvm_unreachable("Div of bitwidth greater than 64 not supported"); + assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported"); - if (RemTyBitWidth == 64) + if (RemTyBitWidth == 64) return expandRemainder(Rem); // If bitwidth smaller than 64 extend inputs, extend output and proceed @@ -593,13 +582,11 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) { "Trying to expand division from a non-division function"); Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!DivTy->isVectorTy() && "Div over vectors not supported"); unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - if (DivTyBitWidth > 32) - llvm_unreachable("Div of bitwidth greater than 32 not supported"); + assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported"); if (DivTyBitWidth == 32) return expandDivision(Div); @@ -643,13 +630,12 @@ bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) { "Trying to expand division from a non-division function"); Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!DivTy->isVectorTy() && "Div over vectors not supported"); unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - if (DivTyBitWidth > 64) - llvm_unreachable("Div of bitwidth greater than 64 not supported"); + assert(DivTyBitWidth <= 64 && + "Div of bitwidth greater than 64 not supported"); if (DivTyBitWidth == 64) return expandDivision(Div); diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 9d40b6989d6e..b4b2e148dfbb 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -31,8 +31,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -64,6 +66,13 @@ static bool 
processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, PredIteratorCache &PredCache, LoopInfo *LI) { SmallVector<Use *, 16> UsesToRewrite; + // Tokens cannot be used in PHI nodes, so we skip over them. + // We can run into tokens which are live out of a loop with catchswitch + // instructions in Windows EH if the catchswitch has one catchpad which + // is inside the loop and another which is not. + if (Inst.getType()->isTokenTy()) + return false; + BasicBlock *InstBB = Inst.getParent(); for (Use &U : Inst.uses()) { @@ -84,9 +93,8 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Invoke instructions are special in that their result value is not available // along their unwind edge. The code below tests to see whether DomBB - // dominates - // the value, so adjust DomBB to the normal destination block, which is - // effectively where the value is first usable. + // dominates the value, so adjust DomBB to the normal destination block, + // which is effectively where the value is first usable. BasicBlock *DomBB = Inst.getParent(); if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst)) DomBB = Inv->getNormalDest(); @@ -101,10 +109,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. - for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(), - BBE = ExitBlocks.end(); - BBI != BBE; ++BBI) { - BasicBlock *ExitBB = *BBI; + for (BasicBlock *ExitBB : ExitBlocks) { if (!DT.dominates(DomNode, DT.getNode(ExitBB))) continue; @@ -113,7 +118,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, continue; PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB), - Inst.getName() + ".lcssa", ExitBB->begin()); + Inst.getName() + ".lcssa", &ExitBB->front()); // Add inputs from inside the loop for this PHI. for (BasicBlock *Pred : PredCache.get(ExitBB)) { @@ -148,26 +153,26 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Rewrite all uses outside the loop in terms of the new PHIs we just // inserted. - for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) { + for (Use *UseToRewrite : UsesToRewrite) { // If this use is in an exit block, rewrite to use the newly inserted PHI. // This is required for correctness because SSAUpdate doesn't handle uses in // the same block. It assumes the PHI we inserted is at the end of the // block. - Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser()); + Instruction *User = cast<Instruction>(UseToRewrite->getUser()); BasicBlock *UserBB = User->getParent(); if (PHINode *PN = dyn_cast<PHINode>(User)) - UserBB = PN->getIncomingBlock(*UsesToRewrite[i]); + UserBB = PN->getIncomingBlock(*UseToRewrite); if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { // Tell the VHs that the uses changed. This updates SCEV's caches. - if (UsesToRewrite[i]->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin()); - UsesToRewrite[i]->set(UserBB->begin()); + if (UseToRewrite->get()->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); + UseToRewrite->set(&UserBB->front()); continue; } // Otherwise, do full PHI insertion. 
- SSAUpdate.RewriteUse(*UsesToRewrite[i]); + SSAUpdate.RewriteUse(*UseToRewrite); } // Post process PHI instructions that were inserted into another disjoint loop @@ -190,10 +195,9 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, } // Remove PHI nodes that did not have any uses rewritten. - for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) { - if (AddedPHIs[i]->use_empty()) - AddedPHIs[i]->eraseFromParent(); - } + for (PHINode *PN : AddedPHIs) + if (PN->use_empty()) + PN->eraseFromParent(); return true; } @@ -205,8 +209,8 @@ blockDominatesAnExit(BasicBlock *BB, DominatorTree &DT, const SmallVectorImpl<BasicBlock *> &ExitBlocks) { DomTreeNode *DomNode = DT.getNode(BB); - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (DT.dominates(DomNode, DT.getNode(ExitBlocks[i]))) + for (BasicBlock *ExitBB : ExitBlocks) + if (DT.dominates(DomNode, DT.getNode(ExitBB))) return true; return false; @@ -227,25 +231,22 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, // Look at all the instructions in the loop, checking to see if they have uses // outside the loop. If so, rewrite those uses. - for (Loop::block_iterator BBI = L.block_begin(), BBE = L.block_end(); - BBI != BBE; ++BBI) { - BasicBlock *BB = *BBI; - + for (BasicBlock *BB : L.blocks()) { // For large loops, avoid use-scanning by using dominance information: In // particular, if a block does not dominate any of the loop exits, then none // of the values defined in the block could be used outside the loop. if (!blockDominatesAnExit(BB, DT, ExitBlocks)) continue; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Reject two common cases fast: instructions with no uses (like stores) // and instructions with one use that is in the same block as this. - if (I->use_empty() || - (I->hasOneUse() && I->user_back()->getParent() == BB && - !isa<PHINode>(I->user_back()))) + if (I.use_empty() || + (I.hasOneUse() && I.user_back()->getParent() == BB && + !isa<PHINode>(I.user_back()))) continue; - Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache, LI); + Changed |= processInstruction(L, I, DT, ExitBlocks, PredCache, LI); } } @@ -266,8 +267,8 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, bool Changed = false; // Recurse depth-first through inner loops. 
- for (Loop::iterator I = L.begin(), E = L.end(); I != E; ++I) - Changed |= formLCSSARecursively(**I, DT, LI, SE); + for (Loop *SubLoop : L.getSubLoops()) + Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE); Changed |= formLCSSA(L, DT, LI, SE); return Changed; @@ -296,8 +297,10 @@ struct LCSSA : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); } }; } @@ -306,6 +309,8 @@ char LCSSA::ID = 0; INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) Pass *llvm::createLCSSAPass() { return new LCSSA(); } @@ -317,7 +322,8 @@ bool LCSSA::runOnFunction(Function &F) { bool Changed = false; LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + SE = SEWP ? &SEWP->getSE() : nullptr; // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index ba8af47b54e1..e75163f323df 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -17,10 +17,11 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -188,9 +189,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, BasicBlock *BB = SI->getParent(); // Remove entries from PHI nodes which we no longer branch to... - for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) { + for (BasicBlock *Succ : SI->successors()) { // Found case matching a constant operand? - BasicBlock *Succ = SI->getSuccessor(i); if (Succ == TheOnlyDest) TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest else @@ -230,6 +230,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, SIDef->getValue().getZExtValue())); } + // Update make.implicit metadata to the newly-created conditional branch. + MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit); + if (MakeImplicitMD) + NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD); + // Delete the old switch. SI->eraseFromParent(); return true; @@ -283,8 +288,9 @@ bool llvm::isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI) { if (!I->use_empty() || isa<TerminatorInst>(I)) return false; - // We don't want the landingpad instruction removed by anything this general. 
- if (isa<LandingPadInst>(I)) + // We don't want the landingpad-like instructions removed by anything this + // general. + if (I->isEHPad()) return false; // We don't want debug info removed by anything this general, unless @@ -414,6 +420,49 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, return false; } +static bool +simplifyAndDCEInstruction(Instruction *I, + SmallSetVector<Instruction *, 16> &WorkList, + const DataLayout &DL, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI, TLI)) + WorkList.insert(OpI); + } + + I->eraseFromParent(); + + return true; + } + + if (Value *SimpleV = SimplifyInstruction(I, DL)) { + // Add the users to the worklist. CAREFUL: an instruction can use itself, + // in the case of a phi node. + for (User *U : I->users()) + if (U != I) + WorkList.insert(cast<Instruction>(U)); + + // Replace the instruction with its simplified value. + I->replaceAllUsesWith(SimpleV); + I->eraseFromParent(); + return true; + } + return false; +} + /// SimplifyInstructionsInBlock - Scan the specified basic block and try to /// simplify any instructions in it and recursively delete dead instructions. /// @@ -422,30 +471,34 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool MadeChange = false; + const DataLayout &DL = BB->getModule()->getDataLayout(); #ifndef NDEBUG // In debug builds, ensure that the terminator of the block is never replaced // or deleted by these simplifications. The idea of simplification is that it // cannot introduce new instructions, and there is no way to replace the // terminator of a block without introducing a new instruction. - AssertingVH<Instruction> TerminatorVH(--BB->end()); + AssertingVH<Instruction> TerminatorVH(&BB->back()); #endif - for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) { + SmallSetVector<Instruction *, 16> WorkList; + // Iterate over the original function, only adding insts to the worklist + // if they actually need to be revisited. This avoids having to pre-init + // the worklist with the entire function's worth of instructions. + for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end()); BI != E;) { assert(!BI->isTerminator()); - Instruction *Inst = BI++; + Instruction *I = &*BI; + ++BI; - WeakVH BIHandle(BI); - if (recursivelySimplifyInstruction(Inst, TLI)) { - MadeChange = true; - if (BIHandle != BI) - BI = BB->begin(); - continue; - } + // We're visiting this instruction now, so make sure it's not in the + // worklist from an earlier visit. 
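The new simplifyAndDCEInstruction above replaces the old restart-the-scan-on-every-deletion loop: when an instruction dies, its operands are nulled out one by one, and any operand that just became dead is queued on a SmallSetVector for a later visit instead of forcing a rescan of the whole block. A self-contained sketch of that worklist discipline, with integers standing in for instructions and explicit use/operand sets standing in for use lists (a toy model, not LLVM's types):

#include <iostream>
#include <set>
#include <vector>

// Uses[I] = toy instructions that use I; Ops[I] = operands of I.
std::vector<std::set<int>> Uses = {{1}, {2}, {}};
std::vector<std::set<int>> Ops  = {{}, {0}, {1}};

bool simplifyAndDCE(int I, std::set<int> &WorkList) {
  if (!Uses[I].empty())
    return false;          // still used: nothing to do in this toy
  for (int Op : Ops[I]) {  // "null out" operands as we delete I
    Uses[Op].erase(I);
    if (Uses[Op].empty())
      WorkList.insert(Op); // operand just died: revisit it later
  }
  std::cout << "erased " << I << "\n";
  return true;
}

int main() {
  std::set<int> WorkList;
  for (int I = 0; I <= 2; ++I)  // single pass over the "block"
    if (!WorkList.count(I))     // skip if already queued for a revisit
      simplifyAndDCE(I, WorkList);
  while (!WorkList.empty()) {   // drain: revisit only what actually changed
    int I = *WorkList.begin();
    WorkList.erase(WorkList.begin());
    simplifyAndDCE(I, WorkList);
  }
  return 0;
}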
+ if (!WorkList.count(I)) + MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); + } - MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); - if (BIHandle != BI) - BI = BB->begin(); + while (!WorkList.empty()) { + Instruction *I = WorkList.pop_back_val(); + MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); } return MadeChange; } @@ -808,7 +861,8 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { // Copy over any phi, debug or lifetime instruction. BB->getTerminator()->eraseFromParent(); - Succ->getInstList().splice(Succ->getFirstNonPHI(), BB->getInstList()); + Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(), + BB->getInstList()); } else { while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) { // We explicitly check for such uses in CanPropagatePredecessorsForPHIs. @@ -1017,8 +1071,13 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (LdStHasDebugValue(DIVar, LI)) return true; - Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, DIExpr, - DDI->getDebugLoc(), LI); + // We are now tracking the loaded value instead of the address. In the + // future if multi-location support is added to the IR, it might be + // preferable to keep tracking both the loaded value and the original + // address in case the alloca can not be elided. + Instruction *DbgValue = Builder.insertDbgValueIntrinsic( + LI, 0, DIVar, DIExpr, DDI->getDebugLoc(), (Instruction *)nullptr); + DbgValue->insertAfter(LI); return true; } @@ -1034,8 +1093,8 @@ bool llvm::LowerDbgDeclare(Function &F) { DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<DbgDeclareInst *, 4> Dbgs; for (auto &FI : F) - for (BasicBlock::iterator BI : FI) - if (auto DDI = dyn_cast<DbgDeclareInst>(BI)) + for (Instruction &BI : FI) + if (auto DDI = dyn_cast<DbgDeclareInst>(&BI)) Dbgs.push_back(DDI); if (Dbgs.empty()) @@ -1060,9 +1119,13 @@ bool llvm::LowerDbgDeclare(Function &F) { // This is a call by-value or some other instruction that // takes a pointer to the variable. Insert a *value* // intrinsic that describes the alloca. + SmallVector<uint64_t, 1> NewDIExpr; + auto *DIExpr = DDI->getExpression(); + NewDIExpr.push_back(dwarf::DW_OP_deref); + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(), - DDI->getExpression(), DDI->getDebugLoc(), - CI); + DIB.createExpression(NewDIExpr), + DDI->getDebugLoc(), CI); } DDI->eraseFromParent(); } @@ -1082,9 +1145,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return nullptr; } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder, bool Deref) { - DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); +bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address); if (!DDI) return false; DebugLoc Loc = DDI->getDebugLoc(); @@ -1092,29 +1156,40 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); - if (Deref) { + if (Deref || Offset) { // Create a copy of the original DIDescriptor for user variable, prepending // "deref" operation to a list of address elements, as new llvm.dbg.declare // will take a value storing address of the memory for variable, not // alloca itself. 
SmallVector<uint64_t, 4> NewDIExpr; - NewDIExpr.push_back(dwarf::DW_OP_deref); + if (Deref) + NewDIExpr.push_back(dwarf::DW_OP_deref); + if (Offset > 0) { + NewDIExpr.push_back(dwarf::DW_OP_plus); + NewDIExpr.push_back(Offset); + } else if (Offset < 0) { + NewDIExpr.push_back(dwarf::DW_OP_minus); + NewDIExpr.push_back(-Offset); + } if (DIExpr) NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIExpr = Builder.createExpression(NewDIExpr); } - // Insert llvm.dbg.declare in the same basic block as the original alloca, - // and remove old llvm.dbg.declare. - BasicBlock *BB = AI->getParent(); - Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, BB); + // Insert llvm.dbg.declare immediately after the original alloca, and remove + // old llvm.dbg.declare. + Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); DDI->eraseFromParent(); return true; } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. -static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder, bool Deref, int Offset) { + return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, + Deref, Offset); +} + +void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) { BasicBlock *BB = I->getParent(); // Loop over all of the successors, removing BB's entry from any PHI // nodes. @@ -1132,7 +1207,7 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { new UnreachableInst(I->getContext(), I); // All instructions after this are dead. - BasicBlock::iterator BBI = I, BBE = BB->end(); + BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end(); while (BBI != BBE) { if (!BBI->use_empty()) BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); @@ -1142,8 +1217,11 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { /// changeToCall - Convert the specified invoke into a normal call. static void changeToCall(InvokeInst *II) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); + SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); @@ -1162,7 +1240,7 @@ static bool markAliveBlocks(Function &F, SmallPtrSetImpl<BasicBlock*> &Reachable) { SmallVector<BasicBlock*, 128> Worklist; - BasicBlock *BB = F.begin(); + BasicBlock *BB = &F.front(); Worklist.push_back(BB); Reachable.insert(BB); bool Changed = false; @@ -1187,7 +1265,7 @@ static bool markAliveBlocks(Function &F, if (MakeUnreachable) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); + changeToUnreachable(&*BBI, false); Changed = true; break; } @@ -1201,7 +1279,7 @@ static bool markAliveBlocks(Function &F, ++BBI; if (!isa<UnreachableInst>(BBI)) { // Don't insert a call to llvm.trap right before the unreachable. 
- changeToUnreachable(BBI, false); + changeToUnreachable(&*BBI, false); Changed = true; } break; @@ -1253,6 +1331,40 @@ static bool markAliveBlocks(Function &F, return Changed; } +void llvm::removeUnwindEdge(BasicBlock *BB) { + TerminatorInst *TI = BB->getTerminator(); + + if (auto *II = dyn_cast<InvokeInst>(TI)) { + changeToCall(II); + return; + } + + TerminatorInst *NewTI; + BasicBlock *UnwindDest; + + if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) { + NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI); + UnwindDest = CRI->getUnwindDest(); + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) { + auto *NewCatchSwitch = CatchSwitchInst::Create( + CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(), + CatchSwitch->getName(), CatchSwitch); + for (BasicBlock *PadBB : CatchSwitch->handlers()) + NewCatchSwitch->addHandler(PadBB); + + NewTI = NewCatchSwitch; + UnwindDest = CatchSwitch->getUnwindDest(); + } else { + llvm_unreachable("Could not find unwind successor"); + } + + NewTI->takeName(TI); + NewTI->setDebugLoc(TI->getDebugLoc()); + UnwindDest->removePredecessor(BB); + TI->replaceAllUsesWith(NewTI); + TI->eraseFromParent(); +} + /// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false /// otherwise. @@ -1270,17 +1382,18 @@ bool llvm::removeUnreachableBlocks(Function &F) { // Loop over all of the basic blocks that are not reachable, dropping all of // their internal references... for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { - if (Reachable.count(BB)) + if (Reachable.count(&*BB)) continue; - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + for (succ_iterator SI = succ_begin(&*BB), SE = succ_end(&*BB); SI != SE; + ++SI) if (Reachable.count(*SI)) - (*SI)->removePredecessor(BB); + (*SI)->removePredecessor(&*BB); BB->dropAllReferences(); } for (Function::iterator I = ++F.begin(); I != F.end();) - if (!Reachable.count(I)) + if (!Reachable.count(&*I)) I = F.getBasicBlockList().erase(I); else ++I; @@ -1288,9 +1401,10 @@ bool llvm::removeUnreachableBlocks(Function &F) { return true; } -void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs) { +void llvm::combineMetadata(Instruction *K, const Instruction *J, + ArrayRef<unsigned> KnownIDs) { SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; - K->dropUnknownMetadata(KnownIDs); + K->dropUnknownNonDebugMetadata(KnownIDs); K->getAllMetadataOtherThanDebugLoc(Metadata); for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { unsigned Kind = Metadata[i].first; @@ -1326,8 +1440,29 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign // Only set the !nonnull if it is present in both instructions. K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_invariant_group: + // Preserve !invariant.group in K. + break; + case LLVMContext::MD_align: + K->setMetadata(Kind, + MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); + break; + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + K->setMetadata(Kind, + MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); + break; } } + // Set !invariant.group from J if J has it. If both instructions have it + // then we will just pick it from J - even when they are different. + // Also make sure that K is load or store - f.e. 
combining bitcast with load + // could produce bitcast with invariant.group metadata, which is invalid. + // FIXME: we should try to preserve both invariant.group md if they are + // different, but right now instruction can only have one invariant.group. + if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group)) + if (isa<LoadInst>(K) || isa<StoreInst>(K)) + K->setMetadata(LLVMContext::MD_invariant_group, JMD); } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, @@ -1349,3 +1484,40 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, } return Count; } + +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const BasicBlock *BB) { + assert(From->getType() == To->getType()); + + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE;) { + Use &U = *UI++; + auto *I = cast<Instruction>(U.getUser()); + if (DT.dominates(BB, I->getParent())) { + U.set(To); + DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " + << *To << " in " << *U << "\n"); + ++Count; + } + } + return Count; +} + +bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { + if (isa<IntrinsicInst>(CS.getInstruction())) + // Most LLVM intrinsics are things which can never take a safepoint. + // As a result, we don't need to have the stack parsable at the + // callsite. This is a highly useful optimization since intrinsic + // calls are fairly prevalent, particularly in debug builds. + return true; + + // Check if the function is specifically marked as a gc leaf function. + // + // TODO: we should be checking the attributes on the call site as well. + if (const Function *F = CS.getCalledFunction()) + return F->hasFnAttribute("gc-leaf-function"); + + return false; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 5c98043e4632..1fa469595d16 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -44,11 +44,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -78,7 +81,7 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, SmallVectorImpl<BasicBlock *> &SplitPreds, Loop *L) { // Check to see if NewBB is already well placed. - Function::iterator BBI = NewBB; --BBI; + Function::iterator BBI = --NewBB->getIterator(); for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { if (&*BBI == SplitPreds[i]) return; @@ -92,9 +95,8 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, // block that neighbors a BB actually in the loop. 
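The second replaceDominatedUsesWith overload added above rewrites only those uses of From whose enclosing block is dominated by a given block, which is what a pass needs after proving a fact along one CFG edge. A toy version of the same walk, with dominance faked by a precomputed set (all names hypothetical):

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  // Pretend "if.then" dominates itself and "if.then.end" in the CFG.
  std::set<std::string> DominatedByBB = {"if.then", "if.then.end"};
  // Each use of %from records the block its user lives in.
  std::vector<std::string> UseBlocks = {"entry", "if.then", "if.then.end"};
  unsigned Count = 0;
  for (const std::string &UserBB : UseBlocks) {
    if (!DominatedByBB.count(UserBB))
      continue; // DT.dominates(BB, I->getParent()) failed
    ++Count;    // U.set(To) in the real code
    std::cout << "rewrote a use in " << UserBB << "\n";
  }
  std::cout << Count << " uses rewritten\n";
  return 0;
}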
BasicBlock *FoundBB = nullptr; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { - Function::iterator BBI = SplitPreds[i]; - if (++BBI != NewBB->getParent()->end() && - L->contains(BBI)) { + Function::iterator BBI = SplitPreds[i]->getIterator(); + if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) { FoundBB = SplitPreds[i]; break; } @@ -112,17 +114,10 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, /// preheader, this method is called to insert one. This method has two phases: /// preheader insertion and analysis updating. /// -BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { +BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Header = L->getHeader(); - // Get analyses that we try to update. - auto *AA = PP->getAnalysisIfAvailable<AliasAnalysis>(); - auto *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *LIWP = PP->getAnalysisIfAvailable<LoopInfoWrapperPass>(); - auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - // Compute the set of predecessors of the loop that are not in the loop. SmallVector<BasicBlock*, 8> OutsideBlocks; for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); @@ -141,8 +136,10 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // Split out the loop pre-header. BasicBlock *PreheaderBB; - PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - AA, DT, LI, PreserveLCSSA); + PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT, + LI, PreserveLCSSA); + if (!PreheaderBB) + return nullptr; DEBUG(dbgs() << "LoopSimplify: Creating pre-header " << PreheaderBB->getName() << "\n"); @@ -159,8 +156,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { /// This method is used to split exit blocks that have predecessors outside of /// the loop. static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, Pass *PP) { + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { SmallVector<BasicBlock*, 8> LoopBlocks; for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { BasicBlock *P = *I; @@ -175,10 +172,10 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); BasicBlock *NewExitBB = nullptr; - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", AA, DT, - LI, PreserveLCSSA); + NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI, + PreserveLCSSA); + if (!NewExitBB) + return nullptr; DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " << NewExitBB->getName() << "\n"); @@ -206,8 +203,7 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, /// \brief The first part of loop-nestification is to find a PHI node that tells /// us how to partition the loops. 
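A pattern worth noting in InsertPreheaderForLoop above: instead of accepting a Pass* and fishing analyses out of it with getAnalysisIfAvailable, the utility now takes the DominatorTree, LoopInfo, and the PreserveLCSSA flag directly, so callers state exactly what must be kept up to date. A shape-only sketch of that design move, with stub types standing in for the real analyses:

#include <iostream>

// Stub types; the real ones are llvm::DominatorTree and llvm::LoopInfo.
struct DominatorTree {};
struct LoopInfo {};

// After the refactor: the utility names the analyses it keeps consistent,
// rather than querying a pass manager internally.
bool insertPreheader(DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
  (void)DT;
  (void)LI; // a real implementation would update both while splitting blocks
  std::cout << "preheader inserted, LCSSA preserved: "
            << (PreserveLCSSA ? "yes" : "no") << "\n";
  return true;
}

int main() {
  DominatorTree DT;
  LoopInfo LI;
  return insertPreheader(&DT, &LI, /*PreserveLCSSA=*/true) ? 0 : 1;
}

The benefit is visible at every call site in this diff: rewriteLoopExitBlock, separateNestedLoop, and the unroller all thread the same explicit arguments through.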
-static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, - DominatorTree *DT, +static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT, AssumptionCache *AC) { const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { @@ -216,7 +212,6 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); - if (AA) AA->deleteValue(PN); PN->eraseFromParent(); continue; } @@ -251,18 +246,18 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, /// created. /// static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, ScalarEvolution *SE, Pass *PP, + DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, bool PreserveLCSSA, AssumptionCache *AC) { // Don't try to separate loops without a preheader. if (!Preheader) return nullptr; // The header is not a landing pad; preheader insertion should ensure this. - assert(!L->getHeader()->isLandingPad() && - "Can't insert backedge to landing pad"); + BasicBlock *Header = L->getHeader(); + assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); - PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AC); + PHINode *PN = findPHIToPartitionLoops(L, DT, AC); if (!PN) return nullptr; // No known way to partition. // Pull out all predecessors that have varying values in the loop. This @@ -286,11 +281,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (SE) SE->forgetLoop(L); - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - - BasicBlock *Header = L->getHeader(); BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", - AA, DT, LI, PreserveLCSSA); + DT, LI, PreserveLCSSA); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. @@ -357,7 +349,6 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, /// and have that block branch to the loop header. This ensures that loops /// have exactly one backedge. static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI) { assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!"); @@ -369,8 +360,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, if (!Preheader) return nullptr; - // The header is not a landing pad; preheader insertion should ensure this. - assert(!Header->isLandingPad() && "Can't insert backedge to landing pad"); + // The header is not an EH pad; preheader insertion should ensure this. + assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); // Figure out which basic blocks contain back-edges to the loop header. std::vector<BasicBlock*> BackedgeBlocks; @@ -394,7 +385,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, << BEBlock->getName() << "\n"); // Move the new backedge block to right after the last backedge block. 
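insertUniqueBackedgeBlock, updated above, canonicalizes a loop with several latch blocks by funneling every backedge through one new block and giving each header phi a single merged incoming value. A small sketch that prints the resulting phi shapes (toy values only, emitted as IR-flavored text from plain C++):

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Three latches branch back to the header before canonicalization.
  std::vector<std::string> BackedgeBlocks = {"latch1", "latch2", "latch3"};
  // The header phi previously had one incoming value per backedge:
  std::vector<std::string> Incoming = {"%a", "%b", "%b"};
  // A phi in the new merged backedge block collects them...
  std::cout << "backedge: %merge = phi ";
  for (unsigned i = 0; i < BackedgeBlocks.size(); ++i)
    std::cout << "[ " << Incoming[i] << ", %" << BackedgeBlocks[i] << " ] ";
  std::cout << "\n";
  // ...so the header phi shrinks to exactly two entries.
  std::cout << "header:   %p = phi [ %init, %preheader ], "
               "[ %merge, %backedge ]\n";
  return 0;
}

When all merged incoming values turn out identical, the hunk above additionally short-circuits: the new phi is replaced by that unique value and erased.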
- Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos; + Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator(); F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock); // Now that the block has been inserted into the function, create PHI nodes in @@ -443,7 +434,6 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // eliminate the PHI Node. if (HasUniqueIncomingValue) { NewPN->replaceAllUsesWith(UniqueValue); - if (AA) AA->deleteValue(NewPN); BEBlock->getInstList().erase(NewPN); } } @@ -470,15 +460,10 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, } /// \brief Simplify one loop and queue further loops for simplification. -/// -/// FIXME: Currently this accepts both lots of analyses that it uses and a raw -/// Pass pointer. The Pass pointer is used by numerous utilities to update -/// specific analyses. Rather than a pass it would be much cleaner and more -/// explicit if they accepted the analysis directly and then updated it. static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, Pass *PP, - AssumptionCache *AC) { + DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, AssumptionCache *AC, + bool PreserveLCSSA) { bool Changed = false; ReprocessLoop: @@ -544,7 +529,7 @@ ReprocessLoop: // Does the loop already have a preheader? If so, don't insert one. BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { - Preheader = InsertPreheaderForLoop(L, PP); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (Preheader) { ++NumInserted; Changed = true; @@ -568,7 +553,7 @@ ReprocessLoop: // Must be exactly this loop: no subloops, parent loops, or non-loop preds // allowed. if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, AA, DT, LI, PP)) { + if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) { ++NumInserted; Changed = true; } @@ -585,7 +570,7 @@ ReprocessLoop: // common backedge instead. if (L->getNumBackEdges() < 8) { if (Loop *OuterL = - separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP, AC)) { + separateNestedLoop(L, Preheader, DT, LI, SE, PreserveLCSSA, AC)) { ++NumNested; // Enqueue the outer loop as it should be processed next in our // depth-first nest walk. @@ -602,7 +587,7 @@ ReprocessLoop: // If we either couldn't, or didn't want to, identify nesting of the loops, // insert a new block that all backedges target, then make it jump to the // loop header. - LoopLatch = insertUniqueBackedgeBlock(L, Preheader, AA, DT, LI); + LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI); if (LoopLatch) { ++NumInserted; Changed = true; @@ -618,7 +603,6 @@ ReprocessLoop: for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast<PHINode>(I++)); ) if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { - if (AA) AA->deleteValue(PN); if (SE) SE->forgetValue(PN); PN->replaceAllUsesWith(V); PN->eraseFromParent(); @@ -654,7 +638,7 @@ ReprocessLoop: bool AllInvariant = true; bool AnyInvariant = false; for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // Skip debug info intrinsics. 
if (isa<DbgInfoIntrinsic>(Inst)) continue; @@ -716,9 +700,9 @@ ReprocessLoop: return Changed; } -bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, - AliasAnalysis *AA, ScalarEvolution *SE, - AssumptionCache *AC) { +bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, AssumptionCache *AC, + bool PreserveLCSSA) { bool Changed = false; // Worklist maintains our depth-first queue of loops in this nest to process. @@ -734,8 +718,8 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, } while (!Worklist.empty()) - Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, - SE, PP, AC); + Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE, + AC, PreserveLCSSA); return Changed; } @@ -747,9 +731,6 @@ namespace { initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); } - // AA - If we have an alias analysis object to update, this is it, otherwise - // this is null. - AliasAnalysis *AA; DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; @@ -767,8 +748,11 @@ namespace { AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addPreserved<DependenceAnalysis>(); AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. } @@ -784,6 +768,9 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) @@ -796,15 +783,16 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; - AA = getAnalysisIfAvailable<AliasAnalysis>(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + SE = SEWP ? &SEWP->getSE() : nullptr; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, AC); + Changed |= simplifyLoop(*I, DT, LI, SE, AC, PreserveLCSSA); return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 1dbce4746835..2499b88741fe 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -73,7 +73,7 @@ static inline void RemapInstruction(Instruction *I, /// of loops that have already been forgotten to prevent redundant, expensive /// calls to ScalarEvolution::forgetLoop. Returns the new combined block. 
static BasicBlock * -FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, +FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE, SmallPtrSetImpl<Loop *> &ForgottenLoops) { // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and @@ -109,12 +109,10 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, // Erase basic block from the function... // ScalarEvolution holds references to loop exit blocks. - if (LPM) { - if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { - if (Loop *L = LI->getLoopFor(BB)) { - if (ForgottenLoops.insert(L).second) - SE->forgetLoop(L); - } + if (SE) { + if (Loop *L = LI->getLoopFor(BB)) { + if (ForgottenLoops.insert(L).second) + SE->forgetLoop(L); } } LI->removeBlock(BB); @@ -155,15 +153,13 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, /// /// The LoopInfo Analysis that is passed will be kept consistent. /// -/// If a LoopPassManager is passed in, and the loop is fully removed, it will be -/// removed from the LoopPassManager as well. LPM can also be NULL. -/// -/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are -/// available from the Pass it must also preserve those analyses. +/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and +/// DominatorTree if they are non-null. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, bool AllowExpensiveTripCount, - unsigned TripMultiple, LoopInfo *LI, Pass *PP, - LPPassManager *LPM, AssumptionCache *AC) { + unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, AssumptionCache *AC, + bool PreserveLCSSA) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -220,6 +216,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; + SmallVector<BasicBlock *, 4> ExitBlocks; + L->getExitBlocks(ExitBlocks); + Loop *ParentL = L->getParentLoop(); + bool AllExitsAreInsideParentLoop = !ParentL || + std::all_of(ExitBlocks.begin(), ExitBlocks.end(), + [&](BasicBlock *BB) { return ParentL->contains(BB); }); // We assume a run-time trip count if the compiler cannot // figure out the loop trip count and the unroll-runtime @@ -227,13 +229,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); if (RuntimeTripCount && - !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, LPM)) + !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT, + PreserveLCSSA)) return false; // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - ScalarEvolution *SE = - PP ? PP->getAnalysisIfAvailable<ScalarEvolution>() : nullptr; if (SE) SE->forgetLoop(L); @@ -392,7 +393,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, for (unsigned i = 0; i < NewBlocks.size(); ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - ::RemapInstruction(I, LastValueMap); + ::RemapInstruction(&*I, LastValueMap); } // Loop over the PHI nodes in the original block, setting incoming values. 
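The AllExitsAreInsideParentLoop computation introduced above is a straightforward std::all_of over the loop's exit blocks; it later decides whether a completely unrolled loop can rely on the parent loop for LCSSA reconstruction. The same idiom on toy data, with a set standing in for ParentL->contains:

#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  std::set<std::string> ParentContains = {"exit1", "exit2"}; // ParentL's blocks
  std::vector<std::string> ExitBlocks = {"exit1", "exit2"};
  bool AllExitsAreInsideParentLoop =
      std::all_of(ExitBlocks.begin(), ExitBlocks.end(),
                  [&](const std::string &BB) {
                    return ParentContains.count(BB) != 0; // ParentL->contains(BB)
                  });
  std::cout << (AllExitsAreInsideParentLoop
                    ? "every exit stays in the parent loop\n"
                    : "some exit escapes the parent loop\n");
  return 0;
}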
@@ -432,8 +433,9 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // For a complete unroll, make the last iteration end with a branch // to the exit block. - if (CompletelyUnroll && j == 0) { - Dest = LoopExit; + if (CompletelyUnroll) { + if (j == 0) + Dest = LoopExit; NeedConditional = false; } @@ -473,7 +475,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); if (Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); - if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM, + if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops)) std::replace(Latches.begin(), Latches.end(), Dest, Fold); } @@ -483,29 +485,24 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // whole function's cache. AC->clear(); - DominatorTree *DT = nullptr; - if (PP) { - // FIXME: Reconstruct dom info, because it is not preserved properly. - // Incrementally updating domtree after loop unrolling would be easy. - if (DominatorTreeWrapperPass *DTWP = - PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DT = &DTWP->getDomTree(); - DT->recalculate(*L->getHeader()->getParent()); - } - - // Simplify any new induction variables in the partially unrolled loop. - if (SE && !CompletelyUnroll) { - SmallVector<WeakVH, 16> DeadInsts; - simplifyLoopIVs(L, SE, LPM, DeadInsts); - - // Aggressively clean up dead instructions that simplifyLoopIVs already - // identified. Any remaining should be cleaned up below. - while (!DeadInsts.empty()) - if (Instruction *Inst = - dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); - } + // FIXME: Reconstruct dom info, because it is not preserved properly. + // Incrementally updating domtree after loop unrolling would be easy. + if (DT) + DT->recalculate(*L->getHeader()->getParent()); + + // Simplify any new induction variables in the partially unrolled loop. + if (SE && !CompletelyUnroll) { + SmallVector<WeakVH, 16> DeadInsts; + simplifyLoopIVs(L, SE, DT, LI, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); } + // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. @@ -514,7 +511,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), BBE = NewLoopBlocks.end(); BB != BBE; ++BB) for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; if (isInstructionTriviallyDead(Inst)) (*BB)->getInstList().erase(Inst); @@ -529,29 +526,33 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, ++NumUnrolled; Loop *OuterL = L->getParentLoop(); - // Remove the loop from the LoopPassManager if it's completely removed. - if (CompletelyUnroll && LPM != nullptr) - LPM->deleteLoopFromQueue(L); + // Update LoopInfo if the loop is completely removed. + if (CompletelyUnroll) + LI->updateUnloop(L); + // If we have a pass and a DominatorTree we should re-simplify impacted loops // to ensure subsequent analyses can rely on this form.
We want to simplify // at least one layer outside of the loop that was unrolled so that any // changes to the parent loop exposed by the unrolling are considered. - if (PP && DT) { + if (DT) { if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { - simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, AC); + bool Simplified = simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA); // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after - // deleteLoopFromQueue updates LoopInfo. + // LoopInfo's been updated by updateUnloop. Loop *LatchLoop = LI->getLoopFor(Latches.back()); if (!OuterL->contains(LatchLoop)) while (OuterL->getParentLoop() != LatchLoop) OuterL = OuterL->getParentLoop(); - formLCSSARecursively(*OuterL, *DT, LI, SE); + if (CompletelyUnroll && (!AllExitsAreInsideParentLoop || Simplified)) + formLCSSARecursively(*OuterL, *DT, LI, SE); + else + assert(OuterL->isLCSSAForm(*DT) && + "Loops should be in LCSSA form after loop-unroll."); } } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index add5432aa276..0d68f18ad0e5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -62,8 +62,8 @@ STATISTIC(NumRuntimeUnrolled, static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &VMap, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, Pass *P) { + ValueToValueMapTy &VMap, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); @@ -127,8 +127,8 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); - SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI, - P->mustPreserveAnalysisID(LCSSAID)); + SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI, + PreserveLCSSA); // Add the branch to the exit block (around the unrolled loop) B.CreateCondBr(BrLoopExit, Exit, NewPH); InsertPt->eraseFromParent(); @@ -150,7 +150,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, Function *F = Header->getParent(); LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); - Loop *NewLoop = 0; + Loop *NewLoop = nullptr; Loop *ParentLoop = L->getParentLoop(); if (!UnrollProlog) { NewLoop = new Loop(); @@ -206,9 +206,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, // Change the incoming values to the ones defined in the preheader or // cloned loop. 
for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *NewPHI = cast<PHINode>(VMap[I]); + PHINode *NewPHI = cast<PHINode>(VMap[&*I]); if (UnrollProlog) { - VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); + VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); } else { unsigned idx = NewPHI->getBasicBlockIndex(Preheader); @@ -279,7 +279,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, bool AllowExpensiveTripCount, LoopInfo *LI, - LPPassManager *LPM) { + ScalarEvolution *SE, DominatorTree *DT, + bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; @@ -291,9 +292,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Use Scalar Evolution to compute the trip count. This allows more // loops to be unrolled than relying on induction var simplification - if (!LPM) - return false; - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (!SE) return false; @@ -308,7 +306,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = - SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; @@ -333,10 +331,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); - // Grab analyses that we preserve. - auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the @@ -397,8 +391,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, VMap, LI); // Insert the cloned blocks into function just before the original loop - F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], - F->end()); + F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(), + NewBlocks[0]->getIterator(), F->end()); // Rewrite the cloned instruction operands to use the values // created when the clone is created. @@ -406,7 +400,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { - RemapInstruction(I, VMap, + RemapInstruction(&*I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } @@ -414,8 +408,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Connect the prolog code to the original loop and update the // PHI functions. 
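The arithmetic behind the runtime prolog being wired up here: with unroll factor Count and a trip count only known at run time, the prolog executes TripCount % Count iterations so that the unrolled body runs a whole multiple of Count, and TripCount itself is the SCEV backedge-taken count plus one, exactly as the "Add 1" comment above notes. A plain-integer illustration:

#include <cstdint>
#include <iostream>

int main() {
  uint64_t BECount = 10;            // backedge-taken count computed by SCEV
  uint64_t TripCount = BECount + 1; // "+1": the first iteration takes no backedge
  uint64_t Count = 4;               // unroll factor

  uint64_t PrologIters = TripCount % Count;         // 11 % 4 == 3
  uint64_t UnrolledIters = TripCount - PrologIters; // 8, a multiple of Count

  std::cout << "prolog runs " << PrologIters << " iterations, unrolled body "
            << UnrolledIters / Count << " times at factor " << Count << "\n";
  return 0;
}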
BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); - ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, - /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); + ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI, + PreserveLCSSA); NumRuntimeUnrolled++; return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp index 5cbde94a98ed..e03880526bfa 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -12,13 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -34,6 +34,124 @@ bool RecurrenceDescriptor::areAllUsesIn(Instruction *I, return true; } +bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurrenceKind Kind) { + switch (Kind) { + default: + break; + case RK_IntegerAdd: + case RK_IntegerMult: + case RK_IntegerOr: + case RK_IntegerAnd: + case RK_IntegerXor: + case RK_IntegerMinMax: + return true; + } + return false; +} + +bool RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind Kind) { + return (Kind != RK_NoRecurrence) && !isIntegerRecurrenceKind(Kind); +} + +bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) { + switch (Kind) { + default: + break; + case RK_IntegerAdd: + case RK_IntegerMult: + case RK_FloatAdd: + case RK_FloatMult: + return true; + } + return false; +} + +Instruction * +RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, + SmallPtrSetImpl<Instruction *> &Visited, + SmallPtrSetImpl<Instruction *> &CI) { + if (!Phi->hasOneUse()) + return Phi; + + const APInt *M = nullptr; + Instruction *I, *J = cast<Instruction>(Phi->use_begin()->getUser()); + + // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT + // with a new integer type of the corresponding bit width. + if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)), + m_And(m_APInt(M), m_Instruction(I))))) { + int32_t Bits = (*M + 1).exactLogBase2(); + if (Bits > 0) { + RT = IntegerType::get(Phi->getContext(), Bits); + Visited.insert(Phi); + CI.insert(J); + return J; + } + } + return Phi; +} + +bool RecurrenceDescriptor::getSourceExtensionKind( + Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned, + SmallPtrSetImpl<Instruction *> &Visited, + SmallPtrSetImpl<Instruction *> &CI) { + + SmallVector<Instruction *, 8> Worklist; + bool FoundOneOperand = false; + unsigned DstSize = RT->getPrimitiveSizeInBits(); + Worklist.push_back(Exit); + + // Traverse the instructions in the reduction expression, beginning with the + // exit value. + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (Use &U : I->operands()) { + + // Terminate the traversal if the operand is not an instruction, or we + // reach the starting value. + Instruction *J = dyn_cast<Instruction>(U.get()); + if (!J || J == Start) + continue; + + // Otherwise, investigate the operation if it is also in the expression. 
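lookThroughAnd above recognizes the pattern InstCombine leaves behind after type promotion, phi & (2^n - 1), and recovers the original width n via exactLogBase2 of mask + 1. The same check written with ordinary unsigned arithmetic:

#include <cstdint>
#include <iostream>

// Returns n if Mask == 2^n - 1 with n > 0, otherwise -1.
int maskBitWidth(uint64_t Mask) {
  uint64_t MP1 = Mask + 1;
  if (MP1 == 0 || (MP1 & (MP1 - 1)) != 0) // Mask+1 must be a power of two
    return -1;
  int N = 0;
  while (MP1 >>= 1)
    ++N;
  return N > 0 ? N : -1;
}

int main() {
  std::cout << maskBitWidth(0xFF) << "\n";   // 8: an i8 kept in a wider register
  std::cout << maskBitWidth(0x7FFF) << "\n"; // 15
  std::cout << maskBitWidth(0xF0) << "\n";   // -1: not a low-bit mask
  return 0;
}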
+ if (Visited.count(J)) { + Worklist.push_back(J); + continue; + } + + // If the operand is not in Visited, it is not a reduction operation, but + // it does feed into one. Make sure it is either a single-use sign- or + // zero-extend instruction. + CastInst *Cast = dyn_cast<CastInst>(J); + bool IsSExtInst = isa<SExtInst>(J); + if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst)) + return false; + + // Ensure the source type of the extend is no larger than the reduction + // type. It is not necessary for the types to be identical. + unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); + if (SrcSize > DstSize) + return false; + + // Furthermore, ensure that all such extends are of the same kind. + if (FoundOneOperand) { + if (IsSigned != IsSExtInst) + return false; + } else { + FoundOneOperand = true; + IsSigned = IsSExtInst; + } + + // Lastly, if the source type of the extend matches the reduction type, + // add the extend to CI so that we can avoid accounting for it in the + // cost model. + if (SrcSize == DstSize) + CI.insert(Cast); + } + } + return true; +} + bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, RecurrenceDescriptor &RedDes) { @@ -68,10 +186,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, unsigned NumCmpSelectPatternInst = 0; InstDesc ReduxDesc(false, nullptr); + // Data used for determining if the recurrence has been type-promoted. + Type *RecurrenceType = Phi->getType(); + SmallPtrSet<Instruction *, 4> CastInsts; + Instruction *Start = Phi; + bool IsSigned = false; + SmallPtrSet<Instruction *, 8> VisitedInsts; SmallVector<Instruction *, 8> Worklist; - Worklist.push_back(Phi); - VisitedInsts.insert(Phi); + + // Return early if the recurrence kind does not match the type of Phi. If the + // recurrence kind is arithmetic, we attempt to look through AND operations + // resulting from the type promotion performed by InstCombine. Vector + // operations are not limited to the legal integer widths, so we may be able + // to evaluate the reduction in the narrower width. + if (RecurrenceType->isFloatingPointTy()) { + if (!isFloatingPointRecurrenceKind(Kind)) + return false; + } else { + if (!isIntegerRecurrenceKind(Kind)) + return false; + if (isArithmeticRecurrenceKind(Kind)) + Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts); + } + + Worklist.push_back(Start); + VisitedInsts.insert(Start); // A value in the reduction can be used: // - By the reduction: @@ -110,10 +250,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) return false; - // Any reduction instruction must be of one of the allowed kinds. - ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); - if (!ReduxDesc.isRecurrence()) - return false; + // Any reduction instruction must be of one of the allowed kinds. We ignore + // the starting value (the Phi or an AND instruction if the Phi has been + // type-promoted). + if (Cur != Start) { + ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); + if (!ReduxDesc.isRecurrence()) + return false; + } // A reduction operation must only have one use of the reduction value. if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && @@ -131,7 +275,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, ++NumCmpSelectPatternInst; // Check whether we found a reduction operator. 
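getSourceExtensionKind, continued above, accepts the narrow-width evaluation only if every extend feeding the reduction has the same signedness and a source no wider than the recovered recurrence type. A compact sketch of that agreement check (toy records, hypothetical names):

#include <iostream>
#include <vector>

struct Ext {
  bool IsSigned;    // sext rather than zext
  unsigned SrcBits; // width of the extend's source type
};

bool allExtendsAgree(const std::vector<Ext> &Exts, unsigned DstBits,
                     bool &IsSigned) {
  bool FoundOne = false;
  for (const Ext &E : Exts) {
    if (E.SrcBits > DstBits)
      return false; // source wider than the reduction type
    if (FoundOne && E.IsSigned != IsSigned)
      return false; // mixed sext/zext: give up
    FoundOne = true;
    IsSigned = E.IsSigned;
  }
  return true;
}

int main() {
  bool IsSigned = false;
  std::vector<Ext> Ok  = {{true, 8}, {true, 8}};
  std::vector<Ext> Bad = {{true, 8}, {false, 8}};
  std::cout << allExtendsAgree(Ok, 8, IsSigned)
            << " (signed=" << IsSigned << ")\n"; // 1 (signed=1)
  std::cout << allExtendsAgree(Bad, 8, IsSigned) << "\n"; // 0
  return 0;
}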
- FoundReduxOp |= !IsAPhi; + FoundReduxOp |= !IsAPhi && Cur != Start; // Process users of current instruction. Push non-PHI nodes after PHI nodes // onto the stack. This way we are going to have seen all inputs to PHI @@ -193,6 +337,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; + // If we think Phi may have been type-promoted, we also need to ensure that + // all source operands of the reduction are either SExtInsts or ZExtInsts. If + // so, we will be able to evaluate the reduction in the narrower bit width. + if (Start != Phi) + if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, + IsSigned, VisitedInsts, CastInsts)) + return false; + // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. @@ -200,9 +352,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.getMinMaxKind()); - + RecurrenceDescriptor RD( + RdxStart, ExitInstruction, Kind, ReduxDesc.getMinMaxKind(), + ReduxDesc.getUnsafeAlgebraInst(), RecurrenceType, IsSigned, CastInsts); RedDes = RD; return true; @@ -263,14 +415,14 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, InstDesc &Prev, bool HasFunNoNaNAttr) { bool FP = I->getType()->isFloatingPointTy(); - bool FastMath = FP && I->hasUnsafeAlgebra(); + Instruction *UAI = Prev.getUnsafeAlgebraInst(); + if (!UAI && FP && !I->hasUnsafeAlgebra()) + UAI = I; // Found an unsafe (unvectorizable) algebra instruction. + switch (I->getOpcode()) { default: return InstDesc(false, I); case Instruction::PHI: - if (FP && - (Kind != RK_FloatMult && Kind != RK_FloatAdd && Kind != RK_FloatMinMax)) - return InstDesc(false, I); return InstDesc(I, Prev.getMinMaxKind()); case Instruction::Sub: case Instruction::Add: @@ -284,10 +436,10 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, case Instruction::Xor: return InstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: - return InstDesc(Kind == RK_FloatMult && FastMath, I); + return InstDesc(Kind == RK_FloatMult, I, UAI); case Instruction::FSub: case Instruction::FAdd: - return InstDesc(Kind == RK_FloatAdd && FastMath, I); + return InstDesc(Kind == RK_FloatAdd, I, UAI); case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: @@ -442,6 +594,13 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder, break; } + // We only match FP sequences with unsafe algebra, so we can unconditionally // set it on any generated instructions.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder); + FastMathFlags FMF; + FMF.setUnsafeAlgebra(); + Builder.SetFastMathFlags(FMF); + Value *Cmp; if (RK == MRK_FloatMin || RK == MRK_FloatMax) Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); @@ -452,8 +611,54 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder, return Select; } -bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, - ConstantInt *&StepValue) { +InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, + ConstantInt *Step) + : StartValue(Start), IK(K), StepValue(Step) { + assert(IK != IK_NoInduction && "Not an induction"); + assert(StartValue && "StartValue is null"); + assert(StepValue && !StepValue->isZero() && "StepValue is zero"); + assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && + "StartValue is not a pointer for pointer induction"); + assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && + "StartValue is not an integer for integer induction"); + assert(StepValue->getType()->isIntegerTy() && + "StepValue is not an integer"); +} + +int InductionDescriptor::getConsecutiveDirection() const { + if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) + return StepValue->getSExtValue(); + return 0; +} + +Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index) const { + switch (IK) { + case IK_IntInduction: + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (StepValue->isMinusOne()) + return B.CreateSub(StartValue, Index); + if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateAdd(StartValue, Index); + + case IK_PtrInduction: + assert(Index->getType() == StepValue->getType() && + "Index type does not match StepValue type"); + if (StepValue->isMinusOne()) + Index = B.CreateNeg(Index); + else if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateGEP(nullptr, StartValue, Index); + + case IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} + +bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, + InductionDescriptor &D) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer induction variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) @@ -467,6 +672,10 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, return false; } + assert(AR->getLoop()->getHeader() == Phi->getParent() && + "PHI is an AddRec for a different loop?!"); + Value *StartValue = + Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader()); const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); @@ -475,7 +684,7 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, ConstantInt *CV = C->getValue(); if (PhiTy->isIntegerTy()) { - StepValue = CV; + D = InductionDescriptor(StartValue, IK_IntInduction, CV); return true; } @@ -494,6 +703,27 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, int64_t CVSize = CV->getSExtValue(); if (CVSize % Size) return false; - StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + auto *StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + + D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue); return true; } + +/// \brief Returns the instructions that use values defined in the loop.
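For intuition, the integer case of transform above is ordinary affine arithmetic: the induction's value at iteration Index is Start + Index * Step, with the unit-step cases avoiding the multiply exactly as the IRBuilder code does. A scalar sketch of the same computation:

#include <cassert>
#include <cstdint>

// Scalar model of InductionDescriptor::transform for IK_IntInduction.
int64_t inductionAt(int64_t Start, int64_t Step, int64_t Index) {
  assert(Step != 0 && "StepValue is zero");
  if (Step == -1)
    return Start - Index;   // CreateSub path
  if (Step != 1)
    Index = Index * Step;   // CreateMul path
  return Start + Index;     // CreateAdd path
}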
+SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) { + SmallVector<Instruction *, 8> UsedOutside; + + for (auto *Block : L->getBlocks()) + // FIXME: I believe that this could use copy_if if the Inst reference could + // be adapted into a pointer. + for (auto &Inst : *Block) { + auto Users = Inst.users(); + if (std::any_of(Users.begin(), Users.end(), [&](User *U) { + auto *Use = cast<Instruction>(U); + return !L->contains(Use->getParent()); + })) + UsedOutside.push_back(&Inst); + } + + return UsedOutside; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 832079d2cf63..9a2a06cf6891 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -13,43 +13,81 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/Dominators.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" using namespace llvm; LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, - DominatorTree *DT, - const SmallVector<int, 8> *PtrToPartition) - : VersionedLoop(L), NonVersionedLoop(nullptr), - PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) { + DominatorTree *DT, ScalarEvolution *SE, + bool UseLAIChecks) + : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT), + SE(SE) { assert(L->getExitBlock() && "No single exit block"); assert(L->getLoopPreheader() && "No preheader"); + if (UseLAIChecks) { + setAliasChecks(LAI.getRuntimePointerChecking()->getChecks()); + setSCEVChecks(LAI.PSE.getUnionPredicate()); + } } -bool LoopVersioning::needsRuntimeChecks() const { - return LAI.getRuntimePointerChecking()->needsAnyChecking(PtrToPartition); +void LoopVersioning::setAliasChecks( + const SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) { + AliasChecks = std::move(Checks); } -void LoopVersioning::versionLoop(Pass *P) { +void LoopVersioning::setSCEVChecks(SCEVUnionPredicate Check) { + Preds = std::move(Check); +} + +void LoopVersioning::versionLoop( + const SmallVectorImpl<Instruction *> &DefsUsedOutside) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; + Value *SCEVRuntimeCheck; + Value *RuntimeCheck = nullptr; + // Add the memcheck in the original preheader (this is empty initially). - BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader(); + BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader(); std::tie(FirstCheckInst, MemRuntimeCheck) = - LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition); + LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks); assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); + SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), + "scev.check"); + SCEVRuntimeCheck = + Exp.expandCodeForPredicate(&Pred, RuntimeCheckBB->getTerminator()); + auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck); + + // Discard the SCEV runtime check if it is always true. 
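findDefsUsedOutsideOfLoop, added at the start of this hunk, is a straightforward filter: keep each loop instruction that has at least one user in a block the loop does not contain. The same shape in standalone C++, with Def standing in for llvm::Instruction and a block-id list standing in for Loop::contains:

#include <algorithm>
#include <vector>

struct Def {
  std::vector<int> UserBlocks; // block id of each user of this definition
};

std::vector<const Def *>
defsUsedOutside(const std::vector<Def> &Defs,
                const std::vector<int> &LoopBlocks) {
  std::vector<const Def *> UsedOutside;
  for (const Def &D : Defs) {
    bool Escapes = std::any_of(
        D.UserBlocks.begin(), D.UserBlocks.end(), [&](int BB) {
          // "user is outside the loop" <=> its block is not a loop block
          return std::find(LoopBlocks.begin(), LoopBlocks.end(), BB) ==
                 LoopBlocks.end();
        });
    if (Escapes)
      UsedOutside.push_back(&D);
  }
  return UsedOutside;
}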
+ if (CI && CI->isZero()) + SCEVRuntimeCheck = nullptr; + + if (MemRuntimeCheck && SCEVRuntimeCheck) { + RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck, + SCEVRuntimeCheck, "ldist.safe"); + if (auto *I = dyn_cast<Instruction>(RuntimeCheck)) + I->insertBefore(RuntimeCheckBB->getTerminator()); + } else + RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck; + + assert(RuntimeCheck && "called even though we don't need " + "any runtime checks"); + // Rename the block to make the IR more readable. - MemCheckBB->setName(VersionedLoop->getHeader()->getName() + ".lver.memcheck"); + RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() + + ".lver.check"); // Create empty preheader for the loop (and after cloning for the // non-versioned loop). - BasicBlock *PH = SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); + BasicBlock *PH = + SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI); PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); // Clone the loop including the preheader. @@ -58,20 +96,23 @@ void LoopVersioning::versionLoop(Pass *P) { // block is a join between the two loops. SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks; NonVersionedLoop = - cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, ".lver.orig", - LI, DT, NonVersionedLoopBlocks); + cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap, + ".lver.orig", LI, DT, NonVersionedLoopBlocks); remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap); // Insert the conditional branch based on the result of the memchecks. - Instruction *OrigTerm = MemCheckBB->getTerminator(); + Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); BranchInst::Create(NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader(), MemRuntimeCheck, - OrigTerm); + VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the // memchecking block. - DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB); + DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB); + + // Adds the necessary PHI nodes for the versioned loops based on the + // loop-defined values used outside of the loop. + addPHINodes(DefsUsedOutside); } void LoopVersioning::addPHINodes( @@ -94,7 +135,7 @@ void LoopVersioning::addPHINodes( // If not create it. if (!PN) { PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver", - PHIBlock->begin()); + &PHIBlock->front()); for (auto *User : Inst->users()) if (!VersionedLoop->contains(cast<Instruction>(User)->getParent())) User->replaceUsesOfWith(Inst, PN); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp index 66d57b069fe7..b0ad4d5e84a1 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -69,7 +69,7 @@ bool LowerInvoke::runOnFunction(Function &F) { BranchInst::Create(II->getNormalDest(), II); // Remove any PHI node entries from the exception destination. - II->getUnwindDest()->removePredecessor(BB); + II->getUnwindDest()->removePredecessor(&*BB); // Remove the invoke instruction now. 
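The control flow that versionLoop builds here can be read as ordinary code: if either the memory check or the SCEV predicate check fails, execution falls back to the unmodified ".lver.orig" clone, otherwise the optimized copy runs, and addPHINodes merges the loop-defined values afterwards. A sketch of that shape (the function names are placeholders, not LLVM API):

// Mirrors the 'or' created between MemRuntimeCheck and SCEVRuntimeCheck.
inline bool anyCheckFailed(bool MemCheckFailed, bool SCEVCheckFailed) {
  return MemCheckFailed | SCEVCheckFailed;
}

template <typename F, typename G>
void runVersioned(bool MemCheckFailed, bool SCEVCheckFailed,
                  F &&NonVersionedLoop, G &&VersionedLoop) {
  if (anyCheckFailed(MemCheckFailed, SCEVCheckFailed))
    NonVersionedLoop(); // the ".lver.orig" clone, always safe
  else
    VersionedLoop();    // the copy the runtime checks proved safe
}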
BB->getInstList().erase(II); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 4acd988691d2..52beb1542497 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -49,8 +49,7 @@ namespace { return I != Ranges.end() && I->Low <= R.Low; } - /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch - /// instructions. + /// Replace all SwitchInst instructions with chained branch instructions. class LowerSwitch : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid @@ -78,7 +77,7 @@ namespace { typedef std::vector<CaseRange> CaseVector; typedef std::vector<CaseRange>::iterator CaseItr; private: - void processSwitchInst(SwitchInst *SI); + void processSwitchInst(SwitchInst *SI, SmallPtrSetImpl<BasicBlock*> &DeleteList); BasicBlock *switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, @@ -116,21 +115,30 @@ FunctionPass *llvm::createLowerSwitchPass() { bool LowerSwitch::runOnFunction(Function &F) { bool Changed = false; + SmallPtrSet<BasicBlock*, 8> DeleteList; for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { - BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks + BasicBlock *Cur = &*I++; // Advance over block so we don't traverse new blocks + + // If the block is a dead Default block that will be deleted later, don't + // waste time processing it. + if (DeleteList.count(Cur)) + continue; if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) { Changed = true; - processSwitchInst(SI); + processSwitchInst(SI, DeleteList); } } + for (BasicBlock* BB: DeleteList) { + DeleteDeadBlock(BB); + } + return Changed; } -// operator<< - Used for debugging purposes. -// +/// Used for debugging purposes. static raw_ostream& operator<<(raw_ostream &O, const LowerSwitch::CaseVector &C) LLVM_ATTRIBUTE_USED; @@ -147,23 +155,24 @@ static raw_ostream& operator<<(raw_ostream &O, return O << "]"; } -// \brief Update the first occurrence of the "switch statement" BB in the PHI -// node with the "new" BB. The other occurrences will: -// -// 1) Be updated by subsequent calls to this function. Switch statements may -// have more than one outcoming edge into the same BB if they all have the same -// value. When the switch statement is converted these incoming edges are now -// coming from multiple BBs. -// 2) Removed if subsequent incoming values now share the same case, i.e., -// multiple outcome edges are condensed into one. This is necessary to keep the -// number of phi values equal to the number of branches to SuccBB. +/// \brief Update the first occurrence of the "switch statement" BB in the PHI +/// node with the "new" BB. The other occurrences will: +/// +/// 1) Be updated by subsequent calls to this function. Switch statements may +/// have more than one outgoing edge into the same BB if they all have the same +/// value. When the switch statement is converted these incoming edges are now +/// coming from multiple BBs. +/// 2) Be removed if subsequent incoming values now share the same case, i.e., +/// multiple outgoing edges are condensed into one. This is necessary to keep the +/// number of phi values equal to the number of branches to SuccBB.
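The DeleteList introduced above is the usual defer-erase pattern: blocks cannot be destroyed while the function iterator is still walking them, so candidates are parked in a set, skipped if revisited, and reclaimed only after the walk. A compact standalone sketch, with Block standing in for llvm::BasicBlock:

#include <algorithm>
#include <set>
#include <vector>

struct Block { bool HasSwitch = false; };

void lowerAllSwitches(std::vector<Block *> &Blocks) {
  std::set<Block *> DeleteList;
  for (Block *Cur : Blocks) {
    if (DeleteList.count(Cur)) // dead default block: skip, it dies later
      continue;
    if (Cur->HasSwitch) {
      // processSwitchInst(...) would run here; when the old default block
      // loses its last predecessor it is added to DeleteList instead of
      // being deleted immediately, keeping the iteration valid.
    }
  }
  // Safe now that the traversal is done (the real pass calls DeleteDeadBlock).
  Blocks.erase(std::remove_if(Blocks.begin(), Blocks.end(),
                              [&](Block *B) { return DeleteList.count(B) > 0; }),
               Blocks.end());
}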
static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, unsigned NumMergedCases) { - for (BasicBlock::iterator I = SuccBB->begin(), IE = SuccBB->getFirstNonPHI(); + for (BasicBlock::iterator I = SuccBB->begin(), + IE = SuccBB->getFirstNonPHI()->getIterator(); I != IE; ++I) { PHINode *PN = cast<PHINode>(I); - // Only update the first occurence. + // Only update the first occurrence. unsigned Idx = 0, E = PN->getNumIncomingValues(); unsigned LocalNumMergedCases = NumMergedCases; for (; Idx != E; ++Idx) { @@ -173,7 +182,7 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } } - // Remove additional occurences coming from condensed cases and keep the + // Remove additional occurrences coming from condensed cases and keep the // number of incoming values equal to the number of branches to SuccBB. SmallVector<unsigned, 8> Indices; for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx) @@ -188,11 +197,11 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } } -// switchConvert - Convert the switch statement into a binary lookup of -// the case values. The function recursively builds this tree. -// LowerBound and UpperBound are used to keep track of the bounds for Val -// that have already been checked by a block emitted by one of the previous -// calls to switchConvert in the call stack. +/// Convert the switch statement into a binary lookup of the case values. +/// The function recursively builds this tree. LowerBound and UpperBound are +/// used to keep track of the bounds for Val that have already been checked by +/// a block emitted by one of the previous calls to switchConvert in the call +/// stack. BasicBlock * LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, Value *Val, @@ -278,28 +287,24 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, UpperBound, Val, NewNode, OrigBlock, Default, UnreachableRanges); - Function::iterator FI = OrigBlock; - F->getBasicBlockList().insert(++FI, NewNode); + F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode); NewNode->getInstList().push_back(Comp); BranchInst::Create(LBranch, RBranch, Comp, NewNode); return NewNode; } -// newLeafBlock - Create a new leaf block for the binary lookup tree. It -// checks if the switch's value == the case's value. If not, then it -// jumps to the default branch. At this point in the tree, the value -// can't be another valid case value, so the jump to the "default" branch -// is warranted. -// +/// Create a new leaf block for the binary lookup tree. It checks if the +/// switch's value == the case's value. If not, then it jumps to the default +/// branch. At this point in the tree, the value can't be another valid case +/// value, so the jump to the "default" branch is warranted. BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, BasicBlock* OrigBlock, BasicBlock* Default) { Function* F = OrigBlock->getParent(); BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock"); - Function::iterator FI = OrigBlock; - F->getBasicBlockList().insert(++FI, NewLeaf); + F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewLeaf); // Emit comparison ICmpInst* Comp = nullptr; @@ -352,7 +357,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, return NewLeaf; } -// Clusterify - Transform simple list of Cases into list of CaseRange's +/// Transform simple list of Cases into list of CaseRange's. 
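What switchConvert emits as a tree of basic blocks can be written directly as a recursive binary search over case ranges sorted by Low, which is why the lowered switch costs O(log n) comparisons. A runnable model, with successor ids standing in for branch targets:

#include <vector>

struct CaseRange { int Low, High, Succ; };

// Returns the matching case's successor id, or -1 for the default block.
// Call as: dispatch(Cases, 0, static_cast<int>(Cases.size()) - 1, Val).
int dispatch(const std::vector<CaseRange> &Cases, int Lo, int Hi, int Val) {
  if (Lo > Hi)
    return -1; // fell off the tree: default destination
  int Mid = (Lo + Hi) / 2;
  const CaseRange &R = Cases[Mid];
  if (Val < R.Low)
    return dispatch(Cases, Lo, Mid - 1, Val); // left subtree
  if (Val > R.High)
    return dispatch(Cases, Mid + 1, Hi, Val); // right subtree
  return R.Succ; // leaf block: Low <= Val <= High
}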
unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { unsigned numCmps = 0; @@ -394,10 +399,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { return numCmps; } -// processSwitchInst - Replace the specified switch instruction with a sequence -// of chained if-then insts in a balanced binary search. -// -void LowerSwitch::processSwitchInst(SwitchInst *SI) { +/// Replace the specified switch instruction with a sequence of chained if-then +/// insts in a balanced binary search. +void LowerSwitch::processSwitchInst(SwitchInst *SI, + SmallPtrSetImpl<BasicBlock*> &DeleteList) { BasicBlock *CurBlock = SI->getParent(); BasicBlock *OrigBlock = CurBlock; Function *F = CurBlock->getParent(); @@ -424,7 +429,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { std::vector<IntRange> UnreachableRanges; if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) { - // Make the bounds tightly fitted around the case value range, becase we + // Make the bounds tightly fitted around the case value range, because we // know that the value passed to the switch must be exactly one of the case // values. assert(!Cases.empty()); @@ -495,7 +500,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { // Create a new, empty default block so that the new hierarchy of // if-then statements go to this and the PHI nodes are happy. BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default, NewDefault); + F->getBasicBlockList().insert(Default->getIterator(), NewDefault); BranchInst::Create(Default, NewDefault); // If there is an entry in any PHI nodes for the default edge, make sure @@ -518,7 +523,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { BasicBlock *OldDefault = SI->getDefaultDest(); CurBlock->getInstList().erase(SI); - // If the Default block has no more predecessors just remove it. + // If the Default block has no more predecessors just add it to DeleteList. 
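The clustering step is worth seeing in isolation: after sorting, runs of consecutive case values that share a destination collapse into one CaseRange, so the binary tree above needs one bounds check per run instead of one per value. A standalone sketch:

#include <algorithm>
#include <vector>

struct Case { int Value, Succ; };
struct CaseRange { int Low, High, Succ; };

std::vector<CaseRange> clusterify(std::vector<Case> Cases) {
  std::sort(Cases.begin(), Cases.end(),
            [](const Case &A, const Case &B) { return A.Value < B.Value; });
  std::vector<CaseRange> Ranges;
  for (const Case &C : Cases) {
    if (!Ranges.empty() && Ranges.back().High + 1 == C.Value &&
        Ranges.back().Succ == C.Succ)
      Ranges.back().High = C.Value;              // extend the open range
    else
      Ranges.push_back({C.Value, C.Value, C.Succ}); // start a new range
  }
  return Ranges;
}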
if (pred_begin(OldDefault) == pred_end(OldDefault)) - DeleteDeadBlock(OldDefault); + DeleteList.insert(OldDefault); } diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 00cf4e6c01c8..aa1e35ddba02 100644 --- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -63,6 +63,9 @@ bool PromotePass::runOnFunction(Function &F) { BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + bool Changed = false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 395a46bad97b..c999bd008fef 100644 --- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -42,6 +42,24 @@ namespace { } }; + static const char *const metaNames[] = { + // See http://en.wikipedia.org/wiki/Metasyntactic_variable + "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", + "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" + }; + + struct Renamer { + Renamer(unsigned int seed) { + prng.srand(seed); + } + + const char *newName() { + return metaNames[prng.rand() % array_lengthof(metaNames)]; + } + + PRNG prng; + }; + struct MetaRenamer : public ModulePass { static char ID; // Pass identification, replacement for typeid MetaRenamer() : ModulePass(ID) { @@ -53,36 +71,26 @@ namespace { } bool runOnModule(Module &M) override { - static const char *const metaNames[] = { - // See http://en.wikipedia.org/wiki/Metasyntactic_variable - "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", - "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" - }; - // Seed our PRNG with simple additive sum of ModuleID. We're looking to // simply avoid always having the same function names, and we need to // remain deterministic. 
unsigned int randSeed = 0; - for (std::string::const_iterator I = M.getModuleIdentifier().begin(), - E = M.getModuleIdentifier().end(); I != E; ++I) - randSeed += *I; + for (auto C : M.getModuleIdentifier()) + randSeed += C; - PRNG prng; - prng.srand(randSeed); + Renamer renamer(randSeed); // Rename all aliases - for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end(); - AI != AE; ++AI) { + for (auto AI = M.alias_begin(), AE = M.alias_end(); AI != AE; ++AI) { StringRef Name = AI->getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; AI->setName("alias"); } - + // Rename all global variables - for (Module::global_iterator GI = M.global_begin(), GE = M.global_end(); - GI != GE; ++GI) { + for (auto GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) { StringRef Name = GI->getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; @@ -93,40 +101,37 @@ namespace { // Rename all struct types TypeFinder StructTypes; StructTypes.run(M, true); - for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { - StructType *STy = StructTypes[i]; + for (StructType *STy : StructTypes) { if (STy->isLiteral() || STy->getName().empty()) continue; SmallString<128> NameStorage; - STy->setName((Twine("struct.") + metaNames[prng.rand() % - array_lengthof(metaNames)]).toStringRef(NameStorage)); + STy->setName((Twine("struct.") + + renamer.newName()).toStringRef(NameStorage)); } // Rename all functions - for (Module::iterator FI = M.begin(), FE = M.end(); - FI != FE; ++FI) { - StringRef Name = FI->getName(); + for (auto &F : M) { + StringRef Name = F.getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; - FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]); - runOnFunction(*FI); + F.setName(renamer.newName()); + runOnFunction(F); } return true; } bool runOnFunction(Function &F) { - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); - AI != AE; ++AI) + for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) if (!AI->getType()->isVoidTy()) AI->setName("arg"); - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - BB->setName("bb"); + for (auto &BB : F) { + BB.setName("bb"); - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (!I->getType()->isVoidTy()) - I->setName("tmp"); + for (auto &I : BB) + if (!I.getType()->isVoidTy()) + I.setName("tmp"); } return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp index d69a81ec4741..9ec28a3f3d47 100644 --- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -43,9 +43,9 @@ static void appendToGlobalArray(const char *Array, } GVCtor->eraseFromParent(); } else { - // Use a simple two-field struct if there isn't one already. + // Use the new three-field struct if there isn't one already. EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), - nullptr); + IRB.getInt8PtrTy(), nullptr); } // Build a 2 or 3 field global_ctor entry. We don't take a comdat key. 
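The layout change at the end of this hunk is easiest to see as plain structs: llvm.global_ctors entries grow from {i32, void()*} to {i32, void()*, i8*}, the extra pointer letting a constructor be associated with a specific global. Modeled in C++ (field names are illustrative):

#include <cstdint>

using CtorFn = void (*)();

struct GlobalCtorEntry2 {     // old two-field form
  int32_t Priority;
  CtorFn Fn;
};

struct GlobalCtorEntry3 {     // new three-field form
  int32_t Priority;
  CtorFn Fn;
  const void *AssociatedData; // the i8* field; null when unused
};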
@@ -107,7 +107,8 @@ Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) { std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions( Module &M, StringRef CtorName, StringRef InitName, - ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs) { + ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs, + StringRef VersionCheckName) { assert(!InitName.empty() && "Expected init function name"); assert(InitArgTypes.size() == InitArgTypes.size() && "Sanitizer's init function expects different number of arguments"); @@ -122,6 +123,13 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions( AttributeSet())); InitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(InitFunction, InitArgs); + if (!VersionCheckName.empty()) { + Function *VersionCheckFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false), + AttributeSet())); + IRB.CreateCall(VersionCheckFunction, {}); + } return std::make_pair(Ctor, InitFunction); } diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index a87f8504bfb5..c4f9b9f61407 100644 --- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -205,10 +205,9 @@ public: // avoid gratuitus rescans. const BasicBlock *BB = I->getParent(); unsigned InstNo = 0; - for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end(); BBI != E; - ++BBI) - if (isInterestingInstruction(BBI)) - InstNumbers[BBI] = InstNo++; + for (const Instruction &BBI : *BB) + if (isInterestingInstruction(&BBI)) + InstNumbers[&BBI] = InstNo++; It = InstNumbers.find(I); assert(It != InstNumbers.end() && "Didn't insert instruction?"); @@ -402,8 +401,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, // Record debuginfo for the store and remove the declaration's // debuginfo. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB); DDI->eraseFromParent(); LBI.deleteValue(DDI); @@ -425,14 +423,17 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, /// using the Alloca. /// /// If we cannot promote this alloca (because it is read before it is written), -/// return true. This is necessary in cases where, due to control flow, the -/// alloca is potentially undefined on some control flow paths. e.g. code like -/// this is potentially correct: -/// -/// for (...) { if (c) { A = undef; undef = B; } } -/// -/// ... so long as A is not used before undef is set. -static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, +/// return false. This is necessary in cases where, due to control flow, the +/// alloca is undefined only on some control flow paths. e.g. code like +/// this is correct in LLVM IR: +/// // A is an alloca with no stores so far +/// for (...) { +/// int t = *A; +/// if (!first_iteration) +/// use(t); +/// *A = 42; +/// } +static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, AliasSetTracker *AST) { // The trickiest case to handle is when we have large blocks. 
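The module constructor generated by createSanitizerCtorAndInitFunctions therefore makes one or two calls. Written out as direct C++ with stub functions standing in for the runtime's InitName and VersionCheckName symbols (both are placeholders for whatever the sanitizer registers):

#include <iostream>
#include <string>

static void sanitizerInit(int Flags) {           // stands in for InitName
  std::cout << "init(" << Flags << ")\n";
}
static void sanitizerVersionCheck() {            // stands in for VersionCheckName
  std::cout << "version check\n";
}

void moduleCtor(const std::string &VersionCheckName) {
  sanitizerInit(0);              // IRB.CreateCall(InitFunction, InitArgs)
  if (!VersionCheckName.empty()) // extra call emitted only when a name is given
    sanitizerVersionCheck();
}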
Because of this, @@ -467,10 +468,15 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)), less_first()); - - if (I == StoresByIndex.begin()) - // If there is no store before this load, the load takes the undef value. - LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + if (I == StoresByIndex.begin()) { + if (StoresByIndex.empty()) + // If there are no stores, the load takes the undef value. + LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + else + // There is no store before this load, bail out (load may be affected + // by the following stores - see main comment). + return false; + } else // Otherwise, there was a store before this load, the load takes its value. LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0)); @@ -486,8 +492,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, StoreInst *SI = cast<StoreInst>(AI->user_back()); // Record debuginfo for the store before removing it. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, SI, DIB); } SI->eraseFromParent(); @@ -506,6 +511,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, } ++NumLocalPromoted; + return true; } void PromoteMem2Reg::run() { @@ -557,9 +563,8 @@ void PromoteMem2Reg::run() { // If the alloca is only read and written in one basic block, just perform a // linear sweep over the block to eliminate it. - if (Info.OnlyUsedInOneBlock) { - promoteSingleBlockAlloca(AI, Info, LBI, AST); - + if (Info.OnlyUsedInOneBlock && + promoteSingleBlockAlloca(AI, Info, LBI, AST)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); continue; @@ -636,7 +641,7 @@ void PromoteMem2Reg::run() { // and inserting the phi nodes we marked as necessary // std::vector<RenamePassData> RenamePassWorkList; - RenamePassWorkList.emplace_back(F.begin(), nullptr, std::move(Values)); + RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values)); do { RenamePassData RPD; RPD.swap(RenamePassWorkList.back()); @@ -854,7 +859,7 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo, // BasicBlock. PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB), Allocas[AllocaNo]->getName() + "." 
+ Twine(Version++), - BB->begin()); + &BB->front()); ++NumPHIInsert; PhiToAllocaMap[PN] = AllocaNo; @@ -919,7 +924,7 @@ NextIteration: return; for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) { - Instruction *I = II++; // get the instruction, increment iterator + Instruction *I = &*II++; // get the instruction, increment iterator if (LoadInst *LI = dyn_cast<LoadInst>(I)) { AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand()); diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 36781c1189cd..d0932f834cf5 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -43,7 +44,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <map> @@ -73,6 +73,22 @@ static cl::opt<bool> HoistCondStores( "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), cl::desc("Hoist conditional stores if an unconditional store precedes")); +static cl::opt<bool> MergeCondStores( + "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true), + cl::desc("Hoist conditional stores even if an unconditional store does not " + "precede - hoist multiple conditional stores into a single " + "predicated store")); + +static cl::opt<bool> MergeCondStoresAggressively( + "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false), + cl::desc("When merging conditional stores, do so even if the resultant " + "basic blocks are unlikely to be if-converted as a result")); + +static cl::opt<bool> SpeculateOneExpensiveInst( + "speculate-one-expensive-inst", cl::Hidden, cl::init(true), + cl::desc("Allow exactly one expensive instruction to be speculatively " + "executed")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -83,13 +99,13 @@ STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { // The first field contains the value that the switch produces when a certain - // case group is selected, and the second field is a vector containing the cases - // composing the case group. + // case group is selected, and the second field is a vector containing the + // cases composing the case group. typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2> SwitchCaseResultVectorTy; // The first field contains the phi node that generates a result of the switch - // and the second field contains the value generated for a certain case in the switch - // for that PHI. + // and the second field contains the value generated for a certain case in the + // switch for that PHI. typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy; /// ValueEqualityComparisonCase - Represents a case of a switch. 
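The single-block promotion changed in the previous hunk pairs each load with the nearest preceding store via a binary search over stores numbered by block position, and now bails out when stores exist but none precedes the load (the backedge case described in the doc comment). The core of that lookup as a standalone sketch:

#include <algorithm>
#include <iterator>
#include <optional>
#include <vector>

struct Store { unsigned Index; int Value; };

// StoresByIndex must be sorted by Index. Returns the value the load observes,
// or std::nullopt when there is no preceding store; MustBail is set when
// promotion has to give up because a later store could feed the load around
// a loop backedge.
std::optional<int> valueForLoad(const std::vector<Store> &StoresByIndex,
                                unsigned LoadIdx, bool &MustBail) {
  auto I = std::lower_bound(
      StoresByIndex.begin(), StoresByIndex.end(), LoadIdx,
      [](const Store &S, unsigned Idx) { return S.Index < Idx; });
  if (I == StoresByIndex.begin()) {
    MustBail = !StoresByIndex.empty(); // stores exist, but all after the load
    return std::nullopt;               // no store at all: load yields undef
  }
  MustBail = false;
  return std::prev(I)->Value;          // nearest preceding store's operand
}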
@@ -124,6 +140,7 @@ class SimplifyCFGOpt { bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); + bool SimplifyCleanupReturn(CleanupReturnInst *RI); bool SimplifyUnreachable(UnreachableInst *UI); bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); bool SimplifyIndirectBr(IndirectBrInst *IBI); @@ -226,6 +243,7 @@ static unsigned ComputeSpeculationCost(const User *I, "Instruction is not safe to speculatively execute!"); return TTI.getUserCost(I); } + /// If we have a merge point of an "if condition" as accepted above, /// return true if the specified value dominates the block. We /// don't handle the true generality of domination here, just a special case @@ -246,7 +264,8 @@ static unsigned ComputeSpeculationCost(const User *I, static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl<Instruction*> *AggressiveInsts, unsigned &CostRemaining, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + unsigned Depth = 0) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs @@ -284,15 +303,24 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, unsigned Cost = ComputeSpeculationCost(I, TTI); - if (Cost > CostRemaining) + // Allow exactly one instruction to be speculated regardless of its cost + // (as long as it is safe to do so). + // This is intended to flatten the CFG even if the instruction is a division + // or other expensive operation. The speculation of an expensive instruction + // is expected to be undone in CodeGenPrepare if the speculation has not + // enabled further IR optimizations. + if (Cost > CostRemaining && + (!SpeculateOneExpensiveInst || !AggressiveInsts->empty() || Depth > 0)) return false; - CostRemaining -= Cost; + // Avoid unsigned wrap. + CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost; // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI)) + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, + Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. AggressiveInsts->insert(I); @@ -970,8 +998,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, at this point, we know which new successor Pred will get. Make // sure we update the number of entries in the PHI nodes for these // successors. - for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i) - AddPredecessorToBlock(NewSuccessors[i], Pred, BB); + for (BasicBlock *NewSuccessor : NewSuccessors) + AddPredecessorToBlock(NewSuccessor, Pred, BB); Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. 
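The new budget rule in DominatesMergePoint can be isolated: an instruction may be speculated when it fits the remaining cost, except that the very first candidate (nothing accepted yet, depth zero) may exceed the budget when speculate-one-expensive-inst is enabled. A sketch of just that decision, with costs as plain integers rather than TTI user costs:

bool mayHoist(unsigned Cost, unsigned &CostRemaining,
              bool AnyInstsAlready, unsigned Depth,
              bool SpeculateOneExpensive = true) {
  if (Cost > CostRemaining &&
      (!SpeculateOneExpensive || AnyInstsAlready || Depth > 0))
    return false;
  // Saturating subtraction, as in the change above: avoid unsigned wrap.
  CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost;
  return true;
}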
@@ -984,8 +1012,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault, PredCases.size()); NewSI->setDebugLoc(PTI->getDebugLoc()); - for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); + for (ValueEqualityComparisonCase &V : PredCases) + NewSI->addCase(V.Value, V.Dest); if (PredHasWeights || SuccHasWeights) { // Halve the weights if any of them cannot fit in an uint32_t @@ -1059,15 +1087,15 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, BasicBlock::iterator BB1_Itr = BB1->begin(); BasicBlock::iterator BB2_Itr = BB2->begin(); - Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++; + Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++; // Skip debug info if it is not identical. DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1); DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2); if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { while (isa<DbgInfoIntrinsic>(I1)) - I1 = BB1_Itr++; + I1 = &*BB1_Itr++; while (isa<DbgInfoIntrinsic>(I2)) - I2 = BB2_Itr++; + I2 = &*BB2_Itr++; } if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) || (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))) @@ -1088,31 +1116,30 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, // For a normal instruction, we just move one to right before the branch, // then replace all uses of the other with the first. Finally, we remove // the now redundant second instruction. - BIParent->getInstList().splice(BI, BB1->getInstList(), I1); + BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1); if (!I2->use_empty()) I2->replaceAllUsesWith(I1); I1->intersectOptionalDataWith(I2); unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull - }; + LLVMContext::MD_tbaa, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_nonnull, LLVMContext::MD_invariant_group, + LLVMContext::MD_align, LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null}; combineMetadata(I1, I2, KnownIDs); I2->eraseFromParent(); Changed = true; - I1 = BB1_Itr++; - I2 = BB2_Itr++; + I1 = &*BB1_Itr++; + I2 = &*BB2_Itr++; // Skip debug info if it is not identical. DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1); DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2); if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { while (isa<DbgInfoIntrinsic>(I1)) - I1 = BB1_Itr++; + I1 = &*BB1_Itr++; while (isa<DbgInfoIntrinsic>(I2)) - I2 = BB2_Itr++; + I2 = &*BB2_Itr++; } } while (I1->isIdenticalToWhenDefined(I2)); @@ -1147,7 +1174,7 @@ HoistTerminator: // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); - BIParent->getInstList().insert(BI, NT); + BIParent->getInstList().insert(BI->getIterator(), NT); if (!NT->getType()->isVoidTy()) { I1->replaceAllUsesWith(NT); I2->replaceAllUsesWith(NT); @@ -1265,7 +1292,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // Cannot move control-flow-involving, volatile loads, vaarg, etc. 
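Stripped of the iterator churn, HoistThenElseCodeToIf moves the identical leading instructions of the two successor blocks up in front of the branch and deletes the redundant twin. The gist, with instructions modeled as comparable values:

#include <vector>

// Returns the instructions hoisted into the predecessor, removing them
// from both successors; a sketch of the loop above, not the LLVM API.
std::vector<int> hoistCommonPrefix(std::vector<int> &BB1,
                                   std::vector<int> &BB2) {
  std::vector<int> Hoisted;
  while (!BB1.empty() && !BB2.empty() && BB1.front() == BB2.front()) {
    Hoisted.push_back(BB1.front()); // lands in the predecessor, before BI
    BB1.erase(BB1.begin());
    BB2.erase(BB2.begin());         // the redundant twin is dropped
  }
  return Hoisted;
}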
if (isa<PHINode>(I1) || isa<PHINode>(I2) || isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) || - isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) || + I1->isEHPad() || I2->isEHPad() || isa<AllocaInst>(I1) || isa<AllocaInst>(I2) || I1->mayHaveSideEffects() || I2->mayHaveSideEffects() || I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() || @@ -1324,7 +1351,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { if (!NewPN) { NewPN = PHINode::Create(DifferentOp1->getType(), 2, - DifferentOp1->getName() + ".sink", BBEnd->begin()); + DifferentOp1->getName() + ".sink", &BBEnd->front()); NewPN->addIncoming(DifferentOp1, BB1); NewPN->addIncoming(DifferentOp2, BB2); DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";); @@ -1339,7 +1366,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // instruction in the basic block down. bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin()); // Sink the instruction. - BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1); + BBEnd->getInstList().splice(FirstNonPhiInBBEnd->getIterator(), + BB1->getInstList(), I1); if (!OldPN->use_empty()) OldPN->replaceAllUsesWith(I1); OldPN->eraseFromParent(); @@ -1355,7 +1383,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { RE1 = BB1->getInstList().rend(); if (UpdateRE2) RE2 = BB2->getInstList().rend(); - FirstNonPhiInBBEnd = I1; + FirstNonPhiInBBEnd = &*I1; NumSinkCommons++; Changed = true; } @@ -1491,7 +1519,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, for (BasicBlock::iterator BBI = ThenBB->begin(), BBE = std::prev(ThenBB->end()); BBI != BBE; ++BBI) { - Instruction *I = BBI; + Instruction *I = &*BBI; // Skip debug info. if (isa<DbgInfoIntrinsic>(I)) continue; @@ -1604,9 +1632,14 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, SpeculatedStore->setOperand(0, S); } + // Metadata can be dependent on the condition we are hoisting above. + // Conservatively strip all metadata on the instruction. + for (auto &I: *ThenBB) + I.dropUnknownNonDebugMetadata(); + // Hoist the instructions. - BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), - std::prev(ThenBB->end())); + BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(), + ThenBB->begin(), std::prev(ThenBB->end())); // Insert selects and rewrite the PHI operands. IRBuilder<true, NoFolder> Builder(BI); @@ -1747,13 +1780,13 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) { // Check for trivial simplification. if (Value *V = SimplifyInstruction(N, DL)) { - TranslateMap[BBI] = V; + TranslateMap[&*BBI] = V; delete N; // Instruction folded away, don't need actual inst } else { // Insert the new instruction into its new home. EdgeBB->getInstList().insert(InsertPt, N); if (!BBI->use_empty()) - TranslateMap[BBI] = N; + TranslateMap[&*BBI] = N; } } @@ -1850,7 +1883,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } else { DomBlock = *pred_begin(IfBlock1); for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I) - if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { + if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. // Because of this, we won't be able to get rid of the control // flow, so the xform is not worth it. 
@@ -1863,7 +1896,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } else { DomBlock = *pred_begin(IfBlock2); for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I) - if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { + if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. // Because of this, we won't be able to get rid of the control // flow, so the xform is not worth it. @@ -1882,13 +1915,13 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // Move all 'aggressive' instructions, which are defined in the // conditional parts of the if's up to the dominating block. if (IfBlock1) - DomBlock->getInstList().splice(InsertPt, + DomBlock->getInstList().splice(InsertPt->getIterator(), IfBlock1->getInstList(), IfBlock1->begin(), - IfBlock1->getTerminator()); + IfBlock1->getTerminator()->getIterator()); if (IfBlock2) - DomBlock->getInstList().splice(InsertPt, + DomBlock->getInstList().splice(InsertPt->getIterator(), IfBlock2->getInstList(), IfBlock2->begin(), - IfBlock2->getTerminator()); + IfBlock2->getTerminator()->getIterator()); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { // Change the PHI node into a select instruction. @@ -2057,7 +2090,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { BI->getSuccessor(0) == PBI->getSuccessor(1))) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Curr = I++; + Instruction *Curr = &*I++; if (isa<CmpInst>(Curr)) { Cond = Curr; break; @@ -2077,7 +2110,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { return false; // Make sure the instruction after the condition is the cond branch. - BasicBlock::iterator CondIt = Cond; ++CondIt; + BasicBlock::iterator CondIt = ++Cond->getIterator(); // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt; @@ -2095,7 +2128,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(I)) continue; - if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I)) + if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I)) return false; // I has only one use and can be executed unconditionally. Instruction *User = dyn_cast<Instruction>(I->user_back()); @@ -2192,17 +2225,17 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { Instruction *NewBonusInst = BonusInst->clone(); RemapInstruction(NewBonusInst, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); - VMap[BonusInst] = NewBonusInst; + VMap[&*BonusInst] = NewBonusInst; // If we moved a load, we cannot any longer claim any knowledge about // its potential value. The previous information might have been valid // only given the branch precondition. // For an analogous reason, we must also drop all the metadata whose // semantics we don't understand. 
- NewBonusInst->dropUnknownMetadata(LLVMContext::MD_dbg); + NewBonusInst->dropUnknownNonDebugMetadata(); - PredBlock->getInstList().insert(PBI, NewBonusInst); - NewBonusInst->takeName(BonusInst); + PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst); + NewBonusInst->takeName(&*BonusInst); BonusInst->setName(BonusInst->getName() + ".old"); } @@ -2211,7 +2244,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { Instruction *New = Cond->clone(); RemapInstruction(New, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); - PredBlock->getInstList().insert(PBI, New); + PredBlock->getInstList().insert(PBI->getIterator(), New); New->takeName(Cond); Cond->setName(New->getName() + ".old"); @@ -2332,11 +2365,297 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { return false; } +// If there is only one store in BB1 and BB2, return it, otherwise return +// nullptr. +static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) { + StoreInst *S = nullptr; + for (auto *BB : {BB1, BB2}) { + if (!BB) + continue; + for (auto &I : *BB) + if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (S) + // Multiple stores seen. + return nullptr; + else + S = SI; + } + } + return S; +} + +static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, + Value *AlternativeV = nullptr) { + // PHI is going to be a PHI node that allows the value V that is defined in + // BB to be referenced in BB's only successor. + // + // If AlternativeV is nullptr, the only value we care about in PHI is V. It + // doesn't matter to us what the other operand is (it'll never get used). We + // could just create a new PHI with an undef incoming value, but that could + // increase register pressure if EarlyCSE/InstCombine can't fold it with some + // other PHI. So here we directly look for some PHI in BB's successor with V + // as an incoming operand. If we find one, we use it, else we create a new + // one. + // + // If AlternativeV is not nullptr, we care about both incoming values in PHI. + // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV] + // where OtherBB is the single other predecessor of BB's only successor. + PHINode *PHI = nullptr; + BasicBlock *Succ = BB->getSingleSuccessor(); + + for (auto I = Succ->begin(); isa<PHINode>(I); ++I) + if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) { + PHI = cast<PHINode>(I); + if (!AlternativeV) + break; + + assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2); + auto PredI = pred_begin(Succ); + BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI; + if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) + break; + PHI = nullptr; + } + if (PHI) + return PHI; + + // If V is not an instruction defined in BB, just return it. + if (!AlternativeV && + (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB)) + return V; + + PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front()); + PHI->addIncoming(V, BB); + for (BasicBlock *PredBB : predecessors(Succ)) + if (PredBB != BB) + PHI->addIncoming(AlternativeV ? 
AlternativeV : UndefValue::get(V->getType()), + PredBB); + return PHI; +} + +static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, + BasicBlock *QTB, BasicBlock *QFB, + BasicBlock *PostBB, Value *Address, + bool InvertPCond, bool InvertQCond) { + auto IsaBitcastOfPointerType = [](const Instruction &I) { + return Operator::getOpcode(&I) == Instruction::BitCast && + I.getType()->isPointerTy(); + }; + + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [&](BasicBlock *BB) { + if (!BB) + return true; + // Heuristic: if the block can be if-converted/phi-folded and the + // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to + // thread this store. + unsigned N = 0; + for (auto &I : *BB) { + // Cheap instructions viable for folding. + if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || + isa<StoreInst>(I)) + ++N; + // Free instructions. + else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + IsaBitcastOfPointerType(I)) + continue; + else + return false; + } + return N <= PHINodeFoldingThreshold; + }; + + if (!MergeCondStoresAggressively && (!IsWorthwhile(PTB) || + !IsWorthwhile(PFB) || + !IsWorthwhile(QTB) || + !IsWorthwhile(QFB))) + return false; + + // For every pointer, there must be exactly two stores, one coming from + // PTB or PFB, and the other from QTB or QFB. We don't support more than one + // store (to any address) in PTB,PFB or QTB,QFB. + // FIXME: We could relax this restriction with a bit more work and performance + // testing. + StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB); + StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB); + if (!PStore || !QStore) + return false; + + // Now check the stores are compatible. + if (!QStore->isUnordered() || !PStore->isUnordered()) + return false; + + // Check that sinking the store won't cause program behavior changes. Sinking + // the store out of the Q blocks won't change any behavior as we're sinking + // from a block to its unconditional successor. But we're moving a store from + // the P blocks down through the middle block (QBI) and past both QFB and QTB. + // So we need to check that there are no aliasing loads or stores in + // QBI, QTB and QFB. We also need to check there are no conflicting memory + // operations between PStore and the end of its parent block. + // + // The ideal way to do this is to query AliasAnalysis, but we don't + // preserve AA currently so that is dangerous. Be super safe and just + // check there are no other memory operations at all. + for (auto &I : *QFB->getSinglePredecessor()) + if (I.mayReadOrWriteMemory()) + return false; + for (auto &I : *QFB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + if (QTB) + for (auto &I : *QTB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end(); + I != E; ++I) + if (&*I != PStore && I->mayReadOrWriteMemory()) + return false; + + // OK, we're going to sink the stores to PostBB. The store has to be + // conditional though, so first create the predicate. 
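ensureValueAvailableInSuccessor, defined above, is a reuse-before-create pattern: scan the successor's existing PHIs for one that already routes the wanted incoming values, and only add a new "simplifycfg.merge" node when none matches, which keeps register pressure down. Modeled with plain structs (a sketch, not the LLVM API; vector reallocation caveats aside):

#include <vector>

struct Phi { int FromBB = 0; int FromOther = 0; }; // the two incoming values

Phi *findOrCreatePhi(std::vector<Phi> &SuccPhis, int V, int AlternativeV) {
  for (Phi &P : SuccPhis)
    if (P.FromBB == V && P.FromOther == AlternativeV)
      return &P;                         // an existing PHI already provides V
  SuccPhis.push_back({V, AlternativeV}); // otherwise create the merge node
  return &SuccPhis.back();
}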
+ Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + + Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), + PStore->getParent()); + Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(), + QStore->getParent(), PPHI); + + IRBuilder<> QB(&*PostBB->getFirstInsertionPt()); + + Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond); + Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); + + if (InvertPCond) + PPred = QB.CreateNot(PPred); + if (InvertQCond) + QPred = QB.CreateNot(QPred); + Value *CombinedPred = QB.CreateOr(PPred, QPred); + + auto *T = + SplitBlockAndInsertIfThen(CombinedPred, &*QB.GetInsertPoint(), false); + QB.SetInsertPoint(T); + StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address)); + AAMDNodes AAMD; + PStore->getAAMetadata(AAMD, /*Merge=*/false); + PStore->getAAMetadata(AAMD, /*Merge=*/true); + SI->setAAMetadata(AAMD); + + QStore->eraseFromParent(); + PStore->eraseFromParent(); + + return true; +} + +static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { + // The intention here is to find diamonds or triangles (see below) where each + // conditional block contains a store to the same address. Both of these + // stores are conditional, so they can't be unconditionally sunk. But it may + // be profitable to speculatively sink the stores into one merged store at the + // end, and predicate the merged store on the union of the two conditions of + // PBI and QBI. + // + // This can reduce the number of stores executed if both of the conditions are + // true, and can allow the blocks to become small enough to be if-converted. + // This optimization will also chain, so that ladders of test-and-set + // sequences can be if-converted away. + // + // We only deal with simple diamonds or triangles: + // + // PBI or PBI or a combination of the two + // / \ | \ + // PTB PFB | PFB + // \ / | / + // QBI QBI + // / \ | \ + // QTB QFB | QFB + // \ / | / + // PostBB PostBB + // + // We model triangles as a type of diamond with a nullptr "true" block. + // Triangles are canonicalized so that the fallthrough edge is represented by + // a true condition, as in the diagram above. + // + BasicBlock *PTB = PBI->getSuccessor(0); + BasicBlock *PFB = PBI->getSuccessor(1); + BasicBlock *QTB = QBI->getSuccessor(0); + BasicBlock *QFB = QBI->getSuccessor(1); + BasicBlock *PostBB = QFB->getSingleSuccessor(); + + bool InvertPCond = false, InvertQCond = false; + // Canonicalize fallthroughs to the true branches. + if (PFB == QBI->getParent()) { + std::swap(PFB, PTB); + InvertPCond = true; + } + if (QFB == PostBB) { + std::swap(QFB, QTB); + InvertQCond = true; + } + + // From this point on we can assume PTB or QTB may be fallthroughs but PFB + // and QFB may not. Model fallthroughs as a nullptr block. + if (PTB == QBI->getParent()) + PTB = nullptr; + if (QTB == PostBB) + QTB = nullptr; + + // Legality bailouts. We must have at least the non-fallthrough blocks and + // the post-dominating block, and the non-fallthroughs must only have one + // predecessor. 
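At the source level the transform looks like the pair below; both forms store the same final value, but the second has a single store predicated on the OR of the two conditions, which is what the worthwhileness heuristic hopes to if-convert. The select stands in for the PHI chain built by ensureValueAvailableInSuccessor:

void before(bool P, bool Q, int *Addr) {
  if (P)
    *Addr = 1;
  // ... code with no other accesses to *Addr ...
  if (Q)
    *Addr = 2;
}

void after(bool P, bool Q, int *Addr) {
  int V = Q ? 2 : 1; // merged value (the PHI chain)
  if (P | Q)         // the combined predicate created with CreateOr
    *Addr = V;
}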
+ auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) { + return BB->getSinglePredecessor() == P && + BB->getSingleSuccessor() == S; + }; + if (!PostBB || + !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || + !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB)) + return false; + if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) || + (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB))) + return false; + if (PostBB->getNumUses() != 2 || QBI->getParent()->getNumUses() != 2) + return false; + + // OK, this is a sequence of two diamonds or triangles. + // Check if there are stores in PTB or PFB that are repeated in QTB or QFB. + SmallPtrSet<Value *,4> PStoreAddresses, QStoreAddresses; + for (auto *BB : {PTB, PFB}) { + if (!BB) + continue; + for (auto &I : *BB) + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) + PStoreAddresses.insert(SI->getPointerOperand()); + } + for (auto *BB : {QTB, QFB}) { + if (!BB) + continue; + for (auto &I : *BB) + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) + QStoreAddresses.insert(SI->getPointerOperand()); + } + + set_intersect(PStoreAddresses, QStoreAddresses); + // set_intersect mutates PStoreAddresses in place. Rename it here to make it + // clear what it contains. + auto &CommonAddresses = PStoreAddresses; + + bool Changed = false; + for (auto *Address : CommonAddresses) + Changed |= mergeConditionalStoreToAddress( + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond); + return Changed; +} + /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. -static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { +static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, + const DataLayout &DL) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -2360,10 +2679,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // simplifycfg will thread the block. if (BlockIsSimpleEnoughToThreadThrough(BB)) { pred_iterator PB = pred_begin(BB), PE = pred_end(BB); - PHINode *NewPN = PHINode::Create(Type::getInt1Ty(BB->getContext()), - std::distance(PB, PE), - BI->getCondition()->getName() + ".pr", - BB->begin()); + PHINode *NewPN = PHINode::Create( + Type::getInt1Ty(BB->getContext()), std::distance(PB, PE), + BI->getCondition()->getName() + ".pr", &BB->front()); // Okay, we're going to insert the PHI node. Since PBI is not the only // predecessor, compute the PHI'd conditional value for all of the preds. // Any predecessor where the condition is not computable we keep symbolic. @@ -2386,6 +2704,29 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { } } + if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition())) + if (CE->canTrap()) + return false; + + // If BI is reached from the true path of PBI and PBI's condition implies + // BI's condition, we know the direction of the BI branch. + if (PBI->getSuccessor(0) == BI->getParent() && + isImpliedCondition(PBI->getCondition(), BI->getCondition(), DL) && + PBI->getSuccessor(0) != PBI->getSuccessor(1) && + BB->getSinglePredecessor()) { + // Turn this into a branch on constant. 
+ auto *OldCond = BI->getCondition(); + BI->setCondition(ConstantInt::getTrue(BB->getContext())); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + return true; // Nuke the branch on constant. + } + + // If both branches are conditional and both contain stores to the same + // address, remove the stores from the conditionals and create a conditional + // merged store at the end. + if (MergeCondStores && mergeConditionalStores(PBI, BI)) + return true; + // If this is a conditional branch in an empty block, and if any // predecessors are a conditional branch to one of our destinations, // fold the conditions into logical ops and one cond br. @@ -2396,11 +2737,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { if (&*BBI != BI) return false; - - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition())) - if (CE->canTrap()) - return false; - int PBIOp, BIOp; if (PBI->getSuccessor(0) == BI->getSuccessor(0)) PBIOp = BIOp = 0; @@ -2565,15 +2901,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; // Then remove the rest. - for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) { - BasicBlock *Succ = OldTerm->getSuccessor(I); + for (BasicBlock *Succ : OldTerm->successors()) { // Make sure only to keep exactly one copy of each edge. if (Succ == KeepEdge1) KeepEdge1 = nullptr; else if (Succ == KeepEdge2) KeepEdge2 = nullptr; else - Succ->removePredecessor(OldTerm->getParent()); + Succ->removePredecessor(OldTerm->getParent(), + /*DontDeleteUselessPHIs=*/true); } IRBuilder<> Builder(OldTerm); @@ -2827,7 +3163,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, Values.erase(std::unique(Values.begin(), Values.end()), Values.end()); // If Extra was used, we require at least two switch values to do the - // transformation. A switch with one value is just an cond branch. + // transformation. A switch with one value is just a conditional branch. if (ExtraCase && Values.size() < 2) return false; // TODO: Preserve branch weight metadata, similarly to how @@ -2847,7 +3183,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, // then we evaluate them with an explicit branch first. Split the block // right before the condbr to handle it. if (ExtraCase) { - BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test"); + BasicBlock *NewBB = + BB->splitBasicBlock(BI->getIterator(), "switch.early.test"); // Remove the uncond branch added to the old block. TerminatorInst *OldTI = BB->getTerminator(); Builder.SetInsertPoint(OldTI); @@ -2911,34 +3248,15 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { return false; // Check that there are no other instructions except for debug intrinsics. - BasicBlock::iterator I = LPInst, E = RI; + BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator(); while (++I != E) if (!isa<DbgInfoIntrinsic>(I)) return false; // Turn all invokes that unwind here into calls and delete the basic block. for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { - InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - // Insert a call instruction before the invoke. 
- CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); - Call->takeName(II); - Call->setCallingConv(II->getCallingConv()); - Call->setAttributes(II->getAttributes()); - Call->setDebugLoc(II->getDebugLoc()); - - // Anything that used the value produced by the invoke instruction now uses - // the value produced by the call instruction. Note that we do this even - // for void functions and calls with no uses so that the callgraph edge is - // updated. - II->replaceAllUsesWith(Call); - BB->removePredecessor(II->getParent()); - - // Insert a branch to the normal destination right before the invoke. - BranchInst::Create(II->getNormalDest(), II); - - // Finally, delete the invoke instruction! - II->eraseFromParent(); + BasicBlock *Pred = *PI++; + removeUnwindEdge(Pred); } // The landingpad is now unreachable. Zap it. @@ -2946,6 +3264,124 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { return true; } +bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { + // If this is a trivial cleanup pad that executes no instructions, it can be + // eliminated. If the cleanup pad continues to the caller, any predecessor + // that is an EH pad will be updated to continue to the caller and any + // predecessor that terminates with an invoke instruction will have its invoke + // instruction converted to a call instruction. If the cleanup pad being + // simplified does not continue to the caller, each predecessor will be + // updated to continue to the unwind destination of the cleanup pad being + // simplified. + BasicBlock *BB = RI->getParent(); + CleanupPadInst *CPInst = RI->getCleanupPad(); + if (CPInst->getParent() != BB) + // This isn't an empty cleanup. + return false; + + // Check that there are no other instructions except for debug intrinsics. + BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator(); + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) + return false; + + // If the cleanup return we are simplifying unwinds to the caller, this will + // set UnwindDest to nullptr. + BasicBlock *UnwindDest = RI->getUnwindDest(); + Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr; + + // We're about to remove BB from the control flow. Before we do, sink any + // PHINodes into the unwind destination. Doing this before changing the + // control flow avoids some potentially slow checks, since we can currently + // be certain that UnwindDest and BB have no common predecessors (since they + // are both EH pads). + if (UnwindDest) { + // First, go through the PHI nodes in UnwindDest and update any nodes that + // reference the block we are removing + for (BasicBlock::iterator I = UnwindDest->begin(), + IE = DestEHPad->getIterator(); + I != IE; ++I) { + PHINode *DestPN = cast<PHINode>(I); + + int Idx = DestPN->getBasicBlockIndex(BB); + // Since BB unwinds to UnwindDest, it has to be in the PHI node. + assert(Idx != -1); + // This PHI node has an incoming value that corresponds to a control + // path through the cleanup pad we are removing. If the incoming + // value is in the cleanup pad, it must be a PHINode (because we + // verified above that the block is otherwise empty). Otherwise, the + // value is either a constant or a value that dominates the cleanup + // pad being removed. 
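+      // For instance (hypothetical IR): if BB contains
+      //   %v = phi i32 [ %a, %pred1 ], [ %b, %pred2 ]
+      // then DestPN's single entry for BB is replaced below by one entry per
+      // predecessor of BB, carrying %a and %b respectively.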
+ // + // Because BB and UnwindDest are both EH pads, all of their + // predecessors must unwind to these blocks, and since no instruction + // can have multiple unwind destinations, there will be no overlap in + // incoming blocks between SrcPN and DestPN. + Value *SrcVal = DestPN->getIncomingValue(Idx); + PHINode *SrcPN = dyn_cast<PHINode>(SrcVal); + + // Remove the entry for the block we are deleting. + DestPN->removeIncomingValue(Idx, false); + + if (SrcPN && SrcPN->getParent() == BB) { + // If the incoming value was a PHI node in the cleanup pad we are + // removing, we need to merge that PHI node's incoming values into + // DestPN. + for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues(); + SrcIdx != SrcE; ++SrcIdx) { + DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx), + SrcPN->getIncomingBlock(SrcIdx)); + } + } else { + // Otherwise, the incoming value came from above BB and + // so we can just reuse it. We must associate all of BB's + // predecessors with this value. + for (auto *pred : predecessors(BB)) { + DestPN->addIncoming(SrcVal, pred); + } + } + } + + // Sink any remaining PHI nodes directly into UnwindDest. + Instruction *InsertPt = DestEHPad; + for (BasicBlock::iterator I = BB->begin(), + IE = BB->getFirstNonPHI()->getIterator(); + I != IE;) { + // The iterator must be incremented here because the instructions are + // being moved to another block. + PHINode *PN = cast<PHINode>(I++); + if (PN->use_empty()) + // If the PHI node has no uses, just leave it. It will be erased + // when we erase BB below. + continue; + + // Otherwise, sink this PHI node into UnwindDest. + // Any predecessors to UnwindDest which are not already represented + // must be back edges which inherit the value from the path through + // BB. In this case, the PHI value must reference itself. + for (auto *pred : predecessors(UnwindDest)) + if (pred != BB) + PN->addIncoming(PN, pred); + PN->moveBefore(InsertPt); + } + } + + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { + // The iterator must be updated here because we are removing this pred. + BasicBlock *PredBB = *PI++; + if (UnwindDest == nullptr) { + removeUnwindEdge(PredBB); + } else { + TerminatorInst *TI = PredBB->getTerminator(); + TI->replaceUsesOfWith(BB, UnwindDest); + } + } + + // The cleanup pad is now unreachable. Zap it. + BB->eraseFromParent(); + return true; +} + bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { BasicBlock *BB = RI->getParent(); if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; @@ -3003,8 +3439,8 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // If there are any instructions immediately before the unreachable that can // be removed, do so. - while (UI != BB->begin()) { - BasicBlock::iterator BBI = UI; + while (UI->getIterator() != BB->begin()) { + BasicBlock::iterator BBI = UI->getIterator(); --BBI; // Do not delete instructions that can have side effects which might cause // the unreachable to not be reachable; specifically, calls and volatile @@ -3075,26 +3511,18 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { --i; --e; Changed = true; } - } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) { - if (II->getUnwindDest() == BB) { - // Convert the invoke to a call instruction. This would be a good - // place to note that the call does not throw though. - BranchInst *BI = Builder.CreateBr(II->getNormalDest()); - II->removeFromParent(); // Take out of symbol table - - // Insert the call now... 
-      SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3);
-      Builder.SetInsertPoint(BI);
-      CallInst *CI = Builder.CreateCall(II->getCalledValue(),
-                                        Args, II->getName());
-      CI->setCallingConv(II->getCallingConv());
-      CI->setAttributes(II->getAttributes());
-      // If the invoke produced a value, the call does now instead.
-      II->replaceAllUsesWith(CI);
-      delete II;
-      Changed = true;
-      }
+    } else if ((isa<InvokeInst>(TI) &&
+                cast<InvokeInst>(TI)->getUnwindDest() == BB) ||
+               isa<CatchSwitchInst>(TI)) {
+      removeUnwindEdge(TI->getParent());
+      Changed = true;
+    } else if (isa<CleanupReturnInst>(TI)) {
+      new UnreachableInst(TI->getContext(), TI);
+      TI->eraseFromParent();
+      Changed = true;
     }
+    // TODO: We can remove a catchswitch if all its catchpads end in
+    // unreachable.
   }
 
   // If this block is now dead, remove it.
@@ -3249,6 +3677,29 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
     }
   }
 
+  // If we can prove that the cases must cover all possible values, the
+  // default destination becomes dead and we can remove it. If we know some
+  // of the bits in the value, we can use that to more precisely compute the
+  // number of possible unique case values.
+  bool HasDefault =
+      !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+  const unsigned NumUnknownBits = Bits -
+      (KnownZero.Or(KnownOne)).countPopulation();
+  assert(NumUnknownBits <= Bits);
+  if (HasDefault && DeadCases.empty() &&
+      NumUnknownBits < 64 /* avoid overflow */ &&
+      SI->getNumCases() == (1ULL << NumUnknownBits)) {
+    DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+    BasicBlock *NewDefault = SplitBlockPredecessors(SI->getDefaultDest(),
+                                                    SI->getParent(), "");
+    SI->setDefaultDest(&*NewDefault);
+    SplitBlock(&*NewDefault, &NewDefault->front());
+    auto *OldTI = NewDefault->getTerminator();
+    new UnreachableInst(SI->getContext(), OldTI);
+    EraseTerminatorInstAndDCECond(OldTI);
+    return true;
+  }
+
   SmallVector<uint64_t, 8> Weights;
   bool HasWeight = HasBranchWeights(SI);
   if (HasWeight) {
@@ -3439,7 +3890,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
     } else if (isa<DbgInfoIntrinsic>(I)) {
       // Skip debug intrinsic.
       continue;
-    } else if (Constant *C = ConstantFold(I, DL, ConstantPool)) {
+    } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) {
       // Instruction is side-effect free and constant.
 
       // If the instruction has uses outside this block or a phi node slot for
@@ -3456,7 +3907,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
         return false;
       }
 
-      ConstantPool.insert(std::make_pair(I, C));
+      ConstantPool.insert(std::make_pair(&*I, C));
     } else {
       break;
     }
@@ -3664,7 +4115,7 @@ namespace {
   /// Return true if a table with TableSize elements of
   /// type ElementType would fit in a target-legal register.
   static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
-                                 const Type *ElementType);
+                                 Type *ElementType);
 
 private:
   // Depending on the contents of the table, it can be represented in
@@ -3880,8 +4331,8 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
 
 bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
                                            uint64_t TableSize,
-                                           const Type *ElementType) {
-  const IntegerType *IT = dyn_cast<IntegerType>(ElementType);
+                                           Type *ElementType) {
+  auto *IT = dyn_cast<IntegerType>(ElementType);
   if (!IT)
     return false;
   // FIXME: If the type is wider than it needs to be, e.g.
i8 but all values @@ -3992,7 +4443,7 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, assert((CaseConst == TrueConst || CaseConst == FalseConst) && "Expect true or false as compare result."); } - + // Check if the branch instruction dominates the phi node. It's a simple // dominance check, but sufficient for our needs. // Although this check is invariant in the calling loops, it's better to do it @@ -4422,7 +4873,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return true; // If the Terminator is the only non-phi instruction, simplify the block. - BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(); + BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; @@ -4457,6 +4908,16 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return false; } +static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { + BasicBlock *PredPred = nullptr; + for (auto *P : predecessors(BB)) { + BasicBlock *PPred = P->getSinglePredecessor(); + if (!PPred || (PredPred && PredPred != PPred)) + return nullptr; + PredPred = PPred; + } + return PredPred; +} bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); @@ -4537,9 +4998,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + // Look for diamond patterns. 
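+  // Shape sought here (sketch): every predecessor of BB hangs off one block
+  // PrevBB, so PrevBB's conditional branch plays the role of PBI, and BI that
+  // of QBI, in mergeConditionalStores:
+  //
+  //   PrevBB -> {P1, P2, ...} -> BB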
+ if (MergeCondStores) + if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) + if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) + if (PBI != BI && PBI->isConditional()) + if (mergeConditionalStores(PBI, BI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + return false; } @@ -4663,6 +5132,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { if (SimplifyReturn(RI, Builder)) return true; } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { if (SimplifyResume(RI, Builder)) return true; + } else if (CleanupReturnInst *RI = + dyn_cast<CleanupReturnInst>(BB->getTerminator())) { + if (SimplifyCleanupReturn(RI)) return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { if (SimplifySwitch(SI, Builder)) return true; } else if (UnreachableInst *UI = diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index ab30aa17c76b..ddd8775a8431 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -47,15 +47,16 @@ namespace { Loop *L; LoopInfo *LI; ScalarEvolution *SE; + DominatorTree *DT; SmallVectorImpl<WeakVH> &DeadInsts; bool Changed; public: - SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LoopInfo *LI, - SmallVectorImpl<WeakVH> &Dead) - : L(Loop), LI(LI), SE(SE), DeadInsts(Dead), Changed(false) { + SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI,SmallVectorImpl<WeakVH> &Dead) + : L(Loop), LI(LI), SE(SE), DT(DT), DeadInsts(Dead), Changed(false) { assert(LI && "IV simplification requires LoopInfo"); } @@ -63,11 +64,13 @@ namespace { /// Iteratively perform simplification on a worklist of users of the /// specified induction variable. This is the top-level driver that applies - /// all simplicitions to users of an IV. + /// all simplifications to users of an IV. void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr); Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand); + bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand); + bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand); void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, @@ -166,19 +169,65 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { S = SE->getSCEVAtScope(S, ICmpLoop); X = SE->getSCEVAtScope(X, ICmpLoop); + ICmpInst::Predicate InvariantPredicate; + const SCEV *InvariantLHS, *InvariantRHS; + // If the condition is always true or always false, replace it with // a constant value. 
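   // Example (hypothetical): in a loop where SCEV knows %iv = {0,+,1} stays
   // below %n, an in-loop 'icmp ult %iv, %n' folds to 'true' and an
   // 'icmp uge %iv, %n' folds to 'false'.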
- if (SE->isKnownPredicate(Pred, S, X)) + if (SE->isKnownPredicate(Pred, S, X)) { ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + DeadInsts.emplace_back(ICmp); + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) { ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else + DeadInsts.emplace_back(ICmp); + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + } else if (isa<PHINode>(IVOperand) && + SE->isLoopInvariantPredicate(Pred, S, X, ICmpLoop, + InvariantPredicate, InvariantLHS, + InvariantRHS)) { + + // Rewrite the comparison to a loop invariant comparison if it can be done + // cheaply, where cheaply means "we don't need to emit any new + // instructions". + + Value *NewLHS = nullptr, *NewRHS = nullptr; + + if (S == InvariantLHS || X == InvariantLHS) + NewLHS = + ICmp->getOperand(S == InvariantLHS ? IVOperIdx : (1 - IVOperIdx)); + + if (S == InvariantRHS || X == InvariantRHS) + NewRHS = + ICmp->getOperand(S == InvariantRHS ? IVOperIdx : (1 - IVOperIdx)); + + for (Value *Incoming : cast<PHINode>(IVOperand)->incoming_values()) { + if (NewLHS && NewRHS) + break; + + const SCEV *IncomingS = SE->getSCEV(Incoming); + + if (!NewLHS && IncomingS == InvariantLHS) + NewLHS = Incoming; + if (!NewRHS && IncomingS == InvariantRHS) + NewRHS = Incoming; + } + + if (!NewLHS || !NewRHS) + // We could not find an existing value to replace either LHS or RHS. + // Generating new instructions has subtler tradeoffs, so avoid doing that + // for now. + return; + + DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n'); + ICmp->setPredicate(InvariantPredicate); + ICmp->setOperand(0, NewLHS); + ICmp->setOperand(1, NewRHS); + } else return; - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); ++NumElimCmp; Changed = true; - DeadInsts.emplace_back(ICmp); } /// SimplifyIVUsers helper for eliminating useless @@ -207,8 +256,7 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, Rem->replaceAllUsesWith(Rem->getOperand(0)); else { // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). - const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + const SCEV *LessOne = SE->getMinusSCEV(S, SE->getOne(S->getType())); if (IsSigned && !SE->isKnownNonNegative(LessOne)) return; @@ -232,9 +280,9 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, DeadInsts.emplace_back(Rem); } -/// Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. -/// IVOperand is guaranteed SCEVable, but UseInst may not be. +/// Eliminate an operation that consumes a simple IV and has no observable +/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable, +/// but UseInst may not be. bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, Instruction *IVOperand) { if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { @@ -249,12 +297,45 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, } } - // Eliminate any operation that SCEV can prove is an identity function. + if (eliminateIdentitySCEV(UseInst, IVOperand)) + return true; + + return false; +} + +/// Eliminate any operation that SCEV can prove is an identity function. 
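+/// For instance (hypothetical IR): '%x = add i32 %iv, 0' has the same SCEV
+/// as '%iv', so uses of %x can be rewritten to use %iv, subject to the
+/// dominance check below.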
+bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, + Instruction *IVOperand) { if (!SE->isSCEVable(UseInst->getType()) || (UseInst->getType() != IVOperand->getType()) || (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) return false; + // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the + // dominator tree, even if X is an operand to Y. For instance, in + // + // %iv = phi i32 {0,+,1} + // br %cond, label %left, label %merge + // + // left: + // %X = add i32 %iv, 0 + // br label %merge + // + // merge: + // %M = phi (%X, %iv) + // + // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and + // %M.replaceAllUsesWith(%X) would be incorrect. + + if (isa<PHINode>(UseInst)) + // If UseInst is not a PHI node then we know that IVOperand dominates + // UseInst directly from the legality of SSA. + if (!DT || !DT->dominates(IVOperand, UseInst)) + return false; + + if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand)) + return false; + DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); UseInst->replaceAllUsesWith(IVOperand); @@ -436,8 +517,8 @@ static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { /// This algorithm does not require IVUsers analysis. Instead, it simplifies /// instructions in-place during analysis. Rather than rewriting induction /// variables bottom-up from their users, it transforms a chain of IVUsers -/// top-down, updating the IR only when it encouters a clear optimization -/// opportunitiy. +/// top-down, updating the IR only when it encounters a clear optimization +/// opportunity. /// /// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. /// @@ -513,22 +594,21 @@ void IVVisitor::anchor() { } /// Simplify instructions that use this induction variable /// by using ScalarEvolution to analyze the IV's recurrence. -bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) -{ - LoopInfo *LI = &LPM->getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LI, Dead); +bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead, + IVVisitor *V) { + SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Dead); SIV.simplifyUsers(CurrIV, V); return SIV.hasChanged(); } /// Simplify users of induction variables within this /// loop. This does not actually change or add IVs. 
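 /// A typical call site, sketched with assumed caller-provided analyses:
 ///   SmallVector<WeakVH, 16> DeadInsts;
 ///   bool Changed = simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
 ///   // ...then delete the trivially dead instructions collected in DeadInsts.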
-bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead) { +bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead) { bool Changed = false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, LPM, Dead); + Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead); } return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp index c499c87b1f0b..d5377f9a4c1f 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -20,12 +20,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -64,7 +64,7 @@ namespace { // Here be subtlety: the iterator must be incremented before the loop // body (not sure why), so a range-for loop won't work here. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = BI++; + Instruction *I = &*BI++; // The first time through the loop ToSimplify is empty and we try to // simplify all instructions. On later iterations ToSimplify is not // empty and we only bother simplifying instructions that are in it. diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 6bbf8287e223..81dea6d1b9ae 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" @@ -30,8 +31,8 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -52,16 +53,8 @@ static cl::opt<bool> //===----------------------------------------------------------------------===// static bool ignoreCallingConv(LibFunc::Func Func) { - switch (Func) { - case LibFunc::abs: - case LibFunc::labs: - case LibFunc::llabs: - case LibFunc::strlen: - return true; - default: - return false; - } - llvm_unreachable("All cases should be covered in the switch."); + return Func == LibFunc::abs || Func == LibFunc::labs || + Func == LibFunc::llabs || Func == LibFunc::strlen; } /// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the @@ -93,16 +86,13 @@ static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { } static bool callHasFloatingPointArgument(const CallInst *CI) { - for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); - it != e; ++it) { - if ((*it)->getType()->isFloatingPointTy()) - return true; - } - return false; + return std::any_of(CI->op_begin(), CI->op_end(), 
[](const Use &OI) {
+    return OI->getType()->isFloatingPointTy();
+  });
 }
 
 /// \brief Check whether the overloaded unary floating point function
-/// corresponing to \a Ty is available.
+/// corresponding to \a Ty is available.
 static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
                             LibFunc::Func DoubleFn, LibFunc::Func FloatFn,
                             LibFunc::Func LongDoubleFn) {
@@ -116,6 +106,23 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
   }
 }
 
+/// \brief Check whether we can use unsafe floating point math for
+/// the function passed as input.
+static bool canUseUnsafeFPMath(Function *F) {
+
+  // FIXME: For finer-grain optimization, we need intrinsics to have the same
+  // fast-math flag decorations that are applied to FP instructions. For now,
+  // we have to rely on the function-level unsafe-fp-math attribute to do this
+  // optimization because there's no other way to express that the call can be
+  // relaxed.
+  if (F->hasFnAttribute("unsafe-fp-math")) {
+    Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+    if (Attr.getValueAsString() == "true")
+      return true;
+  }
+  return false;
+}
+
+/// \brief Returns whether \p F matches the signature expected for the
+/// string/memory copying library function \p Func.
+/// Acceptable functions are st[rp][n]?cpy, memmove, memcpy, and memset.
@@ -467,9 +474,6 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
 Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
 
-  // Verify the "stpcpy" function prototype.
-  FunctionType *FT = Callee->getFunctionType();
-
   if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy))
     return nullptr;
 
@@ -484,7 +488,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
   if (Len == 0)
     return nullptr;
 
-  Type *PT = FT->getParamType(0);
+  Type *PT = Callee->getFunctionType()->getParamType(0);
   Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
   Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst,
                               ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
@@ -497,8 +501,6 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
 Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
 
-  FunctionType *FT = Callee->getFunctionType();
-
   if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy))
     return nullptr;
 
@@ -531,7 +533,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
   if (Len > SrcLen + 1)
     return nullptr;
 
-  Type *PT = FT->getParamType(0);
+  Type *PT = Callee->getFunctionType()->getParamType(0);
   // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
   B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1);
 
@@ -862,6 +864,27 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
     return B.CreateSub(LHSV, RHSV, "chardiff");
   }
 
+  // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
+  if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
+
+    IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
+    unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
+
+    if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment &&
+        getKnownAlignment(RHS, DL, CI) >= PrefAlignment) {
+
+      Type *LHSPtrTy =
+          IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
+      Type *RHSPtrTy =
+          IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
+
+      Value *LHSV = B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"),
"lhsv"); + Value *RHSV = B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv"); + + return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp"); + } + } + // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) StringRef LHSStr, RHSStr; if (getConstantStringInfo(LHS, LHSStr) && @@ -972,7 +995,7 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, // floor((double)floatval) -> (double)floorf(floatval) if (Callee->isIntrinsic()) { - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Intrinsic::ID IID = Callee->getIntrinsicID(); Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); V = B.CreateCall(F, V); @@ -1015,9 +1038,9 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "cos" && TLI->has(LibFunc::cosf)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "cos" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the @@ -1035,13 +1058,37 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { return Ret; } +static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { + // Multiplications calculated using Addition Chains. + // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html + + assert(Exp != 0 && "Incorrect exponent 0 not handled"); + + if (InnerChain[Exp]) + return InnerChain[Exp]; + + static const unsigned AddChain[33][2] = { + {0, 0}, // Unused. + {0, 0}, // Unused (base case = pow1). + {1, 1}, // Unused (pre-computed). 
+ {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, + {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, + {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, + {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, + {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, + }; + + InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), + getPow(InnerChain, AddChain[Exp][1], B)); + return InnerChain[Exp]; +} + Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "pow" && TLI->has(LibFunc::powf)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the @@ -1060,7 +1107,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { if (Op1C->isExactlyValue(2.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, LibFunc::exp2l)) - return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); + return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B, + Callee->getAttributes()); // pow(10.0, x) -> exp10(x) if (Op1C->isExactlyValue(10.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f, @@ -1069,6 +1117,32 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Callee->getAttributes()); } + bool unsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent()); + + // pow(exp(x), y) -> exp(x*y) + // pow(exp2(x), y) -> exp2(x * y) + // We enable these only under fast-math. Besides rounding + // differences the transformation changes overflow and + // underflow behavior quite dramatically. + // Example: x = 1000, y = 0.001. + // pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1). + if (unsafeFPMath) { + if (auto *OpC = dyn_cast<CallInst>(Op1)) { + IRBuilder<>::FastMathFlagGuard Guard(B); + FastMathFlags FMF; + FMF.setUnsafeAlgebra(); + B.SetFastMathFlags(FMF); + + LibFunc::Func Func; + Function *OpCCallee = OpC->getCalledFunction(); + if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) && + TLI->has(Func) && (Func == LibFunc::exp || Func == LibFunc::exp2)) + return EmitUnaryFloatFnCall( + B.CreateFMul(OpC->getArgOperand(0), Op2, "mul"), + OpCCallee->getName(), B, OpCCallee->getAttributes()); + } + } + ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); if (!Op2C) return Ret; @@ -1081,10 +1155,15 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { LibFunc::sqrtl) && hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf, LibFunc::fabsl)) { + + // In -ffast-math, pow(x, 0.5) -> sqrt(x). + if (unsafeFPMath) + return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, + Callee->getAttributes()); + // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). // This is faster than calling pow, and still handles negative zero // and negative infinity correctly. - // TODO: In fast-math mode, this could be just sqrt(x). // TODO: In finite-only mode, this could be just fabs(sqrt(x)). 
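   // Worked edge cases (hypothetical values): sqrt(-0.0) is -0.0, so
   // fabs(sqrt(-0.0)) yields the required +0.0; sqrt(-inf) is NaN, which is
   // why the explicit select on x == -inf is needed to produce
   // pow(-inf, 0.5) == +inf.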
Value *Inf = ConstantFP::getInfinity(CI->getType()); Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); @@ -1102,18 +1181,42 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(Op1, Op1, "pow2"); if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); + + // In -ffast-math, generate repeated fmul instead of generating pow(x, n). + if (unsafeFPMath) { + APFloat V = abs(Op2C->getValueAPF()); + // We limit to a max of 7 fmul(s). Thus max exponent is 32. + // This transformation applies to integer exponents only. + if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan || + !V.isInteger()) + return nullptr; + + // We will memoize intermediate products of the Addition Chain. + Value *InnerChain[33] = {nullptr}; + InnerChain[1] = Op1; + InnerChain[2] = B.CreateFMul(Op1, Op1); + + // We cannot readily convert a non-double type (like float) to a double. + // So we first convert V to something which could be converted to double. + bool ignored; + V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + Value *FMul = getPow(InnerChain, V.convertToDouble(), B); + // For negative exponents simply compute the reciprocal. + if (Op2C->isNegative()) + FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul); + return FMul; + } + return nullptr; } Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Function *Caller = CI->getParent()->getParent(); - Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "exp2" && - TLI->has(LibFunc::exp2f)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the @@ -1162,11 +1265,10 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; - if (Callee->getName() == "fabs" && TLI->has(LibFunc::fabsf)) { + StringRef Name = Callee->getName(); + if (Name == "fabs" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, false); - } FunctionType *FT = Callee->getFunctionType(); // Make sure this has 1 argument of FP type which matches the result type. @@ -1184,6 +1286,105 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { return Ret; } +Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { + // If we can shrink the call to a float function rather than a double + // function, do that first. + Function *Callee = CI->getCalledFunction(); + StringRef Name = Callee->getName(); + if ((Name == "fmin" && hasFloatVersion(Name)) || + (Name == "fmax" && hasFloatVersion(Name))) { + Value *Ret = optimizeBinaryDoubleFP(CI, B); + if (Ret) + return Ret; + } + + // Make sure this has 2 arguments of FP type which match the result type. 
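+  // The rewrite performed below, sketched (valid only under relaxed FP):
+  //   fmin(a, b) --> (a < b) ? a : b   via fcmp olt + select
+  //   fmax(a, b) --> (a > b) ? a : b   via fcmp ogt + select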
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      !FT->getParamType(0)->isFloatingPointTy())
+    return nullptr;
+
+  IRBuilder<>::FastMathFlagGuard Guard(B);
+  FastMathFlags FMF;
+  Function *F = CI->getParent()->getParent();
+  if (canUseUnsafeFPMath(F)) {
+    // Unsafe algebra sets all fast-math-flags to true.
+    FMF.setUnsafeAlgebra();
+  } else {
+    // At a minimum, no-nans-fp-math must be true.
+    Attribute Attr = F->getFnAttribute("no-nans-fp-math");
+    if (Attr.getValueAsString() != "true")
+      return nullptr;
+    // No-signed-zeros is implied by the definitions of fmax/fmin themselves:
+    // "Ideally, fmax would be sensitive to the sign of zero, for example
+    // fmax(-0.0, +0.0) would return +0; however, implementation in software
+    // might be impractical."
+    FMF.setNoSignedZeros();
+    FMF.setNoNaNs();
+  }
+  B.SetFastMathFlags(FMF);
+
+  // We have a relaxed floating-point environment. We can ignore NaN-handling
+  // and transform to a compare and select. We do not have to consider errno or
+  // exceptions, because fmin/fmax do not have those.
+  Value *Op0 = CI->getArgOperand(0);
+  Value *Op1 = CI->getArgOperand(1);
+  Value *Cmp = Callee->getName().startswith("fmin") ?
+    B.CreateFCmpOLT(Op0, Op1) : B.CreateFCmpOGT(Op0, Op1);
+  return B.CreateSelect(Cmp, Op0, Op1);
+}
+
+Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  Value *Ret = nullptr;
+  StringRef Name = Callee->getName();
+  if (UnsafeFPShrink && hasFloatVersion(Name))
+    Ret = optimizeUnaryDoubleFP(CI, B, true);
+  FunctionType *FT = Callee->getFunctionType();
+
+  // Just make sure this has 1 argument of FP type, which matches the
+  // result type.
+  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isFloatingPointTy())
+    return Ret;
+
+  if (!canUseUnsafeFPMath(CI->getParent()->getParent()))
+    return Ret;
+  Value *Op1 = CI->getArgOperand(0);
+  auto *OpC = dyn_cast<CallInst>(Op1);
+  if (!OpC)
+    return Ret;
+
+  // log(pow(x,y)) -> y*log(x)
+  // This is only applicable to log, log2, log10.
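+  // E.g. (hypothetical): log2(pow(x, 8.0)) becomes 8.0 * log2(x). Under
+  // unsafe algebra the change in rounding and in special-case behavior
+  // (e.g. x < 0) is deemed acceptable.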
+ if (Name != "log" && Name != "log2" && Name != "log10") + return Ret; + + IRBuilder<>::FastMathFlagGuard Guard(B); + FastMathFlags FMF; + FMF.setUnsafeAlgebra(); + B.SetFastMathFlags(FMF); + + LibFunc::Func Func; + Function *F = OpC->getCalledFunction(); + if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow)) + return B.CreateFMul(OpC->getArgOperand(1), + EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, + Callee->getAttributes()), "mul"); + + // log(exp2(y)) -> y*log(2) + if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && + TLI->has(Func) && Func == LibFunc::exp2) + return B.CreateFMul( + OpC->getArgOperand(0), + EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), + Callee->getName(), B, Callee->getAttributes()), + "logmul"); + return Ret; +} + Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); @@ -1191,19 +1392,9 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" || Callee->getIntrinsicID() == Intrinsic::sqrt)) Ret = optimizeUnaryDoubleFP(CI, B, true); + if (!canUseUnsafeFPMath(CI->getParent()->getParent())) + return Ret; - // FIXME: For finer-grain optimization, we need intrinsics to have the same - // fast-math flag decorations that are applied to FP instructions. For now, - // we have to rely on the function-level unsafe-fp-math attribute to do this - // optimization because there's no other way to express that the sqrt can be - // reassociated. - Function *F = CI->getParent()->getParent(); - if (F->hasFnAttribute("unsafe-fp-math")) { - // Check for unsafe-fp-math = true. - Attribute Attr = F->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() != "true") - return Ret; - } Value *Op = CI->getArgOperand(0); if (Instruction *I = dyn_cast<Instruction>(Op)) { if (I->getOpcode() == Instruction::FMul && I->hasUnsafeAlgebra()) { @@ -1238,8 +1429,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { // and multiply. // FIXME: We're not checking the sqrt because it doesn't have // fast-math-flags (see earlier comment). - IRBuilder<true, ConstantFolder, - IRBuilderDefaultInserter<true> >::FastMathFlagGuard Guard(B); + IRBuilder<>::FastMathFlagGuard Guard(B); B.SetFastMathFlags(I->getFastMathFlags()); // If we found a repeated factor, hoist it out of the square root and // replace it with the fabs of that factor. @@ -1262,6 +1452,40 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { return Ret; } +Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + Value *Ret = nullptr; + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) + Ret = optimizeUnaryDoubleFP(CI, B, true); + FunctionType *FT = Callee->getFunctionType(); + + // Just make sure this has 1 argument of FP type, which matches the + // result type. 
+ if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + if (!canUseUnsafeFPMath(CI->getParent()->getParent())) + return Ret; + Value *Op1 = CI->getArgOperand(0); + auto *OpC = dyn_cast<CallInst>(Op1); + if (!OpC) + return Ret; + + // tan(atan(x)) -> x + // tanf(atanf(x)) -> x + // tanl(atanl(x)) -> x + LibFunc::Func Func; + Function *F = OpC->getCalledFunction(); + if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + ((Func == LibFunc::atan && Callee->getName() == "tan") || + (Func == LibFunc::atanf && Callee->getName() == "tanf") || + (Func == LibFunc::atanl && Callee->getName() == "tanl"))) + Ret = OpC->getArgOperand(0); + return Ret; +} + static bool isTrigLibCall(CallInst *CI); static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, bool UseFloat, Value *&Sin, Value *&Cos, @@ -1329,9 +1553,9 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, return; Function *Callee = CI->getCalledFunction(); - StringRef FuncName = Callee->getName(); LibFunc::Func Func; - if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) || !isTrigLibCall(CI)) + if (!Callee || !TLI->getLibFunc(Callee->getName(), Func) || !TLI->has(Func) || + !isTrigLibCall(CI)) return; if (IsFloat) { @@ -1353,10 +1577,8 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls, Value *Res) { - for (SmallVectorImpl<CallInst *>::iterator I = Calls.begin(), E = Calls.end(); - I != E; ++I) { - replaceAllUsesWith(*I, Res); - } + for (CallInst *C : Calls) + replaceAllUsesWith(C, Res); } void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, @@ -1387,8 +1609,7 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { // If the argument is an instruction, it must dominate all uses so put our // sincos call there. - BasicBlock::iterator Loc = ArgInst; - B.SetInsertPoint(ArgInst->getParent(), ++Loc); + B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); } else { // Otherwise (e.g. for a constant) the beginning of the function is as // good a place as any. @@ -1413,15 +1634,16 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, // Integer Library Call Optimizations //===----------------------------------------------------------------------===// +static bool checkIntUnaryReturnAndParam(Function *Callee) { + FunctionType *FT = Callee->getFunctionType(); + return FT->getNumParams() == 1 && FT->getReturnType()->isIntegerTy(32) && + FT->getParamType(0)->isIntegerTy(); +} + Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy(32) || - !FT->getParamType(0)->isIntegerTy()) + if (!checkIntUnaryReturnAndParam(Callee)) return nullptr; - Value *Op = CI->getArgOperand(0); // Constant fold. 
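   // The lowering below, sketched (hypothetical value): ffs(x) becomes
   //   x == 0 ? 0 : cttz(x) + 1
   // e.g. ffs(8) = cttz(8) + 1 = 3 + 1 = 4.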
@@ -1436,7 +1658,7 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Type *ArgType = Op->getType(); Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); - Value *V = B.CreateCall(F, {Op, B.getFalse()}, "cttz"); + Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz"); V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); V = B.CreateIntCast(V, B.getInt32Ty(), false); @@ -1461,11 +1683,7 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isdigit(c) -> (c-'0') <u 10 @@ -1476,11 +1694,7 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isascii(c) -> c <u 128 @@ -1490,11 +1704,7 @@ Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require i32(i32) - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // toascii(c) -> c & 0x7f @@ -1529,10 +1739,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, } static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) { - if (!ColdErrorCalls) - return false; - - if (!Callee || !Callee->isDeclaration()) + if (!ColdErrorCalls || !Callee || !Callee->isDeclaration()) return false; if (StreamArg < 0) @@ -1968,16 +2175,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { // Command-line parameter overrides function attribute. if (EnableUnsafeFPShrink.getNumOccurrences() > 0) UnsafeFPShrink = EnableUnsafeFPShrink; - else if (Callee->hasFnAttribute("unsafe-fp-math")) { - // FIXME: This is the same problem as described in optimizeSqrt(). - // If calls gain access to IR-level FMF, then use that instead of a - // function attribute. - - // Check for unsafe-fp-math = true. - Attribute Attr = Callee->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() == "true") - UnsafeFPShrink = true; - } + else if (canUseUnsafeFPMath(Callee)) + UnsafeFPShrink = true; // First, check for intrinsics. 
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { @@ -1990,6 +2189,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeExp2(CI, Builder); case Intrinsic::fabs: return optimizeFabs(CI, Builder); + case Intrinsic::log: + return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); default: @@ -2001,13 +2202,17 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) { // Try to further simplify the result. CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI); - if (SimplifiedCI && SimplifiedCI->getCalledFunction()) - if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) { + if (SimplifiedCI && SimplifiedCI->getCalledFunction()) { + // Use an IR Builder from SimplifiedCI if available instead of CI + // to guarantee we reach all uses we might replace later on. + IRBuilder<> TmpBuilder(SimplifiedCI); + if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. SimplifiedCI->replaceAllUsesWith(V); SimplifiedCI->eraseFromParent(); return V; } + } return SimplifiedFortifiedCI; } @@ -2068,8 +2273,18 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeFWrite(CI, Builder); case LibFunc::fputs: return optimizeFPuts(CI, Builder); + case LibFunc::log: + case LibFunc::log10: + case LibFunc::log1p: + case LibFunc::log2: + case LibFunc::logb: + return optimizeLog(CI, Builder); case LibFunc::puts: return optimizePuts(CI, Builder); + case LibFunc::tan: + case LibFunc::tanf: + case LibFunc::tanl: + return optimizeTan(CI, Builder); case LibFunc::perror: return optimizeErrorReporting(CI, Builder); case LibFunc::vfprintf: @@ -2097,24 +2312,23 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { case LibFunc::exp: case LibFunc::exp10: case LibFunc::expm1: - case LibFunc::log: - case LibFunc::log10: - case LibFunc::log1p: - case LibFunc::log2: - case LibFunc::logb: case LibFunc::sin: case LibFunc::sinh: - case LibFunc::tan: case LibFunc::tanh: if (UnsafeFPShrink && hasFloatVersion(FuncName)) return optimizeUnaryDoubleFP(CI, Builder, true); return nullptr; case LibFunc::copysign: - case LibFunc::fmin: - case LibFunc::fmax: if (hasFloatVersion(FuncName)) return optimizeBinaryDoubleFP(CI, Builder); return nullptr; + case LibFunc::fminf: + case LibFunc::fmin: + case LibFunc::fminl: + case LibFunc::fmaxf: + case LibFunc::fmax: + case LibFunc::fmaxl: + return optimizeFMinFMax(CI, Builder); default: return nullptr; } @@ -2133,37 +2347,27 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { Replacer(I, With); } -/*static*/ void LibCallSimplifier::replaceAllUsesWithDefault(Instruction *I, - Value *With) { - I->replaceAllUsesWith(With); - I->eraseFromParent(); -} - // TODO: // Additional cases that we need to add to this file: // // cbrt: // * cbrt(expN(X)) -> expN(x/3) // * cbrt(sqrt(x)) -> pow(x,1/6) -// * cbrt(sqrt(x)) -> pow(x,1/9) +// * cbrt(cbrt(x)) -> pow(x,1/9) // // exp, expf, expl: // * exp(log(x)) -> x // // log, logf, logl: // * log(exp(x)) -> x -// * log(x**y) -> y*log(x) // * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) // * log(exp10(y)) -> y*log(10) // * log(sqrt(x)) -> 0.5*log(x) -// * log(pow(x,y)) -> y*log(x) // // lround, lroundf, lroundl: // * lround(cnst) -> cnst' // // pow, powf, powl: -// * pow(exp(x),y) -> exp(x*y) // * pow(sqrt(x),y) -> pow(x,y*0.5) // * pow(pow(x,y),z)-> pow(x,y*z) // @@ -2179,9 
+2383,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // -// tan, tanf, tanl: -// * tan(atan(x)) -> x -// // trunc, truncf, truncl: // * trunc(cnst) -> cnst' // diff --git a/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp new file mode 100644 index 000000000000..ad6b782caf8b --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp @@ -0,0 +1,85 @@ +//===- SplitModule.cpp - Split a module into partitions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the function llvm::SplitModule, which splits a module +// into multiple linkable partitions. It can be used to implement parallel code +// generation for link-time optimization. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SplitModule.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +static void externalize(GlobalValue *GV) { + if (GV->hasLocalLinkage()) { + GV->setLinkage(GlobalValue::ExternalLinkage); + GV->setVisibility(GlobalValue::HiddenVisibility); + } + + // Unnamed entities must be named consistently between modules. setName will + // give a distinct name to each such entity. + if (!GV->hasName()) + GV->setName("__llvmsplit_unnamed"); +} + +// Returns whether GV should be in partition (0-based) I of N. +static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) { + if (auto GA = dyn_cast<GlobalAlias>(GV)) + if (const GlobalObject *Base = GA->getBaseObject()) + GV = Base; + + StringRef Name; + if (const Comdat *C = GV->getComdat()) + Name = C->getName(); + else + Name = GV->getName(); + + // Partition by MD5 hash. We only need a few bits for evenness as the number + // of partitions will generally be in the 1-2 figure range; the low 16 bits + // are enough. + MD5 H; + MD5::MD5Result R; + H.update(Name); + H.final(R); + return (R[0] | (R[1] << 8)) % N == I; +} + +void llvm::SplitModule( + std::unique_ptr<Module> M, unsigned N, + std::function<void(std::unique_ptr<Module> MPart)> ModuleCallback) { + for (Function &F : *M) + externalize(&F); + for (GlobalVariable &GV : M->globals()) + externalize(&GV); + for (GlobalAlias &GA : M->aliases()) + externalize(&GA); + + // FIXME: We should be able to reuse M as the last partition instead of + // cloning it. 
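+  // Usage sketch (hypothetical caller; module creation elided):
+  //   SplitModule(std::move(M), 4, [&](std::unique_ptr<Module> MPart) {
+  //     // e.g. hand each MPart to its own codegen thread or output file.
+  //   });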
+  for (unsigned I = 0; I != N; ++I) {
+    ValueToValueMapTy VMap;
+    std::unique_ptr<Module> MPart(
+        CloneModule(M.get(), VMap, [=](const GlobalValue *GV) {
+          return isInPartition(GV, I, N);
+        }));
+    if (I != 0)
+      MPart->setModuleInlineAsm("");
+    ModuleCallback(std::move(MPart));
+  }
+}
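
A minimal sketch of how a caller might drive the new llvm::SplitModule entry point added above, assuming bitcode output to sequentially numbered files. The helper name, file naming, and error handling are illustrative assumptions, not part of this change; the SplitModule signature is the one declared in the new file.

#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include <memory>
#include <string>

using namespace llvm;

// Hypothetical driver: split M into N partitions and write "part<K>.bc".
static void writePartitions(std::unique_ptr<Module> M, unsigned N) {
  unsigned PartIdx = 0;
  SplitModule(std::move(M), N, [&](std::unique_ptr<Module> MPart) {
    std::error_code EC;
    raw_fd_ostream OS("part" + std::to_string(PartIdx++) + ".bc", EC,
                      sys::fs::F_None);
    // Each callback invocation receives one linkable partition; globals that
    // isInPartition assigned elsewhere were dropped by CloneModule's filter.
    WriteBitcodeToFile(MPart.get(), OS);
  });
}

Because the partitions are externalized consistently (see externalize above), each emitted file can be codegenerated independently and the objects linked back together.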
- if (isa<GlobalValue>(V)) + if (isa<GlobalValue>(V)) { + if (Flags & RF_NullMapMissingGlobalValues) { + assert(!(Flags & RF_IgnoreMissingEntries) && + "Illegal to specify both RF_NullMapMissingGlobalValues and " + "RF_IgnoreMissingEntries"); + return nullptr; + } return VM[V] = const_cast<Value*>(V); - + } + if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) { // Inline asm may need *type* remapping. FunctionType *NewTy = IA->getFunctionType(); @@ -73,7 +89,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // correct. For now, just match behaviour from before the metadata/value // split. // - // assert(MappedMD && "Referenced metadata value not in value map"); + // assert((MappedMD || (Flags & RF_NullMapMissingGlobalValues)) && + // "Referenced metadata value not in value map"); return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD); } @@ -127,9 +144,13 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM, Flags, TypeMapper, Materializer)); } - + Type *NewSrcTy = nullptr; + if (TypeMapper) + if (auto *GEPO = dyn_cast<GEPOperator>(C)) + NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType()); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - return VM[V] = CE->getWithOperands(Ops, NewTy); + return VM[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy); if (isa<ConstantArray>(C)) return VM[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops); if (isa<ConstantStruct>(C)) @@ -146,29 +167,42 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, } static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key, - Metadata *Val) { + Metadata *Val, ValueMaterializer *Materializer, + RemapFlags Flags) { VM.MD()[Key].reset(Val); + if (Materializer && !(Flags & RF_HaveUnmaterializedMetadata)) { + auto *N = dyn_cast_or_null<MDNode>(Val); + // Need to invoke this once we have non-temporary MD. + if (!N || !N->isTemporary()) + Materializer->replaceTemporaryMetadata(Key, Val); + } return Val; } -static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD) { - return mapToMetadata(VM, MD, const_cast<Metadata *>(MD)); +static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD, + ValueMaterializer *Materializer, RemapFlags Flags) { + return mapToMetadata(VM, MD, const_cast<Metadata *>(MD), Materializer, Flags); } static Metadata *MapMetadataImpl(const Metadata *MD, - SmallVectorImpl<MDNode *> &Cycles, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer); -static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, +static Metadata *mapMetadataOp(Metadata *Op, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { if (!Op) return nullptr; - if (Metadata *MappedOp = - MapMetadataImpl(Op, Cycles, VM, Flags, TypeMapper, Materializer)) + + if (Materializer && !Materializer->isMetadataNeeded(Op)) + return nullptr; + + if (Metadata *MappedOp = MapMetadataImpl(Op, DistinctWorklist, VM, Flags, + TypeMapper, Materializer)) return MappedOp; // Use identity map if MappedOp is null and we can ignore missing entries. if (Flags & RF_IgnoreMissingEntries) @@ -178,89 +212,113 @@ static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, // correct. 
For now, just match behaviour from before the metadata/value // split. // - // llvm_unreachable("Referenced metadata not in value map!"); + // assert((Flags & RF_NullMapMissingGlobalValues) && + // "Referenced metadata not in value map!"); return nullptr; } -/// \brief Remap nodes. +/// Resolve uniquing cycles involving the given metadata. +static void resolveCycles(Metadata *MD, bool MDMaterialized) { + if (auto *N = dyn_cast_or_null<MDNode>(MD)) { + if (!MDMaterialized && N->isTemporary()) + return; + if (!N->isResolved()) + N->resolveCycles(MDMaterialized); + } +} + +/// Remap the operands of an MDNode. /// -/// Insert \c NewNode in the value map, and then remap \c OldNode's operands. -/// Assumes that \c NewNode is already a clone of \c OldNode. +/// If \c Node is temporary, uniquing cycles are ignored. If \c Node is +/// distinct, uniquing cycles are resolved as they're found. /// -/// \pre \c NewNode is a clone of \c OldNode. -static bool remap(const MDNode *OldNode, MDNode *NewNode, - SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - assert(OldNode->getNumOperands() == NewNode->getNumOperands() && - "Expected nodes to match"); - assert(OldNode->isResolved() && "Expected resolved node"); - assert(!NewNode->isUniqued() && "Expected non-uniqued node"); - - // Map the node upfront so it's available for cyclic references. - mapToMetadata(VM, OldNode, NewNode); - bool AnyChanged = false; - for (unsigned I = 0, E = OldNode->getNumOperands(); I != E; ++I) { - Metadata *Old = OldNode->getOperand(I); - assert(NewNode->getOperand(I) == Old && - "Expected old operands to already be in place"); +/// \pre \c Node.isDistinct() or \c Node.isTemporary(). +static bool remapOperands(MDNode &Node, + SmallVectorImpl<MDNode *> &DistinctWorklist, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + assert(!Node.isUniqued() && "Expected temporary or distinct node"); + const bool IsDistinct = Node.isDistinct(); - Metadata *New = mapMetadataOp(OldNode->getOperand(I), Cycles, VM, Flags, - TypeMapper, Materializer); + bool AnyChanged = false; + for (unsigned I = 0, E = Node.getNumOperands(); I != E; ++I) { + Metadata *Old = Node.getOperand(I); + Metadata *New = mapMetadataOp(Old, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); if (Old != New) { AnyChanged = true; - NewNode->replaceOperandWith(I, New); + Node.replaceOperandWith(I, New); + + // Resolve uniquing cycles underneath distinct nodes on the fly so they + // don't infect later operands. + if (IsDistinct) + resolveCycles(New, !(Flags & RF_HaveUnmaterializedMetadata)); } } return AnyChanged; } -/// \brief Map a distinct MDNode. +/// Map a distinct MDNode. /// -/// Distinct nodes are not uniqued, so they must always recreated. +/// Whether distinct nodes change is independent of their operands. If \a +/// RF_MoveDistinctMDs, then they are reused, and their operands remapped in +/// place; effectively, they're moved from one graph to another. Otherwise, +/// they're cloned/duplicated, and the new copy's operands are remapped. 
 static Metadata *mapDistinctNode(const MDNode *Node,
-                                 SmallVectorImpl<MDNode *> &Cycles,
+                                 SmallVectorImpl<MDNode *> &DistinctWorklist,
                                  ValueToValueMapTy &VM, RemapFlags Flags,
                                  ValueMapTypeRemapper *TypeMapper,
                                  ValueMaterializer *Materializer) {
   assert(Node->isDistinct() && "Expected distinct node");
 
-  MDNode *NewMD = MDNode::replaceWithDistinct(Node->clone());
-  remap(Node, NewMD, Cycles, VM, Flags, TypeMapper, Materializer);
+  MDNode *NewMD;
+  if (Flags & RF_MoveDistinctMDs)
+    NewMD = const_cast<MDNode *>(Node);
+  else
+    NewMD = MDNode::replaceWithDistinct(Node->clone());
 
-  // Track any cycles beneath this node.
-  for (Metadata *Op : NewMD->operands())
-    if (auto *Node = dyn_cast_or_null<MDNode>(Op))
-      if (!Node->isResolved())
-        Cycles.push_back(Node);
-
-  return NewMD;
+  // Remap operands later.
+  DistinctWorklist.push_back(NewMD);
+  return mapToMetadata(VM, Node, NewMD, Materializer, Flags);
 }
 
 /// \brief Map a uniqued MDNode.
 ///
 /// Uniqued nodes may not need to be recreated (they may map to themselves).
 static Metadata *mapUniquedNode(const MDNode *Node,
-                                SmallVectorImpl<MDNode *> &Cycles,
+                                SmallVectorImpl<MDNode *> &DistinctWorklist,
                                 ValueToValueMapTy &VM, RemapFlags Flags,
                                 ValueMapTypeRemapper *TypeMapper,
                                 ValueMaterializer *Materializer) {
-  assert(Node->isUniqued() && "Expected uniqued node");
+  assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isUniqued()) &&
+         "Expected uniqued node");
 
-  // Create a temporary node upfront in case we have a metadata cycle.
+  // Create a temporary node and map it upfront in case we have a uniquing
+  // cycle.  If necessary, this mapping will get updated by RAUW logic before
+  // returning.
   auto ClonedMD = Node->clone();
-  if (!remap(Node, ClonedMD.get(), Cycles, VM, Flags, TypeMapper, Materializer))
-    // No operands changed, so use the identity mapping.
-    return mapToSelf(VM, Node);
+  mapToMetadata(VM, Node, ClonedMD.get(), Materializer, Flags);
+  if (!remapOperands(*ClonedMD, DistinctWorklist, VM, Flags, TypeMapper,
+                     Materializer)) {
+    // No operands changed, so use the original.
+    ClonedMD->replaceAllUsesWith(const_cast<MDNode *>(Node));
+    // Even though replaceAllUsesWith would have replaced the value map
+    // entry, we need to explicitly map with the final non-temporary node
+    // to replace any temporary metadata via the callback.
+    return mapToSelf(VM, Node, Materializer, Flags);
+  }
 
-  // At least one operand has changed, so uniquify the cloned node.
+  // Uniquify the cloned node.  Explicitly map it with the final non-temporary
+  // node so that replacement of temporary metadata via the callback occurs.
   return mapToMetadata(VM, Node,
-                       MDNode::replaceWithUniqued(std::move(ClonedMD)));
+                       MDNode::replaceWithUniqued(std::move(ClonedMD)),
+                       Materializer, Flags);
 }
 
 static Metadata *MapMetadataImpl(const Metadata *MD,
-                                 SmallVectorImpl<MDNode *> &Cycles,
+                                 SmallVectorImpl<MDNode *> &DistinctWorklist,
                                  ValueToValueMapTy &VM, RemapFlags Flags,
                                  ValueMapTypeRemapper *TypeMapper,
                                  ValueMaterializer *Materializer) {
@@ -269,26 +327,28 @@ static Metadata *MapMetadataImpl(const Metadata *MD,
     return NewMD;
 
   if (isa<MDString>(MD))
-    return mapToSelf(VM, MD);
+    return mapToSelf(VM, MD, Materializer, Flags);
 
   if (isa<ConstantAsMetadata>(MD))
     if ((Flags & RF_NoModuleLevelChanges))
-      return mapToSelf(VM, MD);
+      return mapToSelf(VM, MD, Materializer, Flags);
 
   if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) {
     Value *MappedV =
         MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer);
     if (VMD->getValue() == MappedV ||
        (!MappedV && (Flags & RF_IgnoreMissingEntries)))
-      return mapToSelf(VM, MD);
+      return mapToSelf(VM, MD, Materializer, Flags);
 
     // FIXME: This assert crashes during bootstrap, but I think it should be
     // correct.  For now, just match behaviour from before the metadata/value
     // split.
     //
-    // assert(MappedV && "Referenced metadata not in value map!");
+    // assert((MappedV || (Flags & RF_NullMapMissingGlobalValues)) &&
+    //        "Referenced metadata not in value map!");
     if (MappedV)
-      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV));
+      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV), Materializer,
+                           Flags);
     return nullptr;
   }
 
@@ -299,37 +359,54 @@ static Metadata *MapMetadataImpl(const Metadata *MD,
   // If this is a module-level metadata and we know that nothing at the
   // module level is changing, then use an identity mapping.
   if (Flags & RF_NoModuleLevelChanges)
-    return mapToSelf(VM, MD);
+    return mapToSelf(VM, MD, Materializer, Flags);
 
   // Require resolved nodes whenever metadata might be remapped.
-  assert(Node->isResolved() && "Unexpected unresolved node");
+  assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isResolved()) &&
+         "Unexpected unresolved node");
+
+  if (Materializer && Node->isTemporary()) {
+    assert(Flags & RF_HaveUnmaterializedMetadata);
+    Metadata *TempMD =
+        Materializer->mapTemporaryMetadata(const_cast<Metadata *>(MD));
+    // If the above callback returned an existing temporary node, use it
+    // instead of the current temporary node.  This happens when earlier
+    // function importing passes already created and saved a temporary
+    // metadata node for the same value id.
+    if (TempMD) {
+      mapToMetadata(VM, MD, TempMD, Materializer, Flags);
+      return TempMD;
+    }
+  }
 
   if (Node->isDistinct())
-    return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer);
+    return mapDistinctNode(Node, DistinctWorklist, VM, Flags, TypeMapper,
+                           Materializer);
 
-  return mapUniquedNode(Node, Cycles, VM, Flags, TypeMapper, Materializer);
+  return mapUniquedNode(Node, DistinctWorklist, VM, Flags, TypeMapper,
+                        Materializer);
 }
 
 Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM,
                             RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
                             ValueMaterializer *Materializer) {
-  SmallVector<MDNode *, 8> Cycles;
-  Metadata *NewMD =
-      MapMetadataImpl(MD, Cycles, VM, Flags, TypeMapper, Materializer);
-
-  // Resolve cycles underneath MD.
-  if (NewMD && NewMD != MD) {
-    if (auto *N = dyn_cast<MDNode>(NewMD))
-      if (!N->isResolved())
-        N->resolveCycles();
-
-    for (MDNode *N : Cycles)
-      if (!N->isResolved())
-        N->resolveCycles();
-  } else {
-    // Shouldn't get unresolved cycles if nothing was remapped.
-    assert(Cycles.empty() && "Expected no unresolved cycles");
-  }
+  SmallVector<MDNode *, 8> DistinctWorklist;
+  Metadata *NewMD = MapMetadataImpl(MD, DistinctWorklist, VM, Flags, TypeMapper,
+                                    Materializer);
+
+  // When there are no module-level changes, it's possible that the metadata
+  // graph has temporaries.  Skip the logic to resolve cycles, since it's
+  // unnecessary (and invalid) in that case.
+  if (Flags & RF_NoModuleLevelChanges)
+    return NewMD;
+
+  // Resolve cycles involving the entry metadata.
+  resolveCycles(NewMD, !(Flags & RF_HaveUnmaterializedMetadata));
+
+  // Remap the operands of distinct MDNodes.
+  while (!DistinctWorklist.empty())
+    remapOperands(*DistinctWorklist.pop_back_val(), DistinctWorklist, VM, Flags,
+                  TypeMapper, Materializer);
 
   return NewMD;
 }
@@ -374,14 +451,11 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
   // Remap attached metadata.
   SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
   I->getAllMetadata(MDs);
-  for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator
-           MI = MDs.begin(),
-           ME = MDs.end();
-       MI != ME; ++MI) {
-    MDNode *Old = MI->second;
+  for (const auto &MI : MDs) {
+    MDNode *Old = MI.second;
     MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer);
     if (New != Old)
-      I->setMetadata(MI->first, New);
+      I->setMetadata(MI.first, New);
   }
 
   if (!TypeMapper)
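
A minimal usage sketch for these remapping entry points, assuming an instruction cloned within a single module. The helper function is hypothetical; RemapInstruction and the RF_* flags are the APIs shown in the hunks above and declared in ValueMapper.h.

#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical helper: clone I and rewrite its operands (and attached
// metadata) through VMap. RF_IgnoreMissingEntries leaves operands with no
// mapping untouched instead of asserting; RF_NoModuleLevelChanges keeps
// module-level metadata shared with the original, which also skips the
// cycle-resolution logic per the MapMetadata change above.
static Instruction *cloneAndRemap(Instruction *I, ValueToValueMapTy &VMap) {
  Instruction *NewI = I->clone();
  RemapInstruction(NewI, VMap,
                   RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
  return NewI;
}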
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
index 215d6f9a1eb6..8844d574a79d 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -25,8 +25,11 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
@@ -204,9 +207,10 @@ namespace {
     BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
       : BasicBlockPass(ID), Config(C) {
-      AA = &P->getAnalysis<AliasAnalysis>();
+      AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
       DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-      SE = &P->getAnalysis<ScalarEvolution>();
+      SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+      TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       TTI = IgnoreTargetInfo
                 ? nullptr
                 : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
@@ -221,6 +225,7 @@ namespace {
     AliasAnalysis *AA;
     DominatorTree *DT;
     ScalarEvolution *SE;
+    const TargetLibraryInfo *TLI;
     const TargetTransformInfo *TTI;
 
     // FIXME: const correct?
@@ -437,9 +442,10 @@ namespace {
     bool runOnBasicBlock(BasicBlock &BB) override {
       // OptimizeNone check deferred to vectorizeBB().
-      AA = &getAnalysis<AliasAnalysis>();
+      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-      SE = &getAnalysis<ScalarEvolution>();
+      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       TTI = IgnoreTargetInfo
                 ? nullptr
                 : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
@@ -450,13 +456,15 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       BasicBlockPass::getAnalysisUsage(AU);
-      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<AAResultsWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
       AU.addRequired<TargetTransformInfoWrapperPass>();
-      AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+      AU.addPreserved<SCEVAAWrapperPass>();
       AU.setPreservesCFG();
     }
 
@@ -842,7 +850,7 @@ namespace {
 
       // It is important to cleanup here so that future iterations of this
      // function have less work to do.
-      (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo());
+      (void)SimplifyInstructionsInBlock(&BB, TLI);
       return true;
     }
 
@@ -1239,20 +1247,23 @@ namespace {
       if (I == Start) IAfterStart = true;
 
       bool IsSimpleLoadStore;
-      if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
+      if (!isInstVectorizable(&*I, IsSimpleLoadStore))
+        continue;
 
       // Look for an instruction with which to pair instruction *I...
       DenseSet<Value *> Users;
       AliasSetTracker WriteSet(*AA);
-      if (I->mayWriteToMemory()) WriteSet.add(I);
+      if (I->mayWriteToMemory())
+        WriteSet.add(&*I);
 
       bool JAfterStart = IAfterStart;
       BasicBlock::iterator J = std::next(I);
       for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
-        if (J == Start) JAfterStart = true;
+        if (&*J == Start)
+          JAfterStart = true;
 
         // Determine if J uses I, if so, exit the loop.
-        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
+        bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
         if (Config.FastDep) {
           // Note: For this heuristic to be effective, independent operations
           // must tend to be intermixed. This is likely to be true from some
@@ -1269,25 +1280,26 @@ namespace {
         // J does not use I, and comes before the first use of I, so it can be
         // merged with I if the instructions are compatible.
         int CostSavings, FixedOrder;
-        if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
-            CostSavings, FixedOrder)) continue;
+        if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
+                                CostSavings, FixedOrder))
+          continue;
 
         // J is a candidate for merging with I.
         if (PairableInsts.empty() ||
-            PairableInsts[PairableInsts.size()-1] != I) {
-          PairableInsts.push_back(I);
+            PairableInsts[PairableInsts.size() - 1] != &*I) {
+          PairableInsts.push_back(&*I);
         }
 
-        CandidatePairs[I].push_back(J);
+        CandidatePairs[&*I].push_back(&*J);
         ++TotalPairs;
         if (TTI)
-          CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
-                                                            CostSavings));
+          CandidatePairCostSavings.insert(
+              ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
 
         if (FixedOrder == 1)
-          FixedOrderPairs.insert(ValuePair(I, J));
+          FixedOrderPairs.insert(ValuePair(&*I, &*J));
         else if (FixedOrder == -1)
-          FixedOrderPairs.insert(ValuePair(J, I));
+          FixedOrderPairs.insert(ValuePair(&*J, &*I));
 
         // The next call to this function must start after the last instruction
         // selected during this invocation.
@@ -1468,14 +1480,16 @@ namespace {
       BasicBlock::iterator E = BB.end(),
         EL = BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
       for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
-        if (IsInPair.find(I) == IsInPair.end()) continue;
+        if (IsInPair.find(&*I) == IsInPair.end())
+          continue;
 
         DenseSet<Value *> Users;
         AliasSetTracker WriteSet(*AA);
-        if (I->mayWriteToMemory()) WriteSet.add(I);
+        if (I->mayWriteToMemory())
+          WriteSet.add(&*I);
 
         for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
-          (void) trackUsesOfI(Users, WriteSet, I, J);
+          (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
 
           if (J == EL)
             break;
@@ -1484,7 +1498,7 @@ namespace {
           for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
                U != E; ++U) {
             if (IsInPair.find(*U) == IsInPair.end()) continue;
-            PairableInstUsers.insert(ValuePair(I, *U));
+            PairableInstUsers.insert(ValuePair(&*I, *U));
           }
 
           if (I == EL)
@@ -2806,55 +2820,51 @@ namespace {
                      Instruction *J, Instruction *K,
                      Instruction *&InsertionPt,
                      Instruction *&K1, Instruction *&K2) {
-      if (isa<StoreInst>(I)) {
-        AA->replaceWithNewValue(I, K);
-        AA->replaceWithNewValue(J, K);
-      } else {
-        Type *IType = I->getType();
-        Type *JType = J->getType();
+      if (isa<StoreInst>(I))
+        return;
 
-        VectorType *VType = getVecTypeForPair(IType, JType);
-        unsigned numElem = VType->getNumElements();
+      Type *IType = I->getType();
+      Type *JType = J->getType();
 
-        unsigned numElemI = getNumScalarElements(IType);
-        unsigned numElemJ = getNumScalarElements(JType);
+      VectorType *VType = getVecTypeForPair(IType, JType);
+      unsigned numElem = VType->getNumElements();
 
-        if (IType->isVectorTy()) {
-          std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
-          for (unsigned v = 0; v < numElemI; ++v) {
-            Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-            Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
-          }
+      unsigned numElemI = getNumScalarElements(IType);
+      unsigned numElemJ = getNumScalarElements(JType);
 
-          K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                     ConstantVector::get( Mask1),
-                                     getReplacementName(K, false, 1));
-        } else {
-          Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
-          K1 = ExtractElementInst::Create(K, CV0,
-                                          getReplacementName(K, false, 1));
+      if (IType->isVectorTy()) {
+        std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
+        for (unsigned v = 0; v < numElemI; ++v) {
+          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
         }
 
-        if (JType->isVectorTy()) {
-          std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
-          for (unsigned v = 0; v < numElemJ; ++v) {
-            Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-            Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
-          }
+        K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get(Mask1),
+                                   getReplacementName(K, false, 1));
+      } else {
+        Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+        K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
+      }
 
-          K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                     ConstantVector::get( Mask2),
-                                     getReplacementName(K, false, 2));
-        } else {
-          Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
-          K2 = ExtractElementInst::Create(K, CV1,
-                                          getReplacementName(K, false, 2));
+      if (JType->isVectorTy()) {
+        std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
+        for (unsigned v = 0; v < numElemJ; ++v) {
+          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
         }
 
-        K1->insertAfter(K);
-        K2->insertAfter(K1);
-        InsertionPt = K2;
+        K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get(Mask2),
+                                   getReplacementName(K, false, 2));
+      } else {
+        Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
+        K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
       }
+
+      K1->insertAfter(K);
+      K2->insertAfter(K1);
+      InsertionPt = K2;
     }
 
     // Move all uses of the function I (including pairing-induced uses) after J.
@@ -2869,7 +2879,7 @@ namespace {
       if (I->mayWriteToMemory()) WriteSet.add(I);
 
       for (; cast<Instruction>(L) != J; ++L)
-        (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
+        (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
 
       assert(cast<Instruction>(L) == J &&
              "Tracking has not proceeded far enough to check for dependencies");
@@ -2891,9 +2901,9 @@ namespace {
       if (I->mayWriteToMemory()) WriteSet.add(I);
 
       for (; cast<Instruction>(L) != J;) {
-        if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
+        if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
           // Move this instruction
-          Instruction *InstToMove = L; ++L;
+          Instruction *InstToMove = &*L++;
 
           DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
                           " to after " << *InsertionPt << "\n");
@@ -2924,11 +2934,11 @@ namespace {
       // Note: We cannot end the loop when we reach J because J could be moved
       // farther down the use chain by another instruction pairing. Also, J
      // could be before I if this is an inverted input.
-      for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
-        if (trackUsesOfI(Users, WriteSet, I, L)) {
+      for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
+        if (trackUsesOfI(Users, WriteSet, I, &*L)) {
           if (L->mayReadFromMemory()) {
-            LoadMoveSet[L].push_back(I);
-            LoadMoveSetPairs.insert(ValuePair(L, I));
+            LoadMoveSet[&*L].push_back(I);
+            LoadMoveSetPairs.insert(ValuePair(&*L, I));
           }
         }
       }
@@ -2991,7 +3001,7 @@ namespace {
       DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
 
       for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
-        DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+        DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
         if (P == ChosenPairs.end()) {
           ++PI;
           continue;
@@ -3116,12 +3126,9 @@ namespace {
       } else if (!isa<StoreInst>(K))
         K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
 
-      unsigned KnownIDs[] = {
-        LLVMContext::MD_tbaa,
-        LLVMContext::MD_alias_scope,
-        LLVMContext::MD_noalias,
-        LLVMContext::MD_fpmath
-      };
+      unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
                             LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
+                             LLVMContext::MD_invariant_group};
       combineMetadata(K, H, KnownIDs);
       K->intersectOptionalDataWith(H);
 
@@ -3145,8 +3152,6 @@ namespace {
       if (!isa<StoreInst>(I)) {
         L->replaceAllUsesWith(K1);
         H->replaceAllUsesWith(K2);
-        AA->replaceWithNewValue(L, K1);
-        AA->replaceWithNewValue(H, K2);
       }
 
       // Instructions that may read from memory may be in the load move set.
@@ -3197,10 +3202,14 @@ namespace {
 char BBVectorize::ID = 0;
 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
 
 BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
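
The BBVectorize hunks above follow the legacy-pass-manager migration pattern that recurs throughout this commit: concrete analyses (AliasAnalysis, ScalarEvolution) are replaced by wrapper passes that are queried for their result objects. A condensed sketch of that pattern, using a hypothetical pass; the wrapper passes and accessors are the ones named in the diff, everything else is illustrative.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
// Hypothetical pass illustrating the migration pattern; not part of the patch.
struct ExamplePass : public FunctionPass {
  static char ID;
  ExamplePass() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    // The wrapper passes own the analysis results; unwrap them on use.
    AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    (void)AA;
    (void)SE;
    return false; // Analysis-only: the IR is left unchanged.
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Require the wrappers, not the (removed) concrete analysis passes.
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.setPreservesAll();
  }
};
}

char ExamplePass::ID = 0;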
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 69ca2688c810..a627dd665179 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -48,7 +48,6 @@
 #include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
@@ -58,10 +57,13 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
@@ -99,6 +101,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include <algorithm>
+#include <functional>
 #include <map>
 #include <tuple>
 
@@ -123,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                              "trip count that is smaller than this "
                              "value."));
 
+static cl::opt<bool> MaximizeBandwidth(
+    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+    cl::desc("Maximize bandwidth when selecting vectorization factor which "
+             "will be determined by the smallest type in loop."));
+
 /// This enables versioning on the strides of symbolically striding memory
 /// accesses in code like the following.
 ///   for (i = 0; i < N; ++i)
@@ -136,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
 ///   ...
 static cl::opt<bool> EnableMemAccessVersioning(
     "enable-mem-access-versioning", cl::init(true), cl::Hidden,
-    cl::desc("Enable symblic stride memory access versioning"));
+    cl::desc("Enable symbolic stride memory access versioning"));
 
 static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
@@ -214,12 +222,27 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
     cl::desc("The maximum interleave count to use when interleaving a scalar "
              "reduction in a nested loop."));
 
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum allowed number of runtime memory checks with a "
+             "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed with a "
+             "vectorize(enable) pragma"));
+
 namespace {
 
 // Forward declarations.
+class LoopVectorizeHints;
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
-class LoopVectorizeHints;
+class LoopVectorizationRequirements;
 
 /// \brief This modifies LoopAccessReport to initialize message with
 /// loop-vectorizer-specific part.
@@ -245,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
+/// A helper function that returns GEP instruction and knows to skip a
+/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
+/// pointee types of the 'bitcast' have the same size.
+/// For example:
+///   bitcast double** %var to i64* - can be skipped
+///   bitcast double** %var to i8*  - can not
+static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
+
+  if (isa<GetElementPtrInst>(Ptr))
+    return cast<GetElementPtrInst>(Ptr);
+
+  if (isa<BitCastInst>(Ptr) &&
+      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
+    Type *BitcastTy = Ptr->getType();
+    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
+    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
+      return nullptr;
+    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
+    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
+    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
+    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
+      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
+  }
+  return nullptr;
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -261,25 +310,30 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {
 /// and reduction variables that were found to a given vectorization factor.
 class InnerLoopVectorizer {
 public:
-  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, const TargetLibraryInfo *TLI,
+  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+                      LoopInfo *LI, DominatorTree *DT,
+                      const TargetLibraryInfo *TLI,
                       const TargetTransformInfo *TTI, unsigned VecWidth,
                       unsigned UnrollFactor)
-      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
-        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
+      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
-        Legal(nullptr), AddedSafetyChecks(false) {}
+        TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
+        AddedSafetyChecks(false) {}
 
   // Perform the actual loop widening (vectorization).
-  void vectorize(LoopVectorizationLegality *L) {
+  // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
+  // can be validly truncated to. The cost model has assumed this truncation
+  // will happen when vectorizing.
+  void vectorize(LoopVectorizationLegality *L,
+                 MapVector<Instruction*,uint64_t> MinimumBitWidths) {
+    MinBWs = MinimumBitWidths;
     Legal = L;
     // Create a new empty loop. Unlink the old loop and connect the new one.
     createEmptyLoop();
     // Widen each instruction in the old loop to a new one in the new loop.
     // Use the Legality module to find the induction and reduction variables.
     vectorizeLoop();
-    // Register the new loop and update the analysis passes.
-    updateAnalysis();
   }
 
   // Return true if any runtime check is added.
@@ -302,14 +356,11 @@ protected:
   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts>
     EdgeMaskCache;
 
-  /// \brief Add checks for strides that were assumed to be 1.
-  ///
-  /// Returns the last check instruction and the first check instruction in the
-  /// pair as (first, last).
-  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
-
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop();
+  /// Create a new induction variable inside L.
+  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
+                                   Value *Step, Instruction *DL);
   /// Copy and widen the instructions from the old loop.
   virtual void vectorizeLoop();
@@ -319,6 +370,9 @@ protected:
   /// See PR14725.
   void fixLCSSAPHIs();
 
+  /// Shrinks vector element sizes based on information in "MinBWs".
+  void truncateToMinimalBitwidths();
+
   /// A helper function that computes the predicate of the block BB, assuming
   /// that the header block of the loop is set to True. It returns the *entry*
   /// mask for the block BB.
@@ -329,7 +383,7 @@ protected:
 
   /// A helper function to vectorize a single BB within the innermost loop.
   void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
-
+
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -374,6 +428,23 @@ protected:
   /// Generate a shuffle sequence that will reverse the vector Vec.
   virtual Value *reverseVector(Value *Vec);
 
+  /// Returns (and creates if needed) the original loop trip count.
+  Value *getOrCreateTripCount(Loop *NewLoop);
+
+  /// Returns (and creates if needed) the trip count of the widened loop.
+  Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+  /// Emit a bypass check to see if the trip count would overflow, or we
+  /// wouldn't have enough iterations to execute one vector loop.
+  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+  /// Emit a bypass check to see if the vector trip count is nonzero.
+  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
+  /// Emit a bypass check to see if all of the SCEV assumptions we've
+  /// had to make are correct.
+  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+  /// Emit bypass checks to check any memory assumptions we may have made.
+  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
   /// This is a helper class that holds the vectorizer state. It maps scalar
   /// instructions to vector instructions. When the code is 'unrolled' then
   /// then a single scalar value is mapped to multiple vector parts. The parts
@@ -416,8 +487,10 @@ protected:
 
   /// The original loop.
   Loop *OrigLoop;
-  /// Scev analysis to use.
-  ScalarEvolution *SE;
+  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+  /// dynamic knowledge to simplify SCEV expressions and converts them to a
+  /// more usable form.
+  PredicatedScalarEvolution &PSE;
   /// Loop Info.
   LoopInfo *LI;
   /// Dominator Tree.
@@ -462,12 +535,21 @@ protected:
   PHINode *Induction;
   /// The induction variable of the old basic block.
   PHINode *OldInduction;
-  /// Holds the extended (to the widest induction type) start index.
-  Value *ExtendedIdx;
   /// Maps scalars to widened vectors.
   ValueMap WidenMap;
+  /// Store instructions that should be predicated, as a pair
+  ///   <StoreInst, Predicate>
+  SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
   EdgeMaskCache MaskCache;
-
+  /// Trip count of the original loop.
+  Value *TripCount;
+  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+  Value *VectorTripCount;
+
+  /// Map of scalar integer values to the smallest bitwidth they can be legally
+  /// represented as. The vector equivalents of these values should be truncated
+  /// to this type.
+  MapVector<Instruction*,uint64_t> MinBWs;
   LoopVectorizationLegality *Legal;
 
   // Record whether runtime check is added.
@@ -476,10 +558,11 @@ protected:
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
-  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, const TargetLibraryInfo *TLI,
+  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+                    LoopInfo *LI, DominatorTree *DT,
+                    const TargetLibraryInfo *TLI,
                     const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
 
 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -551,7 +634,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
     if (Kind != LLVMContext::MD_tbaa &&
         Kind != LLVMContext::MD_alias_scope &&
         Kind != LLVMContext::MD_noalias &&
-        Kind != LLVMContext::MD_fpmath)
+        Kind != LLVMContext::MD_fpmath &&
+        Kind != LLVMContext::MD_nontemporal)
       continue;
 
     To->setMetadata(Kind, M.second);
@@ -559,7 +643,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
 }
 
 /// \brief Propagate known metadata from one instruction to a vector of others.
-static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
+static void propagateMetadata(SmallVectorImpl<Value *> &To,
+                              const Instruction *From) {
   for (Value *V : To)
     if (Instruction *I = dyn_cast<Instruction>(V))
       propagateMetadata(I, From);
@@ -699,8 +784,9 @@ private:
 /// between the member and the group in a map.
 class InterleavedAccessInfo {
 public:
-  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT)
-      : SE(SE), TheLoop(L), DT(DT) {}
+  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
+                        DominatorTree *DT)
+      : PSE(PSE), TheLoop(L), DT(DT) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -730,7 +816,11 @@ public:
   }
 
 private:
-  ScalarEvolution *SE;
+  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
+  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
+  /// The interleaved access analysis can also add new predicates (for example
+  /// by versioning strides of pointers).
+  PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
 
@@ -778,6 +868,304 @@ private:
                                 const ValueToValueMap &Strides);
 };
 
+/// Utility class for getting and setting loop vectorizer hints in the form
+/// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables)
+/// and can, upon request, write them back as metadata on the loop. It will
+/// initially scan the loop for existing metadata, and will update the local
+/// values based on information in the loop.
+/// We cannot write all values to metadata, as the mere presence of some info,
+/// for example 'force', means a decision has been made. So, we need to be
+/// careful NOT to add them if the user hasn't specifically asked so.
+class LoopVectorizeHints {
+  enum HintKind {
+    HK_WIDTH,
+    HK_UNROLL,
+    HK_FORCE
+  };
+
+  /// Hint - associates name and validation with the hint value.
+  struct Hint {
+    const char * Name;
+    unsigned Value; // This may have to change for non-numeric values.
+    HintKind Kind;
+
+    Hint(const char * Name, unsigned Value, HintKind Kind)
+      : Name(Name), Value(Value), Kind(Kind) { }
+
+    bool validate(unsigned Val) {
+      switch (Kind) {
+      case HK_WIDTH:
+        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+      case HK_UNROLL:
+        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+      case HK_FORCE:
+        return (Val <= 1);
+      }
+      return false;
+    }
+  };
+
+  /// Vectorization width.
+  Hint Width;
+  /// Vectorization interleave factor.
+  Hint Interleave;
+  /// Vectorization forced
+  Hint Force;
+
+  /// Return the loop metadata prefix.
+  static StringRef Prefix() { return "llvm.loop."; }
+
+public:
+  enum ForceKind {
+    FK_Undefined = -1, ///< Not selected.
+    FK_Disabled = 0,   ///< Forcing disabled.
+    FK_Enabled = 1,    ///< Forcing enabled.
+  };
+
+  LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
+      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
+              HK_WIDTH),
+        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+        Force("vectorize.enable", FK_Undefined, HK_FORCE),
+        TheLoop(L) {
+    // Populate values with existing loop metadata.
+    getHintsFromMetadata();
+
+    // force-vector-interleave overrides DisableInterleaving.
+    if (VectorizerParams::isInterleaveForced())
+      Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+          << "LV: Interleaving disabled by the pass manager\n");
+  }
+
+  /// Mark the loop L as already vectorized by setting the width to 1.
+  void setAlreadyVectorized() {
+    Width.Value = Interleave.Value = 1;
+    Hint Hints[] = {Width, Interleave};
+    writeHintsToMetadata(Hints);
+  }
+
+  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
+    if (getForce() == LoopVectorizeHints::FK_Disabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(),
+                                     vectorizeAnalysisPassName(), *F,
+                                     L->getStartLoc(), emitRemark());
+      return false;
+    }
+
+    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(),
+                                     vectorizeAnalysisPassName(), *F,
+                                     L->getStartLoc(), emitRemark());
+      return false;
+    }
+
+    if (getWidth() == 1 && getInterleave() == 1) {
+      // FIXME: Add a separate metadata to indicate when the loop has already
+      // been vectorized instead of setting width and count to 1.
+      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+      // FIXME: Add interleave.disable metadata. This will allow
+      // vectorize.disable to be used without disabling the pass and errors
+      // to differentiate between disabled vectorization and a width of 1.
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
+          "loop not vectorized: vectorization and interleaving are explicitly "
+          "disabled, or vectorize width and interleave count are both set to "
+          "1");
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Dumps all the hint information.
+  std::string emitRemark() const {
+    VectorizationReport R;
+    if (Force.Value == LoopVectorizeHints::FK_Disabled)
+      R << "vectorization is explicitly disabled";
+    else {
+      R << "use -Rpass-analysis=loop-vectorize for more info";
+      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+        R << " (Force=true";
+        if (Width.Value != 0)
+          R << ", Vector Width=" << Width.Value;
+        if (Interleave.Value != 0)
+          R << ", Interleave Count=" << Interleave.Value;
+        R << ")";
+      }
+    }
+
+    return R.str();
+  }
+
+  unsigned getWidth() const { return Width.Value; }
+  unsigned getInterleave() const { return Interleave.Value; }
+  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+  const char *vectorizeAnalysisPassName() const {
+    // If hints are provided that don't disable vectorization use the
+    // AlwaysPrint pass name to force the frontend to print the diagnostic.
+    if (getWidth() == 1)
+      return LV_NAME;
+    if (getForce() == LoopVectorizeHints::FK_Disabled)
+      return LV_NAME;
+    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+      return LV_NAME;
+    return DiagnosticInfo::AlwaysPrint;
+  }
+
+  bool allowReordering() const {
+    // When enabling loop hints are provided we allow the vectorizer to change
+    // the order of operations that is given by the scalar loop. This is not
+    // enabled by default because can be unsafe or inefficient. For example,
+    // reordering floating-point operations will change the way round-off
+    // error accumulates in the loop.
+    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+  }
+
+private:
+  /// Find hints specified in the loop metadata and update local values.
+  void getHintsFromMetadata() {
+    MDNode *LoopID = TheLoop->getLoopID();
+    if (!LoopID)
+      return;
+
+    // First operand should refer to the loop id itself.
+    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+      const MDString *S = nullptr;
+      SmallVector<Metadata *, 4> Args;
+
+      // The expected hint is either a MDString or a MDNode with the first
+      // operand a MDString.
+      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+        if (!MD || MD->getNumOperands() == 0)
+          continue;
+        S = dyn_cast<MDString>(MD->getOperand(0));
+        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+          Args.push_back(MD->getOperand(i));
+      } else {
+        S = dyn_cast<MDString>(LoopID->getOperand(i));
+        assert(Args.size() == 0 && "too many arguments for MDString");
+      }
+
+      if (!S)
+        continue;
+
+      // Check if the hint starts with the loop metadata prefix.
+      StringRef Name = S->getString();
+      if (Args.size() == 1)
+        setHint(Name, Args[0]);
+    }
+  }
+
+  /// Checks string hint with one operand and set value if valid.
+  void setHint(StringRef Name, Metadata *Arg) {
+    if (!Name.startswith(Prefix()))
+      return;
+    Name = Name.substr(Prefix().size(), StringRef::npos);
+
+    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+    if (!C) return;
+    unsigned Val = C->getZExtValue();
+
+    Hint *Hints[] = {&Width, &Interleave, &Force};
+    for (auto H : Hints) {
+      if (Name == H->Name) {
+        if (H->validate(Val))
+          H->Value = Val;
+        else
+          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+        break;
+      }
+    }
+  }
+
+  /// Create a new hint from name / value pair.
+  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
+    LLVMContext &Context = TheLoop->getHeader()->getContext();
+    Metadata *MDs[] = {MDString::get(Context, Name),
+                       ConstantAsMetadata::get(
+                           ConstantInt::get(Type::getInt32Ty(Context), V))};
+    return MDNode::get(Context, MDs);
+  }
+
+  /// Matches metadata with hint name.
+  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
+    MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
+    if (!Name)
+      return false;
+
+    for (auto H : HintTypes)
+      if (Name->getString().endswith(H.Name))
+        return true;
+    return false;
+  }
+
+  /// Sets current hints into loop metadata, keeping other values intact.
+  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+    if (HintTypes.size() == 0)
+      return;
+
+    // Reserve the first element to LoopID (see below).
+    SmallVector<Metadata *, 4> MDs(1);
+    // If the loop already has metadata, then ignore the existing operands.
+    MDNode *LoopID = TheLoop->getLoopID();
+    if (LoopID) {
+      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+        // If node in update list, ignore old value.
+        if (!matchesHintMetadataName(Node, HintTypes))
+          MDs.push_back(Node);
+      }
+    }
+
+    // Now, add the missing hints.
+    for (auto H : HintTypes)
+      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+    // Replace current metadata node with new one.
+    LLVMContext &Context = TheLoop->getHeader()->getContext();
+    MDNode *NewLoopID = MDNode::get(Context, MDs);
+    // Set operand 0 to refer to the loop id itself.
+    NewLoopID->replaceOperandWith(0, NewLoopID);
+
+    TheLoop->setLoopID(NewLoopID);
+  }
+
+  /// The loop these hints belong to.
+  const Loop *TheLoop;
+};
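
For reference, the metadata these hints round-trip through looks like this in textual IR. This is a sketch assuming a width-4, force-enabled loop; the operand spellings follow Prefix() and the hint names above, and the value names are illustrative.

  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
  ...
  !0 = distinct !{!0, !1, !2}
  !1 = !{!"llvm.loop.vectorize.width", i32 4}
  !2 = !{!"llvm.loop.vectorize.enable", i1 true}

setAlreadyVectorized() rewrites the same node with width and interleave count of 1, which is how the pass records that a loop has been processed.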
+
+static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop,
+                             const LoopVectorizeHints &Hints,
+                             const LoopAccessReport &Message) {
+  const char *Name = Hints.vectorizeAnalysisPassName();
+  LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name);
+}
+
+static void emitMissedWarning(Function *F, Loop *L,
+                              const LoopVectorizeHints &LH) {
+  emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+                               LH.emitRemark());
+
+  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
+    if (LH.getWidth() != 1)
+      emitLoopVectorizeWarning(
+          F->getContext(), *F, L->getStartLoc(),
+          "failed explicitly specified loop vectorization");
+    else if (LH.getInterleave() != 1)
+      emitLoopInterleaveWarning(
+          F->getContext(), *F, L->getStartLoc(),
+          "failed explicitly specified loop interleaving");
+  }
+}
+
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -793,87 +1181,17 @@ private:
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
-  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
-                            TargetLibraryInfo *TLI, AliasAnalysis *AA,
-                            Function *F, const TargetTransformInfo *TTI,
-                            LoopAccessAnalysis *LAA)
-      : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
-
-  /// This enum represents the kinds of inductions that we support.
-  enum InductionKind {
-    IK_NoInduction,  ///< Not an induction variable.
-    IK_IntInduction, ///< Integer induction variable. Step = C.
-    IK_PtrInduction  ///< Pointer induction var. Step = C / sizeof(elem).
-  };
-
-  /// A struct for saving information about induction variables.
-  struct InductionInfo {
-    InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
-        : StartValue(Start), IK(K), StepValue(Step) {
-      assert(IK != IK_NoInduction && "Not an induction");
-      assert(StartValue && "StartValue is null");
-      assert(StepValue && !StepValue->isZero() && "StepValue is zero");
-      assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
-             "StartValue is not a pointer for pointer induction");
-      assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
-             "StartValue is not an integer for integer induction");
-      assert(StepValue->getType()->isIntegerTy() &&
-             "StepValue is not an integer");
-    }
-    InductionInfo()
-        : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}
-
-    /// Get the consecutive direction. Returns:
-    ///   0 - unknown or non-consecutive.
-    ///   1 - consecutive and increasing.
-    ///  -1 - consecutive and decreasing.
-    int getConsecutiveDirection() const {
-      if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
-        return StepValue->getSExtValue();
-      return 0;
-    }
-
-    /// Compute the transformed value of Index at offset StartValue using step
-    /// StepValue.
-    /// For integer induction, returns StartValue + Index * StepValue.
-    /// For pointer induction, returns StartValue[Index * StepValue].
-    /// FIXME: The newly created binary instructions should contain nsw/nuw
-    /// flags, which can be found from the original scalar operations.
-    Value *transform(IRBuilder<> &B, Value *Index) const {
-      switch (IK) {
-      case IK_IntInduction:
-        assert(Index->getType() == StartValue->getType() &&
-               "Index type does not match StartValue type");
-        if (StepValue->isMinusOne())
-          return B.CreateSub(StartValue, Index);
-        if (!StepValue->isOne())
-          Index = B.CreateMul(Index, StepValue);
-        return B.CreateAdd(StartValue, Index);
-
-      case IK_PtrInduction:
-        assert(Index->getType() == StepValue->getType() &&
-               "Index type does not match StepValue type");
-        if (StepValue->isMinusOne())
-          Index = B.CreateNeg(Index);
-        else if (!StepValue->isOne())
-          Index = B.CreateMul(Index, StepValue);
-        return B.CreateGEP(nullptr, StartValue, Index);
-
-      case IK_NoInduction:
-        return nullptr;
-      }
-      llvm_unreachable("invalid enum");
-    }
-
-    /// Start value.
-    TrackingVH<Value> StartValue;
-    /// Induction kind.
-    InductionKind IK;
-    /// Step value.
-    ConstantInt *StepValue;
-  };
+  LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
+                            DominatorTree *DT, TargetLibraryInfo *TLI,
+                            AliasAnalysis *AA, Function *F,
+                            const TargetTransformInfo *TTI,
+                            LoopAccessAnalysis *LAA,
+                            LoopVectorizationRequirements *R,
+                            const LoopVectorizeHints *H)
+      : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
+        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+        Requirements(R), Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -881,7 +1199,7 @@ public:
 
   /// InductionList saves induction variables and maps them to the
   /// induction descriptor.
-  typedef MapVector<PHINode*, InductionInfo> InductionList;
+  typedef MapVector<PHINode*, InductionDescriptor> InductionList;
 
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
@@ -903,6 +1221,9 @@ public:
   /// Returns True if V is an induction variable in this loop.
   bool isInductionVariable(const Value *V);
 
+  /// Returns True if PN is a reduction variable in this loop.
+  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
+
   /// Return true if the block BB needs to be predicated in order for the loop
   /// to be vectorized.
   bool blockNeedsPredication(BasicBlock *BB);
@@ -954,12 +1275,12 @@ public:
   /// Returns true if the target machine supports masked store operation
   /// for the given \p DataType and kind of access to \p Ptr.
   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
-    return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
+    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
   }
   /// Returns true if the target machine supports masked load operation
   /// for the given \p DataType and kind of access to \p Ptr.
   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
-    return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
+    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
   }
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
@@ -999,10 +1320,6 @@ private:
   /// and we know that we can read from them without segfault.
   bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
 
-  /// Returns the induction kind of Phi and record the step. This function may
-  /// return NoInduction if the PHI is not an induction variable.
-  InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);
-
   /// \brief Collect memory access with loop invariant strides.
/// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop @@ -1013,16 +1330,20 @@ private: /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } unsigned NumPredStores; /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. + /// Applies dynamic knowledge to simplify SCEV expressions in the context + /// of existing SCEV assumptions. The analysis will also add a minimal set + /// of new predicates if this is required to enable vectorization and + /// unrolling. + PredicatedScalarEvolution &PSE; /// Target Library Info. TargetLibraryInfo *TLI; /// Parent function @@ -1065,12 +1386,18 @@ private: /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; + /// Vectorization requirements that will go through late-evaluation. + LoopVectorizationRequirements *Requirements; + + /// Used to emit an analysis of any legality issues. + const LoopVectorizeHints *Hints; + ValueToValueMap Strides; SmallPtrSet<Value *, 8> StrideSet; /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic - SmallPtrSet<const Instruction*, 8> MaskedOp; + SmallPtrSet<const Instruction *, 8> MaskedOp; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1082,15 +1409,14 @@ private: /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, - LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, + LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC, - const Function *F, const LoopVectorizeHints *Hints) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), - TheFunction(F), Hints(Hints) { - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - } + const TargetLibraryInfo *TLI, DemandedBits *DB, + AssumptionCache *AC, const Function *F, + const LoopVectorizeHints *Hints) + : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + AC(AC), TheFunction(F), Hints(Hints) {} /// Information about vectorization costs struct VectorizationFactor { @@ -1103,10 +1429,10 @@ public: /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the smallest and widest types in the code + /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - unsigned getWidestType(); + std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1133,8 +1459,13 @@ public: unsigned NumInstructions; }; - /// \return information about the register usage of the loop. 
- RegisterUsage calculateRegisterUsage(); + /// \return Returns information about the register usages of the loop for the + /// given vectorization factors. + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs); + + /// Collect values we want to ignore in the cost model. + void collectValuesToIgnore(); private: /// Returns the expected execution cost. The unit of the cost does @@ -1155,17 +1486,20 @@ private: /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } - /// Values used only by @llvm.assume calls. - SmallPtrSet<const Value *, 32> EphValues; +public: + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + MapVector<Instruction*,uint64_t> MinBWs; /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// Predicated scalar evolution analysis. + PredicatedScalarEvolution &PSE; /// Loop Info analysis. LoopInfo *LI; /// Vectorization legality. @@ -1174,247 +1508,78 @@ private: const TargetTransformInfo &TTI; /// Target Library Info. const TargetLibraryInfo *TLI; + /// Demanded bits analysis. + DemandedBits *DB; + /// Assumption cache. + AssumptionCache *AC; const Function *TheFunction; - // Loop Vectorize Hint. + /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; + /// Values to ignore in the cost model. + SmallPtrSet<const Value *, 16> ValuesToIgnore; + /// Values to ignore in the cost model when VF > 1. + SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { - enum HintKind { - HK_WIDTH, - HK_UNROLL, - HK_FORCE - }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char * Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char * Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) { } - - bool validate(unsigned Val) { - switch (Kind) { - case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; - case HK_FORCE: - return (Val <= 1); - } - return false; - } - }; - - /// Vectorization width. - Hint Width; - /// Vectorization interleave factor. - Hint Interleave; - /// Vectorization forced - Hint Force; - - /// Return the loop metadata prefix. 
- static StringRef Prefix() { return "llvm.loop."; } - +/// \brief This holds vectorization requirements that must be verified late in +/// the process. The requirements are set by the legality analysis and the cost +/// model. Once vectorization has been determined to be possible and profitable, +/// the requirements can be verified by looking for metadata or compiler options. +/// For example, some loops require FP commutativity which is only allowed if +/// vectorization is explicitly specified or if the fast-math compiler option +/// has been provided. +/// Late evaluation of these requirements allows helpful diagnostics to be +/// composed that tell the user what needs to be done to vectorize the loop. For +/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late +/// evaluation should be used only when diagnostics can be generated that can be +/// followed by a non-expert user. +class LoopVectorizationRequirements { public: - enum ForceKind { - FK_Undefined = -1, ///< Not selected. - FK_Disabled = 0, ///< Forcing disabled. - FK_Enabled = 1, ///< Forcing enabled. - }; - - LoopVectorizeHints(const Loop *L, bool DisableInterleaving) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, - HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), - Force("vectorize.enable", FK_Undefined, HK_FORCE), - TheLoop(L) { - // Populate values with existing loop metadata. - getHintsFromMetadata(); - - // force-vector-interleave overrides DisableInterleaving. - if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; - - DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() - << "LV: Interleaving disabled by the pass manager\n"); - } - - /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized() { - Width.Value = Interleave.Value = 1; - Hint Hints[] = {Width, Interleave}; - writeHintsToMetadata(Hints); - } - - /// Dumps all the hint information. - std::string emitRemark() const { - VectorizationReport R; - if (Force.Value == LoopVectorizeHints::FK_Disabled) - R << "vectorization is explicitly disabled"; - else { - R << "use -Rpass-analysis=loop-vectorize for more info"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=true"; - if (Width.Value != 0) - R << ", Vector Width=" << Width.Value; - if (Interleave.Value != 0) - R << ", Interleave Count=" << Interleave.Value; - R << ")"; - } - } - - return R.str(); - } - - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector<Metadata *, 4> Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString.
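// The node shapes described just above are built with the same calls used by
// the createHintMetadata helper further below. A compilable sketch
// (buildWidthHint is a hypothetical name; the node would be referenced from
// the loop's !llvm.loop ID):
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
// Builds !{!"llvm.loop.vectorize.width", i32 4}, the hint a frontend emits
// for #pragma clang loop vectorize_width(4).
static llvm::MDNode *buildWidthHint(llvm::LLVMContext &Context) {
  llvm::Metadata *MDs[] = {
      llvm::MDString::get(Context, "llvm.loop.vectorize.width"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Context), 4))};
  return llvm::MDNode::get(Context, MDs);
}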
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast<MDString>(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast<MDString>(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); + LoopVectorizationRequirements() + : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} + + void addUnsafeAlgebraInst(Instruction *I) { + // First unsafe algebra instruction. + if (!UnsafeAlgebraInst) + UnsafeAlgebraInst = I; + } + + void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } + + bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { + const char *Name = Hints.vectorizeAnalysisPassName(); + bool Failed = false; + if (UnsafeAlgebraInst && !Hints.allowReordering()) { + emitOptimizationRemarkAnalysisFPCommute( + F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), + VectorizationReport() << "cannot prove it is safe to reorder " + "floating-point operations"); + Failed = true; } - } - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); - if (!C) return; - unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); - break; - } + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + emitOptimizationRemarkAnalysisAliasing( + F->getContext(), Name, *F, L->getStartLoc(), + VectorizationReport() + << "cannot prove it is safe to reorder memory operations"); + DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Failed = true; } - } - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); + return Failed; } - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { - MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); - if (!Name) - return false; - - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; - } - - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { - if (HintTypes.size() == 0) - return; - - // Reserve the first element to LoopID (see below). - SmallVector<Metadata *, 4> MDs(1); - // If the loop already has metadata, then ignore the existing operands. 
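// Why doesNotMeet() above refuses to reorder FP math without fast-math or an
// explicit pragma: vector reductions reassociate additions, and FP addition
// is not associative. A self-contained illustration (plain C++, no LLVM):
#include <cstdio>
int main() {
  float a = 1e20f, b = -1e20f, c = 1.0f;
  float inOrder = (a + b) + c;  // 1.0f: the scalar loop's evaluation order
  float reassoc = a + (b + c);  // 0.0f: c is absorbed into b before a cancels
  std::printf("%g vs %g\n", inOrder, reassoc);
  return 0;
}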
- MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } - - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - TheLoop->setLoopID(NewLoopID); - } - - /// The loop these hints belong to. - const Loop *TheLoop; +private: + unsigned NumRuntimePointerChecks; + Instruction *UnsafeAlgebraInst; }; -static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH) { - emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), LH.emitRemark()); - - if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { - if (LH.getWidth() != 1) - emitLoopVectorizeWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop vectorization"); - else if (LH.getInterleave() != 1) - emitLoopInterleaveWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop interleaving"); - } -} - static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { if (L.empty()) return V.push_back(&L); @@ -1441,6 +1606,7 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + DemandedBits *DB; AliasAnalysis *AA; AssumptionCache *AC; LoopAccessAnalysis *LAA; @@ -1450,16 +1616,17 @@ struct LoopVectorize : public FunctionPass { BlockFrequency ColdEntryFreq; bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - BFI = &getAnalysis<BlockFrequencyInfo>(); + BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); LAA = &getAnalysis<LoopAccessAnalysis>(); + DB = &getAnalysis<DemandedBits>(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1562,26 +1729,8 @@ struct LoopVectorize : public FunctionPass { // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. 
- if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { - DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { - DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { - DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized: vector width and interleave count are " - "explicitly set to 1"); + if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { + DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -1595,15 +1744,19 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "vectorization is not beneficial and is not explicitly forced"); + emitAnalysisDiag(F, L, Hints, VectorizationReport() + << "vectorization is not beneficial " + "and is not explicitly forced"); return false; } } + PredicatedScalarEvolution PSE(*SE); + // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA); + LoopVectorizationRequirements Requirements; + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, + &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); @@ -1611,16 +1764,18 @@ struct LoopVectorize : public FunctionPass { } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F, + &Hints); + CM.collectValuesToIgnore(); // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && - F->hasFnAttribute(Attribute::OptimizeForSize); + F->optForSize(); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we - // always have a canonical loop here because we think we *can* vectoriez. + // always have a canonical loop here because we think we *can* vectorize. // FIXME: This is hidden behind a flag due to pervasive problems with // exactly what block frequency models. if (LoopVectorizeWithBlockFrequency) { @@ -1630,16 +1785,17 @@ struct LoopVectorize : public FunctionPass { OptForSize = true; } - // Check the function attributes to see if implicit floats are allowed.a + // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer // vector instructions? 
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" " attribute is used.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized due to NoImplicitFloat attribute"); + emitAnalysisDiag( + F, L, Hints, + VectorizationReport() + << "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints); return false; } @@ -1651,32 +1807,86 @@ struct LoopVectorize : public FunctionPass { // Select the interleave count. unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); - DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + // Get user interleave count. + unsigned UserIC = Hints.getInterleave(); + + // Identify the diagnostic messages that should be produced. + std::string VecDiagMsg, IntDiagMsg; + bool VectorizeLoop = true, InterleaveLoop = true; + + if (Requirements.doesNotMeet(F, L, Hints)) { + DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); + emitMissedWarning(F, L, Hints); + return false; + } if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + VecDiagMsg = + "the cost-model indicates that vectorization is not beneficial"; + VectorizeLoop = false; + } - if (IC == 1) { - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "not beneficial to vectorize and user disabled interleaving"); - return false; - } - DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); + if (IC == 1 && UserIC <= 1) { + // Tell the user interleaving is not beneficial. + DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); + IntDiagMsg = + "the cost-model indicates that interleaving is not beneficial"; + InterleaveLoop = false; + if (UserIC == 1) + IntDiagMsg += + " and is explicitly disabled or interleave count is set to 1"; + } else if (IC > 1 && UserIC == 1) { + // Tell the user interleaving is beneficial, but it is explicitly disabled. + DEBUG(dbgs() + << "LV: Interleaving is beneficial but is explicitly disabled."); + IntDiagMsg = "the cost-model indicates that interleaving is beneficial " + "but is explicitly disabled or interleave count is set to 1"; + InterleaveLoop = false; + } - // Report the unrolling decision. - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - Twine("interleaved by " + Twine(IC) + - " (vectorization not beneficial)")); + // Override IC if user provided an interleave count. + IC = UserIC > 0 ? UserIC : IC; + + // Emit diagnostic messages, if any. + const char *VAPassName = Hints.vectorizeAnalysisPassName(); + if (!VectorizeLoop && !InterleaveLoop) { + // Do not vectorize or interleave the loop.
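// The two flags computed above select among four outcomes; the remark-emitting
// chain that continues below implements this dispatch. A compact model
// (Action and decide are hypothetical names for illustration only):
enum class Action { Bail, InterleaveOnly, Vectorize, VectorizeAndInterleave };
Action decide(bool VectorizeLoop, bool InterleaveLoop) {
  if (!VectorizeLoop && !InterleaveLoop)
    return Action::Bail;                  // emit both remarks, return false
  if (!VectorizeLoop)
    return Action::InterleaveOnly;        // InnerLoopUnroller with IC
  if (!InterleaveLoop)
    return Action::Vectorize;             // InnerLoopVectorizer; the remark
                                          // notes interleaving was declined
  return Action::VectorizeAndInterleave;  // InnerLoopVectorizer with IC
}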
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + return false; + } else if (!VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + } else if (VectorizeLoop && !InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + } else if (VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + } + + if (!VectorizeLoop) { + assert(IC > 1 && "interleave count should not be 1 or 0"); + // If we decided that it is not legal to vectorize the loop then + // interleave it. + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC); + Unroller.vectorize(&LVL, CM.MinBWs); - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); - Unroller.vectorize(&LVL); + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), + Twine("interleaved loop (interleaved count: ") + + Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); - LB.vectorize(&LVL); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC); + LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; // Add metadata to disable runtime unrolling scalar loop when there's no @@ -1686,7 +1896,7 @@ struct LoopVectorize : public FunctionPass { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), Twine("vectorized loop (vectorization width: ") + Twine(VF.Width) + ", interleaved count: " + Twine(IC) + ")"); @@ -1703,16 +1913,19 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<AssumptionCacheTracker>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<BlockFrequencyInfo>(); + AU.addRequired<BlockFrequencyInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<DemandedBits>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; @@ -1773,6 +1986,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); + auto *SE = PSE.getSE(); // Make sure that the pointer does not point to structs. 
if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; @@ -1780,11 +1994,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); if (Phi && Inductions.count(Phi)) { - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } - GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (!Gep) return 0; @@ -1802,10 +2016,10 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } @@ -1815,14 +2029,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // operand. for (unsigned i = 0; i != NumOperands; ++i) if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; // We can emit wide load/stores only if the last non-zero index is the // induction variable. const SCEV *Last = nullptr; if (!Strides.count(Gep)) - Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); else { // Because of the multiplication by a stride we can have a s/zext cast. // We are going to replace this stride by 1 so the cast is safe to ignore. @@ -1833,7 +2047,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // - Last = replaceSymbolicStrideSCEV(SE, Strides, + Last = replaceSymbolicStrideSCEV(PSE, Strides, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) Last = @@ -2177,7 +2391,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { VectorParts &Entry = WidenMap.get(Instr); // Handle consecutive loads/stores. - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); @@ -2191,8 +2405,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.Insert(Gep2); } else if (Gep) { setDebugLocFromInst(Builder, Gep); - assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), - OrigLoop) && "Base ptr must be invariant"); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. 
For example A[I+1]; @@ -2209,7 +2424,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { assert((i == InductionOperand || - SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); @@ -2237,14 +2453,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { - // If we store to reverse consecutive memory locations then we need + // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal[Part] = reverseVector(StoredVal[Part]); // If the address is consecutive but reversed, then the @@ -2298,7 +2514,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; @@ -2318,7 +2535,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Try using previously calculated values. Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); - // If the src is an instruction that appeared earlier in the basic block + // If the src is an instruction that appeared earlier in the basic block, // then it should already be vectorized. if (SrcInst && OrigLoop->contains(SrcInst)) { assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); @@ -2343,19 +2560,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -2367,12 +2577,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. 
- Builder.SetInsertPoint(InsertPt); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, + ConstantInt::get(Cmp->getType(), 1)); } Instruction *Cloned = Instr->clone(); @@ -2396,85 +2602,223 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, Builder.getInt32(Width)); // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), + Cmp)); } } } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, - Instruction *Loc) { - if (FirstInst) - return FirstInst; - if (Instruction *I = dyn_cast<Instruction>(V)) - return I->getParent() == Loc->getParent() ? I : nullptr; - return nullptr; +PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, + Value *End, Value *Step, + Instruction *DL) { + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + // As we're just creating this loop, it's possible no latch exists + // yet. If so, use the header as this will be a single block loop. + if (!Latch) + Latch = Header; + + IRBuilder<> Builder(&*Header->getFirstInsertionPt()); + setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); + auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); + + Builder.SetInsertPoint(Latch->getTerminator()); + + // Create i+1 and fill the PHINode. + Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Start, L->getLoopPreheader()); + Induction->addIncoming(Next, Latch); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(Next, End); + Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); + + // Now we have two terminators. Remove the old one from the block. + Latch->getTerminator()->eraseFromParent(); + + return Induction; } -std::pair<Instruction *, Instruction *> -InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = nullptr; - if (!Legal->mustCheckStrides()) - return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); - - IRBuilder<> ChkBuilder(Loc); - - // Emit checks. - Value *Check = nullptr; - Instruction *FirstInst = nullptr; - for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), - SE = Legal->strides_end(); - SI != SE; ++SI) { - Value *Ptr = stripIntegerCast(*SI); - Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), - "stride.chk"); - // Store the first instruction we create. - FirstInst = getFirstInst(FirstInst, C, Loc); - if (Check) - Check = ChkBuilder.CreateOr(Check, C); - else - Check = C; - } +Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { + if (TripCount) + return TripCount; - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. 
- LLVMContext &Ctx = Loc->getContext(); - Instruction *TheCheck = - BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(TheCheck, "stride.not.one"); - FirstInst = getFirstInst(FirstInst, TheCheck, Loc); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + // Find the loop boundaries. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); + assert(BackedgeTakenCount != SE->getCouldNotCompute() && + "Invalid loop count"); - return std::make_pair(FirstInst, TheCheck); + Type *IdxTy = Legal->getWidestInductionType(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); + BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); + + // Get the total trip count from the count by adding 1. + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, DL, "induction"); + + // Count holds the overall loop count (N). + TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + L->getLoopPreheader()->getTerminator()); + + if (TripCount->getType()->isPointerTy()) + TripCount = + CastInst::CreatePointerCast(TripCount, IdxTy, + "exitcount.ptrcnt.to.int", + L->getLoopPreheader()->getTerminator()); + + return TripCount; } +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { + if (VectorTripCount) + return VectorTripCount; + + Value *TC = getOrCreateTripCount(L); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(TC->getType(), VF * UF); + Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); + + return VectorTripCount; +} + +void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, + BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. 
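// A scalar model of the bookkeeping around the check emitted just below
// (computeCounts is a hypothetical name; VF and UF are the SIMD width and
// unroll factor). Count wraps to 0 when the backedge-taken count is the
// type's maximum, and the unsigned less-than comparison catches that case:
#include <cstdint>
struct Counts { uint64_t VectorTrip, Remainder; bool TakeScalarLoop; };
Counts computeCounts(uint64_t BackedgeTakenCount, uint64_t VF, uint64_t UF) {
  uint64_t Count = BackedgeTakenCount + 1;    // trip count; may wrap to 0
  uint64_t Step = VF * UF;                    // iterations per vector pass
  bool MinIters = Count < Step;               // the "min.iters.check" branch
  uint64_t VectorTrip = Count - Count % Step; // "n.vec", a multiple of Step
  return {VectorTrip, Count - VectorTrip, MinIters};
}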
+ Value *CheckMinIters = + Builder.CreateICmpULT(Count, + ConstantInt::get(Count->getType(), VF * UF), + "min.iters.check"); + + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "min.iters.checked"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, CheckMinIters)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, + BasicBlock *Bypass) { + Value *TC = getOrCreateVectorTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Now, compare the new count to zero. If it is zero, skip the vector loop and + // jump to the scalar loop. + Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), + "cmp.zero"); + + // Split the block and branch to the scalar loop if the vector trip count + // is zero. + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, Cmp)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code to check the SCEV assumptions that we made. + // We want the new basic block to start at the first instruction in a + // sequence of instructions that form a check. + SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), + "scev.check"); + Value *SCEVCheck = + Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + + if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) + if (C->isZero()) + return; + + // Create a new block containing the SCEV check. + BB->setName("vector.scevcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, SCEVCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + +void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + if (!MemRuntimeCheck) + return; + + // Create a new block containing the memory check. + BB->setName("vector.memcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + + void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- Back-edge taken count overflow check. + [ ] <-- loop iteration number check. / | / v | [ ] <-- vector loop bypass (may consist of multiple blocks).
| / | | / v || [ ] <-- vector pre header. - || | - || v - || [ ] \ - || [ ]_| <-- vector loop. - || | - | \ v - | >[ ] <--- middle-block. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. | / | | / v -|- >[ ] <--- new preheader. @@ -2498,65 +2842,16 @@ void InnerLoopVectorizer::createEmptyLoop() { // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. OldInduction = Legal->getInduction(); Type *IdxTy = Legal->getWidestInductionType(); - // Find the loop boundaries. - const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (ExitCount->getType()->getPrimitiveSizeInBits() > - IdxTy->getPrimitiveSizeInBits()) - ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - - const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(BackedgeTakeCount, - SE->getConstant(BackedgeTakeCount->getType(), 1)); - - const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); - - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - Value *BackedgeCount = - Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), - VectorPH->getTerminator()); - if (BackedgeCount->getType()->isPointerTy()) - BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, - "backedge.ptrcnt.to.int", - VectorPH->getTerminator()); - Instruction *CheckBCOverflow = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, - Constant::getAllOnesValue(BackedgeCount->getType()), - "backedge.overflow", VectorPH->getTerminator()); - - // The loop index does not have to start at Zero. Find the original start - // value from the induction PHI node. If we don't have an induction variable - // then we know that it starts at zero. - Builder.SetInsertPoint(VectorPH->getTerminator()); - Value *StartIdx = ExtendedIdx = - OldInduction - ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), - IdxTy) - : ConstantInt::get(IdxTy, 0); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - - LoopBypassBlocks.push_back(VectorPH); - // Split the single block loop into the two loop structure described above. 
BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); @@ -2580,118 +2875,36 @@ void InnerLoopVectorizer::createEmptyLoop() { } Lp->addBasicBlockToLoop(VecBody, *LI); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - - // Generate the induction variable. - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); - Induction = Builder.CreatePHI(IdxTy, 2, "index"); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. - BasicBlock *NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); - VectorPH = NewVectorPH; - - // This is the IR builder that we use to add all of the logic for bypassing - // the new vector loop. - IRBuilder<> BypassBuilder(VectorPH->getTerminator()); - setDebugLocFromInst(BypassBuilder, - getDebugLocFromInstOrOperands(OldInduction)); - - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - if (Count->getType() != IdxTy) { - // The exit count can be of pointer type. Convert it to the correct - // integer type. - if (ExitCount->getType()->isPointerTy()) - Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); - else - Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); - } - - // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); + // Find the loop boundaries. + Value *Count = getOrCreateTripCount(Lp); - // Now we need to generate the expression for N - (N % VF), which is - // the part that the vectorized body will execute. - Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); - Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); - Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down"); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = - BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); - VectorPH = NewVectorPH; - - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. 
- Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(VectorPH->getTerminator()); - if (StrideCheck) { - AddedSafetyChecks = true; - // Create a new block containing the stride check. - VectorPH->setName("vector.stridecheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - - VectorPH = NewVectorPH; - } + emitVectorLoopEnteredCheck(Lp, ScalarPH); + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, ScalarPH); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator()); - if (MemRuntimeCheck) { - AddedSafetyChecks = true; - // Create a new block containing the memory check. - VectorPH->setName("vector.memcheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - - VectorPH = NewVectorPH; - } + emitMemRuntimeChecks(Lp, ScalarPH); + + // Generate the induction variable. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF * UF); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -2701,152 +2914,60 @@ void InnerLoopVectorizer::createEmptyLoop() { // If we come from a bypass edge then we need to start from the original // start value. - // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = nullptr; + // This variable saves the new starting index for the scalar loop. It is used + // to test if there are any tail iterations left once the vector loop has + // completed. LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - // Set builder to point to last bypass block. - BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; - LoopVectorizationLegality::InductionInfo II = I->second; - - Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); - PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", - MiddleBlock->getTerminator()); - // We might have extended the type of the induction variable but we need a - // truncated version for the scalar loop. 
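// In the replacement code below, each induction's resume value for the
// scalar loop is the value it reaches after CountRoundDown iterations,
// computed as II.transform(B, CountRoundDown). A scalar model (resumeValue
// is a hypothetical name):
#include <cstdint>
int64_t resumeValue(int64_t Start, int64_t Step, int64_t CountRoundDown) {
  return Start + CountRoundDown * Step;
}
// For the canonical induction (Start == 0, Step == 1) this is simply
// CountRoundDown, which is why OldInduction uses it directly as EndValue.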
- PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? - PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : nullptr; + InductionDescriptor II = I->second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", + PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, + "bc.resume.val", ScalarPH->getTerminator()); - BCResumeVal->addIncoming(ResumeVal, MiddleBlock); - - PHINode *BCTruncResumeVal = nullptr; + Value *EndValue; if (OrigPhi == OldInduction) { - BCTruncResumeVal = - PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", - ScalarPH->getTerminator()); - BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); - } - - Value *EndValue = nullptr; - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter. - assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - - // We have the canonical induction variable. - if (OrigPhi == OldInduction) { - // Create a truncated version of the resume value for the scalar loop, - // we might have promoted the type to a larger width. - EndValue = - BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - TruncResumeVal->addIncoming(EndValue, VecBody); - - BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; - break; - } - - // Not the canonical induction variable - add the vector loop count to the - // start value. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); + // We know what the end value is. + EndValue = CountRoundDown; + } else { + IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); + Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, + II.getStepValue()->getType(), + "cast.crd"); + EndValue = II.transform(B, CRD); EndValue->setName("ind.end"); - break; } - case LoopVectorizationLegality::IK_PtrInduction: { - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StepValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); - EndValue->setName("ptr.ind.end"); - break; - } - }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { - if (OrigPhi == OldInduction) - ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); - else - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - } - ResumeVal->addIncoming(EndValue, VecBody); + BCResumeVal->addIncoming(EndValue, MiddleBlock); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); // The old induction's phi node in the scalar body needs the truncated // value. 
- if (OrigPhi == OldInduction) { - BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); - } else { - BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); - } + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); } - // If we are generating a new induction variable then we also need to - // generate the code that calculates the exit value. This value is not - // simply the end of the counter because we may skip the vectorized body - // in case of a runtime check. - if (!OldInduction){ - assert(!ResumeIndex && "Unexpected resume value found"); - ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", - MiddleBlock->getTerminator()); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); - } - - // Make sure that we found the index where scalar loop needs to continue. - assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && - "Invalid resume Index"); - // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - ResumeIndex, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); ReplaceInstWithInst(MiddleBlock->getTerminator(), BranchInst::Create(ExitBlock, ScalarPH, CmpN)); - // Create i+1 and fill the PHINode. - Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(StartIdx, VectorPH); - Induction->addIncoming(NextIdx, VecBody); - // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); - - // Now we have two terminators. Remove the old one from the block. - VecBody->getTerminator()->eraseFromParent(); - // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. - LoopVectorPreHeader = VectorPH; + LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; @@ -2899,7 +3020,7 @@ static void cse(SmallVector<BasicBlock *, 4> &BBs) { for (unsigned i = 0, e = BBs.size(); i != e; ++i) { BasicBlock *BB = BBs[i]; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = I++; + Instruction *In = &*I++; if (!CSEDenseMapInfo::canHandle(In)) continue; @@ -3021,6 +3142,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } +static Type *smallestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); + IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); + return I1->getBitWidth() < I2->getBitWidth() ? 
T1 : T2; +} +static Type *largestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); + IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); + return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; +} + +void InnerLoopVectorizer::truncateToMinimalBitwidths() { + // For every instruction `I` in MinBWs, truncate the operands, create a + // truncated version of `I` and reextend its result. InstCombine runs + // later and will remove any ext/trunc pairs. + // + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + if (I->use_empty()) + continue; + Type *OriginalTy = I->getType(); + Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), + KV.second); + Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, + OriginalTy->getVectorNumElements()); + if (TruncatedTy == OriginalTy) + continue; + + IRBuilder<> B(cast<Instruction>(I)); + auto ShrinkOperand = [&](Value *V) -> Value* { + if (auto *ZI = dyn_cast<ZExtInst>(V)) + if (ZI->getSrcTy() == TruncatedTy) + return ZI->getOperand(0); + return B.CreateZExtOrTrunc(V, TruncatedTy); + }; + + // The actual instruction modification depends on the instruction type, + // unfortunately. + Value *NewI = nullptr; + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + NewI = B.CreateBinOp(BO->getOpcode(), + ShrinkOperand(BO->getOperand(0)), + ShrinkOperand(BO->getOperand(1))); + cast<BinaryOperator>(NewI)->copyIRFlags(I); + } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) { + NewI = B.CreateICmp(CI->getPredicate(), + ShrinkOperand(CI->getOperand(0)), + ShrinkOperand(CI->getOperand(1))); + } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + NewI = B.CreateSelect(SI->getCondition(), + ShrinkOperand(SI->getTrueValue()), + ShrinkOperand(SI->getFalseValue())); + } else if (CastInst *CI = dyn_cast<CastInst>(I)) { + switch (CI->getOpcode()) { + default: llvm_unreachable("Unhandled cast!"); + case Instruction::Trunc: + NewI = ShrinkOperand(CI->getOperand(0)); + break; + case Instruction::SExt: + NewI = B.CreateSExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + case Instruction::ZExt: + NewI = B.CreateZExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + } + } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { + auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto *O0 = + B.CreateZExtOrTrunc(SI->getOperand(0), + VectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + auto *O1 = + B.CreateZExtOrTrunc(SI->getOperand(1), + VectorType::get(ScalarTruncatedTy, Elements1)); + + NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + } else if (isa<LoadInst>(I)) { + // Don't do anything with the operands, just extend the result. + continue; + } else { + llvm_unreachable("Unhandled instruction type!"); + } + + // Lastly, extend the result. + NewI->takeName(cast<Instruction>(I)); + Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); + I->replaceAllUsesWith(Res); + cast<Instruction>(I)->eraseFromParent(); + I = Res; + } + } + + // We'll have created a bunch of ZExts that are now parentless. Clean up. 
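// truncateToMinimalBitwidths (above) narrows each operation in MinBWs to the
// bit width that demanded-bits analysis proved sufficient, then zero-extends
// the result so a later InstCombine run can fold the redundant ext/trunc
// pairs. A standalone illustration on plain integers (hypothetical helper,
// not the LLVM API):
#include <cstdint>

// A 32-bit add whose result is only ever consumed through its low 8 bits can
// be performed in 8 bits and extended afterwards without changing those bits.
uint32_t addShrunkToEightBits(uint32_t A, uint32_t B) {
  uint8_t NarrowA = static_cast<uint8_t>(A); // "trunc" of each operand
  uint8_t NarrowB = static_cast<uint8_t>(B);
  uint8_t Narrow = static_cast<uint8_t>(NarrowA + NarrowB); // narrow binop
  return Narrow; // implicit "zext" back to the original 32-bit type
}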
+ for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + ZExtInst *Inst = dyn_cast<ZExtInst>(I); + if (Inst && Inst->use_empty()) { + Value *NewI = Inst->getOperand(0); + Inst->eraseFromParent(); + I = NewI; + } + } + } +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -3051,6 +3283,11 @@ void InnerLoopVectorizer::vectorizeLoop() { be = DFS.endRPO(); bb != be; ++bb) vectorizeBlockInLoop(*bb, &RdxPHIsToFix); + // Insert truncates and extends for any truncated instructions as hints to + // InstCombine. + if (VF > 1) + truncateToMinimalBitwidths(); + // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. The PHI nodes are currently empty because we did @@ -3066,7 +3303,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && + assert(Legal->isReductionVariable(RdxPhi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; @@ -3141,21 +3378,33 @@ void InnerLoopVectorizer::vectorizeLoop() { // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts; + VectorParts RdxParts = getVectorValue(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); - for (unsigned part = 0; part < UF; ++part) { - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(LoopExitInst); - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); - NewPhi->addIncoming(RdxExitVal[part], - LoopVectorBody.back()); - RdxParts.push_back(NewPhi); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); + for (unsigned part = 0; part < UF; ++part) { + Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[part]->user_begin(); + UI != RdxParts[part]->user_end();) + if (*UI != Trunc) { + (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); + RdxParts[part] = Extnd; + } else { + ++UI; + } + } + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) + RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); } // Reduce all of the unrolled parts into a single vector. @@ -3208,13 +3457,22 @@ void InnerLoopVectorizer::vectorizeLoop() { // The result is in the first element of the vector. 
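// The reduction hunk above truncates every unrolled part (RdxParts) to the
// recurrence type and extends only once, after the final reduction, instead
// of widening each intermediate value. A standalone model of why that is
// sound for an 8-bit add recurrence accumulated behind a 32-bit phi
// (illustrative only; the real legality comes from RecurrenceDescriptor's
// bit-width analysis):
#include <cstdint>
#include <vector>

uint32_t reduceNarrowThenExtend(const std::vector<uint8_t> &Vals) {
  uint8_t Acc = 0;
  for (uint8_t V : Vals)
    Acc = static_cast<uint8_t>(Acc + V); // reduce entirely in the narrow type
  return Acc; // one zero-extension at the end, matching the CreateZExt above
}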
ReducedPartRdx = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (RdxPhi->getType() != RdxDesc.getRecurrenceType()) + ReducedPartRdx = + RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType()) + : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType()); } // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable @@ -3252,6 +3510,20 @@ void InnerLoopVectorizer::vectorizeLoop() { fixLCSSAPHIs(); + // Make sure DomTree is updated. + updateAnalysis(); + + // Predicate any stores. + for (auto KV : PredicatedStores) { + BasicBlock::iterator I(KV.first); + auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT); + I->moveBefore(T); + I->getParent()->setName("pred.store.if"); + BB->setName("pred.store.continue"); + } + DEBUG(DT->verifyDomTree()); // Remove redundant induction instructions. cse(LoopVectorBody); } @@ -3326,18 +3598,18 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - InnerLoopVectorizer::VectorParts &Entry, - unsigned UF, unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction( + Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode* P = cast<PHINode>(PN); // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { + if (Legal->isReductionVariable(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody.back()-> getFirstInsertionPt()); + Entry[part] = PHINode::Create( + VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -3385,53 +3657,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); + InductionDescriptor II = Legal->getInductionVars()->lookup(P); // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: + switch (II.getKind()) { + case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P->getType() == II.StartValue->getType() && "Types must match"); - Type *PhiTy = P->getType(); - Value *Broadcasted; - if (P == OldInduction) { - // Handle the canonical induction variable. We might have had to - // extend the type. 
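// Predicated stores are now collected during vectorization and materialized
// afterwards (the "Predicate any stores" loop above) as explicit
// "pred.store.if" / "pred.store.continue" control flow, rather than splitting
// the vector body while it is being built. The scalar shape of what
// SplitBlockAndInsertIfThen produces (names hypothetical):
void predicatedStore(int *A, unsigned I, int V, bool Mask) {
  if (Mask)   // pred.store.if: guard the side effect with this lane's mask
    A[I] = V;
  // pred.store.continue: execution rejoins here either way
}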
- Broadcasted = Builder.CreateTrunc(Induction, PhiTy); - } else { - // Handle other induction variables that are now based on the - // canonical one. - Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, - "normalized.idx"); - NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = II.transform(Builder, NormalizedIdx); - Broadcasted->setName("offset.idx"); + case InductionDescriptor::IK_IntInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + Value *V = Induction; + if (P != OldInduction) { + V = Builder.CreateSExtOrTrunc(Induction, P->getType()); + V = II.transform(Builder, V); + V->setName("offset.idx"); } - Broadcasted = getBroadcastInstrs(Broadcasted); + Value *Broadcasted = getBroadcastInstrs(V); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); + Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue()); return; } - case LoopVectorizationLegality::IK_PtrInduction: + case InductionDescriptor::IK_PtrInduction: // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = - Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); - NormalizedIdx = - Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType()); + Value *PtrInd = Induction; + PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType()); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { int EltIndex = part; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); Entry[part] = SclrGep; @@ -3441,8 +3704,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { int EltIndex = i + part * VF; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -3458,7 +3721,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); + VectorParts &Entry = WidenMap.get(&*it); + switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3466,7 +3730,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. 
- widenPHIInstruction(it, Entry, UF, VF, PV); + widenPHIInstruction(&*it, Entry, UF, VF, PV); continue; }// End of PHI. @@ -3504,16 +3768,17 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Select: { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), - OrigLoop); - setDebugLocFromInst(Builder, it); + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -3522,7 +3787,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - + Value *ScalarCond = (VF == 1) ? Cond[0] : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); @@ -3533,7 +3798,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op1[Part]); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3542,25 +3807,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast<CmpInst>(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; - if (FCmp) + if (FCmp) { C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - else + cast<FCmpInst>(C)->copyFastMathFlags(&*it); + } else { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + } Entry[Part] = C; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Store: case Instruction::Load: - vectorizeMemoryInstruction(it); + vectorizeMemoryInstruction(&*it); break; case Instruction::ZExt: case Instruction::SExt: @@ -3575,7 +3842,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast<CastInst>(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. 
sext/zext may wrap, @@ -3585,13 +3852,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - LoopVectorizationLegality::InductionInfo II = + InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction); - Constant *Step = - ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); + Constant *Step = ConstantInt::getSigned( + CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } /// Vectorize casts. @@ -3601,7 +3868,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3609,7 +3876,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(it)) break; - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); @@ -3625,7 +3892,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -3636,7 +3903,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } @@ -3677,13 +3944,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = Builder.CreateCall(VectorF, Args); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } default: // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; }// end of switch. }// end of for_each instr. @@ -3691,7 +3958,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. - SE->forgetLoop(OrigLoop); + PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && @@ -3701,19 +3968,12 @@ void InnerLoopVectorizer::updateAnalysis() { DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); - // Due to if predication of stores we might create a sequence of "if(pred) - // a[i] = ...; " blocks. - for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { - if (i == 0) - DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - else if (isPredicatedBlock(i)) { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); - } else { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); - } - } + // We don't predicate stores by this point, so the vector body should be a + // single loop. 
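// The trunc-of-induction case earlier in this hunk broadcasts one scalar IV
// value and then adds per-lane multiples of the step (getStepVector). A
// standalone model of the lane values produced for unroll part 'Part'
// (illustrative names, not the LLVM API):
#include <cstdint>
#include <vector>

std::vector<int64_t> stepVector(int64_t Scalar, unsigned StartLane,
                                int64_t Step, unsigned VF) {
  std::vector<int64_t> Lanes(VF);
  for (unsigned L = 0; L != VF; ++L)
    Lanes[L] = Scalar + static_cast<int64_t>(StartLane + L) * Step;
  return Lanes; // called with StartLane = VF * Part, as the code above does
}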
+ assert(LoopVectorBody.size() == 1 && "Expected single block loop!"); + DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -3850,10 +4110,10 @@ bool LoopVectorizationLegality::canVectorize() { } // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(VectorizationReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3879,10 +4139,28 @@ bool LoopVectorizationLegality::canVectorize() { : "") << "!\n"); + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) + UseInterleaved = EnableInterleavedMemAccesses; + // Analyze interleaved memory accesses. - if (EnableInterleavedMemAccesses) + if (UseInterleaved) InterleaveInfo.analyzeInterleaving(Strides); + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; + if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) + SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; + + if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { + emitAnalysis(VectorizationReport() + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); + DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); + return false; + } + // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. @@ -3929,7 +4207,6 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); // Look for the attribute signaling the absence of NaNs. @@ -3953,7 +4230,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -3965,9 +4242,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) continue; - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value could not be identified as " "an induction or reduction variable"); return false; @@ -3975,19 +4252,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // We only allow if-converted PHIs with exactly two incoming values. 
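// Two new gates appear above: interleaved-access analysis can be forced on or
// off from the command line, and vectorization now refuses loops whose union
// of SCEV predicates would be too expensive to check at runtime (with a
// higher budget under an explicit vectorize pragma). A sketch of the gating
// logic, mirroring VectorizeSCEVCheckThreshold and
// PragmaVectorizeSCEVCheckThreshold above (free function, hypothetical name):
bool passesSCEVCheckBudget(unsigned Complexity, bool ForceEnabled,
                           unsigned DefaultThreshold,
                           unsigned PragmaThreshold) {
  unsigned Threshold = ForceEnabled ? PragmaThreshold : DefaultThreshold;
  return Complexity <= Threshold; // otherwise: "Too many SCEV checks needed"
}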
if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - // This is the value coming from the preheader. - Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); - ConstantInt *StepValue = nullptr; - // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi, StepValue); - - if (IK_NoInduction != IK) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { + Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) WidestIndTy = convertPointerToIntegerType(DL, PhiTy); @@ -3995,21 +4268,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction && StepValue->isOne()) { + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getStepValue()->isOne() && + isa<Constant>(ID.getStartValue()) && + cast<Constant>(ID.getStartValue())->isNullValue()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other - // than it is expedient). + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. if (!Induction || PhiTy == WidestIndTy) Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "use of induction value outside of the " "loop is not handled by vectorizer"); return false; @@ -4020,11 +4296,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, Reductions[Phi])) { + if (Reductions[Phi].hasUnsafeAlgebra()) + Requirements->addUnsafeAlgebraInst( + Reductions[Phi].getUnsafeAlgebraInst()); AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); continue; } - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); @@ -4039,8 +4318,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(it) << - "call instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(&*it) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4049,8 +4328,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // second argument is the same (i.e. 
loop invariant) if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { - if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(it) + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { + emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -4061,7 +4341,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -4085,8 +4365,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "value cannot be used outside the loop"); return false; } @@ -4104,6 +4384,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (Induction && WidestIndTy != Induction->getType()) + Induction = nullptr; + return true; } @@ -4116,7 +4402,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); if (!Stride) return; @@ -4142,7 +4428,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { BE = TheLoop->block_end(); B != BE; ++B) for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) - if (I->getType()->isPointerTy() && isConsecutivePtr(I)) + if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); while (!Worklist.empty()) { @@ -4179,30 +4465,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } - if (LAI->getNumRuntimePointerChecks() > - VectorizerParams::RuntimeMemoryCheckThreshold) { - emitAnalysis(VectorizationReport() - << LAI->getNumRuntimePointerChecks() << " exceeds limit of " - << VectorizerParams::RuntimeMemoryCheckThreshold - << " dependent memory operations checked at runtime"); - DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - return false; - } - return true; -} + Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); + PSE.addPredicate(LAI->PSE.getUnionPredicate()); -LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi, - ConstantInt *&StepValue) { - if (!isInductionPHI(Phi, SE, StepValue)) - return IK_NoInduction; - - Type *PhiTy = Phi->getType(); - // Found an Integer induction variable. - if (PhiTy->isIntegerTy()) - return IK_IntInduction; - // Found an Pointer induction variable. 
- return IK_PtrInduction; + return true; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -4256,8 +4522,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || !isSinglePredecessor) { - // Build a masked store if it is legal for the target, otherwise scalarize - // the block. + // Build a masked store if it is legal for the target, otherwise + // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), SI->getPointerOperand()); @@ -4315,7 +4581,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( StoreInst *SI = dyn_cast<StoreInst>(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); @@ -4324,7 +4590,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4411,12 +4677,12 @@ void InterleavedAccessInfo::analyzeInterleaving( continue; // Calculate the distance and prepare for the rule 3. - const SCEVConstant *DistToA = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + const SCEVConstant *DistToA = dyn_cast<SCEVConstant>( + PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev)); if (!DistToA) continue; - int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + int DistanceToA = DistToA->getAPInt().getSExtValue(); // Skip if the distance is not multiple of size as they are not in the // same group. @@ -4454,8 +4720,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { emitAnalysis(VectorizationReport() << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); + "compiling with -Os/-Oz"); + DEBUG(dbgs() << + "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return Factor; } @@ -4467,10 +4734,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { } // Find the trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4478,7 +4747,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { WidestRegister = ((WidestRegister < MaxSafeDepDist) ? 
WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + + DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " + << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4491,6 +4762,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { " into one vector!"); unsigned VF = MaxVectorSize; + if (MaximizeBandwidth && !OptForSize) { + // Collect all viable vectorization factors. + SmallVector<unsigned, 8> VFs; + unsigned NewMaxVectorSize = WidestRegister / SmallestType; + for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + VFs.push_back(VS); + + // For each VF calculate its register usage. + auto RUs = calculateRegisterUsage(VFs); + + // Select the largest VF which doesn't require more registers than existing + // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); + for (int i = RUs.size() - 1; i >= 0; --i) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + VF = VFs[i]; + break; + } + } + } // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { @@ -4499,7 +4790,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { emitAnalysis (VectorizationReport() << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } @@ -4515,8 +4806,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { "cannot optimize for size and vectorize at the " "same time. Enable vectorization of this loop " "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + "when compiling with -Os/-Oz"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } } @@ -4566,7 +4857,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { return Factor; } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair<unsigned, unsigned> +LoopVectorizationCostModel::getSmallestAndWidestTypes() { + unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4579,18 +4872,22 @@ unsigned LoopVectorizationCostModel::getWidestType() { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Type *T = it->getType(); - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; // Only examine Loads, Stores and PHINodes. if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) continue; - // Examine PHI nodes that are reduction variables. - if (PHINode *PN = dyn_cast<PHINode>(it)) - if (!Legal->getReductionVars()->count(PN)) + // Examine PHI nodes that are reduction variables. Update the type to + // account for the recurrence type. + if (PHINode *PN = dyn_cast<PHINode>(it)) { + if (!Legal->isReductionVariable(PN)) continue; + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; + T = RdxDesc.getRecurrenceType(); + } // Examine the stored values. 
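// When MaximizeBandwidth is set (above), the cost model sizes candidate VFs
// by the *smallest* element type and then backs off to the largest VF whose
// estimated register usage still fits the target. Standalone sketch of that
// selection loop (RegisterUsageSketch is modeled after the cost model's own
// struct; the two vectors are assumed parallel, all names illustrative):
#include <vector>

struct RegisterUsageSketch { unsigned MaxLocalUsers; };

unsigned selectLargestFittingVF(const std::vector<unsigned> &VFs,
                                const std::vector<RegisterUsageSketch> &RUs,
                                unsigned TargetNumRegisters,
                                unsigned Fallback) {
  // Walk from the widest candidate down; the first one that fits wins.
  for (int i = static_cast<int>(RUs.size()) - 1; i >= 0; --i)
    if (RUs[i].MaxLocalUsers <= TargetNumRegisters)
      return VFs[i];
  return Fallback;
}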
if (StoreInst *ST = dyn_cast<StoreInst>(it)) @@ -4599,15 +4896,17 @@ unsigned LoopVectorizationCostModel::getWidestType() { // Ignore loaded pointer types and stored pointer types that are not // consecutive. However, we do want to take consecutive stores/loads of // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; + MinWidth = std::min(MinWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return MaxWidth; + return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4628,11 +4927,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // Use the user preference, unless 'auto' is selected. - int UserUF = Hints->getInterleave(); - if (UserUF != 0) - return UserUF; - // When we optimize for size, we don't interleave. if (OptForSize) return 1; @@ -4642,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; // Do not interleave loops with a relatively small trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; @@ -4658,7 +4952,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, TargetNumRegisters = ForceTargetNumVectorRegs; } - LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -4756,8 +5050,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, } // Interleave if this is a large loop (small loops are already dealt with by - // this - // point) that could benefit from interleaving. + // this point) that could benefit from interleaving. bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); @@ -4768,8 +5061,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> +LoopVectorizationCostModel::calculateRegisterUsage( + const SmallVector<unsigned, 8> &VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -4790,8 +5084,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() { LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage R; - R.NumInstructions = 0; + RegisterUsage RU; + RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. 
The values // of the map are the index of the 'last seen' usage of the @@ -4810,15 +5104,13 @@ LoopVectorizationCostModel::calculateRegisterUsage() { unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - R.NumInstructions += (*bb)->size(); - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - Instruction *I = it; - IdxToInstr[Index++] = I; + RU.NumInstructions += (*bb)->size(); + for (Instruction &I : **bb) { + IdxToInstr[Index++] = &I; // Save the end location of each USE. - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *U = I->getOperand(i); + for (unsigned i = 0; i < I.getNumOperands(); ++i) { + Value *U = I.getOperand(i); Instruction *Instr = dyn_cast<Instruction>(U); // Ignore non-instruction values such as arguments, constants, etc. @@ -4847,42 +5139,85 @@ LoopVectorizationCostModel::calculateRegisterUsage() { TransposeEnds[it->second].push_back(it->first); SmallSet<Instruction*, 8> OpenIntervals; - unsigned MaxUsage = 0; + // Get the size of the widest register. + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + unsigned WidestRegister = + std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + + SmallVector<RegisterUsage, 8> RUs(VFs.size()); + SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + + // A lambda that gets the register usage for the given type and VF. + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); + return std::max<unsigned>(1, VF * TypeSize / WidestRegister); + }; + for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; - // Ignore ephemeral values. - if (EphValues.count(I)) - continue; - // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int j=0, e = List.size(); j < e; ++j) + for (unsigned int j = 0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); - // Count the number of live interals. - MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + // Skip ignored values. + if (ValuesToIgnore.count(I)) + continue; - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() << '\n'); + // For each VF find the maximum usage of registers. + for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + if (VFs[j] == 1) { + MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); + continue; + } + + // Count the number of live intervals. + unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.count(Inst)) + continue; + RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + } + MaxUsages[j] = std::max(MaxUsages[j], RegUsage); + } + + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. 
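// The GetRegUsage lambda above converts a value's scalar type into a register
// count at a given VF: a <VF x Ty> value occupies roughly
// VF * bits(Ty) / bits(widest register) registers, never fewer than one.
// Standalone equivalent of that estimate:
#include <algorithm>

unsigned regsForValue(unsigned TypeSizeBits, unsigned VF,
                      unsigned WidestRegisterBits) {
  return std::max(1u, VF * TypeSizeBits / WidestRegisterBits);
}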
OpenIntervals.insert(I); } - unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); + for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + unsigned Invariant = 0; + if (VFs[i] == 1) + Invariant = LoopInvariants.size(); + else { + for (auto Inst : LoopInvariants) + Invariant += GetRegUsage(Inst->getType(), VFs[i]); + } + + DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); - R.LoopInvariantRegs = Invariant; - R.MaxLocalUsers = MaxUsage; - return R; + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[i]; + RUs[i] = RU; + } + + return RUs; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { @@ -4900,11 +5235,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { if (isa<DbgInfoIntrinsic>(it)) continue; - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; - unsigned C = getInstructionCost(it, VF); + unsigned C = getInstructionCost(&*it, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) @@ -4969,7 +5304,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, if (!C) return true; - const APInt &APStepVal = C->getValue()->getValue(); + const APInt &APStepVal = C->getAPInt(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) @@ -4981,9 +5316,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) - return true; - return false; + return Legal->hasStride(I->getOperand(0)) || + Legal->hasStride(I->getOperand(1)); } unsigned @@ -4994,7 +5328,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VF = 1; Type *RetTy = I->getType(); + if (VF > 1 && MinBWs.count(I)) + RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); Type *VectorTy = ToVectorTy(RetTy, VF); + auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { @@ -5076,6 +5413,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); + Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); + auto It = MinBWs.find(Op0AsInstruction); + if (VF > 1 && It != MinBWs.end()) + ValTy = IntegerType::get(ValTy->getContext(), It->second); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -5199,8 +5540,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Legal->isInductionVariable(I->getOperand(0))) return TTI.getCastInstrCost(I->getOpcode(), I->getType(), I->getOperand(0)->getType()); - - Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + Type *SrcScalarTy = I->getOperand(0)->getType(); + Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); + if (VF > 1 && MinBWs.count(I)) { + // This cast is going to be shrunk. 
This may remove the cast or it might + // turn it into slightly different cast. For example, if MinBW == 16, + // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". + // + // Calculate the modified src and dest types. + Type *MinVecTy = VectorTy; + if (I->getOpcode() == Instruction::Trunc) { + SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } else if (I->getOpcode() == Instruction::ZExt || + I->getOpcode() == Instruction::SExt) { + SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } + } + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { @@ -5240,15 +5601,18 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -5269,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { return false; } +void LoopVectorizationCostModel::collectValuesToIgnore() { + // Ignore ephemeral values. + CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + + // Ignore type-promoting instructions we identified during reduction + // detection. + for (auto &Reduction : *Legal->getReductionVars()) { + RecurrenceDescriptor &RedDes = Reduction.second; + SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); + VecValuesToIgnore.insert(Casts.begin(), Casts.end()); + } + + // Ignore induction phis that are only used in either GetElementPtr or ICmp + // instruction to exit loop. Induction variables usually have large types and + // can have big impact when estimating register usage. + // This is for when VF > 1. + for (auto &Induction : *Legal->getInductionVars()) { + auto *PN = Induction.first; + auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + + // Check that the PHI is only used by the induction increment (UpdateV) or + // by GEPs. Then check that UpdateV is only used by a compare instruction or + // the loop header PHI. + // FIXME: Need precise def-use analysis to determine if this instruction + // variable will be vectorized. + if (std::all_of(PN->user_begin(), PN->user_end(), + [&](const User *U) -> bool { + return U == UpdateV || isa<GetElementPtrInst>(U); + }) && + std::all_of(UpdateV->user_begin(), UpdateV->user_end(), + [&](const User *U) -> bool { + return U == PN || isa<ICmpInst>(U); + })) { + VecValuesToIgnore.insert(PN); + VecValuesToIgnore.insert(UpdateV); + } + } + + // Ignore instructions that will not be vectorized. + // This is for when VF > 1. 
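// collectValuesToIgnore (above) drops an induction phi and its update from
// the VF > 1 register-pressure estimate when the phi feeds only address
// computation (GEPs) and its update feeds only the exit compare, since such
// IVs typically stay scalar. A standalone restatement of the two all_of
// conditions over hypothetical user-kind tags (not the LLVM use lists):
#include <algorithm>
#include <vector>

enum class UserKind { IVUpdateOrPhi, GEP, ExitCompare, Other };

bool ivOnlyFeedsAddressesAndExitTest(const std::vector<UserKind> &PhiUsers,
                                     const std::vector<UserKind> &UpdUsers) {
  bool PhiOK = std::all_of(PhiUsers.begin(), PhiUsers.end(), [](UserKind K) {
    return K == UserKind::IVUpdateOrPhi || K == UserKind::GEP;
  });
  bool UpdOK = std::all_of(UpdUsers.begin(), UpdUsers.end(), [](UserKind K) {
    return K == UserKind::IVUpdateOrPhi || K == UserKind::ExitCompare;
  });
  return PhiOK && UpdOK;
}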
+ for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; + ++bb) { + for (auto &Inst : **bb) { + switch (Inst.getOpcode()) { + case Instruction::GetElementPtr: { + // Ignore GEP if its last operand is an induction variable so that it is + // a consecutive load/store and won't be vectorized as scatter/gather + // pattern. + + GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst); + unsigned NumOperands = Gep->getNumOperands(); + unsigned InductionOperand = getGEPInductionOperand(Gep); + bool GepToIgnore = true; + + // Check that all of the gep indices are uniform except for the + // induction operand. + for (unsigned i = 0; i != NumOperands; ++i) { + if (i != InductionOperand && + !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), + TheLoop)) { + GepToIgnore = false; + break; + } + } + + if (GepToIgnore) + VecValuesToIgnore.insert(&Inst); + break; + } + } + } + } +} void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { @@ -5316,19 +5753,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -5343,11 +5773,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], ConstantInt::get(Cond[Part]->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. - Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); @@ -5367,16 +5792,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (!IsVoidRetTy) VecResults[Part] = Cloned; - // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + // End if-block. 
+    if (IfPredicateStore)
+      PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
+                                                Cmp));
   }
 }
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b180c976c233..9ed44d1e0cb8 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -61,7 +62,7 @@ static cl::opt<int>
                      "number "));
 
 static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                    cl::desc("Attempt to vectorize horizontal reductions"));
 
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
@@ -73,6 +74,14 @@ static cl::opt<int>
 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
 
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+    cl::desc("Limit the size of the SLP scheduling region per block"));
+
 namespace {
 
 // FIXME: Set this via cl::opt to allow overriding.
@@ -89,6 +98,10 @@ static const unsigned AliasedCheckLimit = 10;
 // This limit is useful for very large basic blocks.
 static const unsigned MaxMemDepDistance = 160;
 
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
 /// \brief Predicate for the element types that the SLP vectorizer supports.
 ///
 /// The most important thing to filter here are types which are invalid in LLVM
@@ -156,13 +169,11 @@ static unsigned getAltOpcode(unsigned Op) {
 /// of an alternate sequence which can later be merged as
 /// a ShuffleVector instruction.
 static bool canCombineAsAltInst(unsigned Op) {
-  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
-      Op == Instruction::Sub || Op == Instruction::Add)
-    return true;
-  return false;
+  return Op == Instruction::FAdd || Op == Instruction::FSub ||
+         Op == Instruction::Sub || Op == Instruction::Add;
 }
 
-/// \returns ShuffleVector instruction if intructions in \p VL have
+/// \returns ShuffleVector instruction if instructions in \p VL have
 ///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
 static unsigned isAltInst(ArrayRef<Value *> VL) {
@@ -242,6 +253,9 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
       case LLVMContext::MD_fpmath:
         MD = MDNode::getMostGenericFPMath(MD, IMD);
         break;
+      case LLVMContext::MD_nontemporal:
+        MD = MDNode::intersect(MD, IMD);
+        break;
       }
     }
     I->setMetadata(Kind, MD);
@@ -393,7 +407,7 @@ public:
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
-  /// \returns true if it is benefitial to reverse the vector order.
+  /// \returns true if it is beneficial to reverse the vector order.
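// ScheduleRegionSizeBudget and MinScheduleRegionSize (introduced above) bound
// how much of a huge basic block the SLP scheduler may cover; the
// BlockScheduling changes further below charge each scheduled region against
// the budget and clamp what remains. Standalone sketch of that bookkeeping
// (names mirror the members this diff adds):
int remainingScheduleBudget(int Limit, int PreviousRegionSize,
                            int MinRegionSize) {
  Limit -= PreviousRegionSize; // charge the last scheduling run
  return Limit < MinRegionSize ? MinRegionSize : Limit; // always allow a bit
}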
   bool shouldReorder() const {
     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
   }
@@ -441,7 +455,7 @@ private:
   /// \returns a vector from a collection of scalars in \p VL.
   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
 
-  /// \returns whether the VectorizableTree is fully vectoriable and will
+  /// \returns whether the VectorizableTree is fully vectorizable and will
   /// be beneficial even when the tree height is tiny.
   bool isFullyVectorizableTinyTree();
 
@@ -506,7 +520,7 @@ private:
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser (Value *S, llvm::User *U, int L) :
-      Scalar(S), User(U), Lane(L){};
+      Scalar(S), User(U), Lane(L){}
     // Which scalar in our function.
     Value *Scalar;
     // Which user uses the scalar.
@@ -717,6 +731,8 @@ private:
         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
           ScheduleStart(nullptr), ScheduleEnd(nullptr),
           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+          ScheduleRegionSize(0),
+          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
           // Make sure that the initial SchedulingRegionID is greater than the
           // initial SchedulingRegionID in ScheduleData (which is 0).
           SchedulingRegionID(1) {}
@@ -728,6 +744,13 @@ private:
       FirstLoadStoreInRegion = nullptr;
       LastLoadStoreInRegion = nullptr;
 
+      // Reduce the maximum schedule region size by the size of the
+      // previous scheduling run.
+      ScheduleRegionSizeLimit -= ScheduleRegionSize;
+      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+        ScheduleRegionSizeLimit = MinScheduleRegionSize;
+      ScheduleRegionSize = 0;
+
       // Make a new scheduling region, i.e. all existing ScheduleData is not
       // in the new region yet.
       ++SchedulingRegionID;
@@ -804,7 +827,8 @@ private:
     void cancelScheduling(ArrayRef<Value *> VL);
 
     /// Extends the scheduling region so that V is inside the region.
-    void extendSchedulingRegion(Value *V);
+    /// \returns true if the region size is within the limit.
+    bool extendSchedulingRegion(Value *V);
 
     /// Initialize the ScheduleData structures for new instructions in the
     /// scheduling region.
@@ -858,6 +882,12 @@ private:
     /// (can be null).
     ScheduleData *LastLoadStoreInRegion;
 
+    /// The current size of the scheduling region.
+    int ScheduleRegionSize;
+
+    /// The maximum size allowed for the scheduling region.
+    int ScheduleRegionSizeLimit;
+
     /// The ID of the scheduling region. For a new vectorization iteration this
     /// is incremented which "removes" all ScheduleData from the region.
     int SchedulingRegionID;
@@ -1077,7 +1107,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
 
   if (!BS.tryScheduleBundle(VL, this)) {
     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
-    BS.cancelScheduling(VL);
+    assert((!BS.getScheduleData(VL[0]) ||
+            !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+           "tryScheduleBundle should cancelScheduling on failure");
     newTreeEntry(VL, false);
     return;
   }
@@ -1125,6 +1157,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       return;
     }
     case Instruction::Load: {
+      // Check that a vectorized load would load the same memory as a scalar
+      // load.
+      // For example we don't want to vectorize loads that are smaller than
+      // 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+      // treats loading/storing it as an i8 struct. If we vectorize
+      // loads/stores from such a struct we read/write packed bits
+      // disagreeing with the unvectorized version.
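// The comment above describes the check that follows: element types whose
// value size differs from their allocation size are rejected, e.g. an i2
// stored in a byte, where scalar code touches the padded byte but vector code
// would touch packed bits. Standalone form of the predicate (sizes in bits;
// illustrative, mirroring DL.getTypeSizeInBits vs. getTypeAllocSizeInBits):
#include <cstdint>

bool isNonPackedLoadType(uint64_t TypeSizeInBits, uint64_t AllocSizeInBits) {
  // Vectorizable only when the value exactly fills its allocated storage.
  return TypeSizeInBits == AllocSizeInBits;
}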
+      const DataLayout &DL = F->getParent()->getDataLayout();
+      Type *ScalarTy = VL[0]->getType();
+
+      if (DL.getTypeSizeInBits(ScalarTy) !=
+          DL.getTypeAllocSizeInBits(ScalarTy)) {
+        BS.cancelScheduling(VL);
+        newTreeEntry(VL, false);
+        DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+        return;
+      }
       // Check if the loads are consecutive or if we need to swizzle them.
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
         LoadInst *L = cast<LoadInst>(VL[i]);
@@ -1134,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
-        const DataLayout &DL = F->getParent()->getDataLayout();
+
         if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
           if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
             ++NumLoadsWantToChangeOrder;
@@ -1690,7 +1739,8 @@ int BoUpSLP::getSpillCost() {
     }
 
     // Now find the sequence of instructions between PrevInst and Inst.
-    BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
+    BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
+        PrevInstIt(PrevInst->getIterator());
     --PrevInstIt;
     while (InstIt != PrevInstIt) {
       if (PrevInstIt == PrevInst->getParent()->rend()) {
@@ -1890,106 +1940,126 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
   }
 }
 
+// Return true if I should be commuted before adding its left and right
+// operands to the arrays Left and Right.
+//
+// The vectorizer is trying to either have all elements on one side be
+// instructions with the same opcode to enable further vectorization, or have
+// a splat to lower the vectorizing cost.
+static bool shouldReorderOperands(int i, Instruction &I,
+                                  SmallVectorImpl<Value *> &Left,
+                                  SmallVectorImpl<Value *> &Right,
+                                  bool AllSameOpcodeLeft,
+                                  bool AllSameOpcodeRight, bool SplatLeft,
+                                  bool SplatRight) {
+  Value *VLeft = I.getOperand(0);
+  Value *VRight = I.getOperand(1);
+  // If we have "SplatRight", try to see if commuting is needed to preserve it.
+  if (SplatRight) {
+    if (VRight == Right[i - 1])
+      // Preserve SplatRight
+      return false;
+    if (VLeft == Right[i - 1]) {
+      // Commuting would preserve SplatRight, but we don't want to break
+      // SplatLeft either, i.e. preserve the original order if possible.
+      // (FIXME: why do we care?)
+      if (SplatLeft && VLeft == Left[i - 1])
+        return false;
+      return true;
+    }
+  }
+  // Symmetrically handle Right side.
+  if (SplatLeft) {
+    if (VLeft == Left[i - 1])
+      // Preserve SplatLeft
+      return false;
+    if (VRight == Left[i - 1])
+      return true;
+  }
+
+  Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+  Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+  // If we have "AllSameOpcodeRight", try to see if the left operand preserves
+  // it and not the right; in this case we want to commute.
+  if (AllSameOpcodeRight) {
+    unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
+    if (IRight && RightPrevOpcode == IRight->getOpcode())
+      // Do not commute, a match on the right preserves AllSameOpcodeRight
+      return false;
+    if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
+      // We have a match and may want to commute, but first check if there is
+      // not also a match on the existing operands on the Left to preserve
+      // AllSameOpcodeLeft, i.e. preserve the original order if possible.
+      // (FIXME: why do we care?)
+      if (AllSameOpcodeLeft && ILeft &&
+          cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
+        return false;
+      return true;
+    }
+  }
+  // Symmetrically handle Left side.
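As a toy model of the splat-preserving half of shouldReorderOperands, the sketch below uses strings in place of llvm::Value* and reproduces only the SplatLeft/SplatRight checks (illustrative, not the pass itself):

    #include <cstdio>
    #include <string>
    #include <vector>

    static bool shouldCommute(const std::string &L, const std::string &R,
                              const std::vector<std::string> &Left,
                              const std::vector<std::string> &Right,
                              bool SplatLeft, bool SplatRight) {
      // Mirrors only the splat-preserving checks of shouldReorderOperands.
      if (SplatRight) {
        if (R == Right.back())
          return false;                            // keeps SplatRight as-is
        if (L == Right.back())                     // commuting keeps SplatRight,
          return !(SplatLeft && L == Left.back()); // unless it breaks SplatLeft
      }
      if (SplatLeft) {
        if (L == Left.back())
          return false;                            // keeps SplatLeft as-is
        if (R == Left.back())
          return true;                             // commuting makes a broadcast
      }
      return false;
    }

    int main() {
      // Lane 0 was a * b; lane 1 is c * a. Commuting lane 1 keeps 'a' on the
      // left side, so Left becomes the broadcast [a, a].
      std::vector<std::string> Left = {"a"}, Right = {"b"};
      std::printf("%d\n", shouldCommute("c", "a", Left, Right,
                                        /*SplatLeft=*/true,
                                        /*SplatRight=*/true)); // prints 1
    }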
+  if (AllSameOpcodeLeft) {
+    unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
+    if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
+      return false;
+    if (IRight && LeftPrevOpcode == IRight->getOpcode())
+      return true;
+  }
+  return false;
+}
+
 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                              SmallVectorImpl<Value *> &Left,
                                              SmallVectorImpl<Value *> &Right) {
-  SmallVector<Value *, 16> OrigLeft, OrigRight;
-
-  bool AllSameOpcodeLeft = true;
-  bool AllSameOpcodeRight = true;
-  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
-    Instruction *I = cast<Instruction>(VL[i]);
-    Value *VLeft = I->getOperand(0);
-    Value *VRight = I->getOperand(1);
-
-    OrigLeft.push_back(VLeft);
-    OrigRight.push_back(VRight);
-
-    Instruction *ILeft = dyn_cast<Instruction>(VLeft);
-    Instruction *IRight = dyn_cast<Instruction>(VRight);
-
-    // Check whether all operands on one side have the same opcode. In this case
-    // we want to preserve the original order and not make things worse by
-    // reordering.
-    if (i && AllSameOpcodeLeft && ILeft) {
-      if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
-        if (PLeft->getOpcode() != ILeft->getOpcode())
-          AllSameOpcodeLeft = false;
-      } else
-        AllSameOpcodeLeft = false;
-    }
-    if (i && AllSameOpcodeRight && IRight) {
-      if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
-        if (PRight->getOpcode() != IRight->getOpcode())
-          AllSameOpcodeRight = false;
-      } else
-        AllSameOpcodeRight = false;
-    }
-
-    // Sort two opcodes. In the code below we try to preserve the ability to use
-    // broadcast of values instead of individual inserts.
-    // vl1 = load
-    // vl2 = phi
-    // vr1 = load
-    // vr2 = vr2
-    //  = vl1 x vr1
-    //  = vl2 x vr2
-    // If we just sorted according to opcode we would leave the first line in
-    // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
-    //  = vl1 x vr1
-    //  = vr2 x vl2
-    // Because vr2 and vr1 are from the same load we loose the opportunity of a
-    // broadcast for the packed right side in the backend: we have [vr1, vl2]
-    // instead of [vr1, vr2=vr1].
-    if (ILeft && IRight) {
-      if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
-        Left.push_back(IRight);
-        Right.push_back(ILeft);
-      } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
-                 Right[i - 1] != IRight) {
-        // Try not to destroy a broad cast for no apparent benefit.
-        Left.push_back(IRight);
-        Right.push_back(ILeft);
-      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
-                 Right[i - 1] == ILeft) {
-        // Try preserve broadcasts.
-        Left.push_back(IRight);
-        Right.push_back(ILeft);
-      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
-                 Left[i - 1] == IRight) {
-        // Try preserve broadcasts.
-        Left.push_back(IRight);
-        Right.push_back(ILeft);
-      } else {
-        Left.push_back(ILeft);
-        Right.push_back(IRight);
-      }
-      continue;
-    }
-    // One opcode, put the instruction on the right.
-    if (ILeft) {
-      Left.push_back(VRight);
-      Right.push_back(ILeft);
-      continue;
-    }
+  if (VL.size()) {
+    // Peel the first iteration out of the loop since there's nothing
+    // interesting to do anyway and it simplifies the checks in the loop.
+    auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
+    auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+    if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
+      // Favor having instruction to the right. FIXME: why?
+      std::swap(VLeft, VRight);
     Left.push_back(VLeft);
     Right.push_back(VRight);
   }
 
-  bool LeftBroadcast = isSplat(Left);
-  bool RightBroadcast = isSplat(Right);
-
-  // If operands end up being broadcast return this operand order.
-  if (LeftBroadcast || RightBroadcast)
-    return;
+  // Keep track if we have instructions with all the same opcode on one side.
+  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
+  bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+  // Keep track if we have one side with all the same value (broadcast).
+  bool SplatLeft = true;
+  bool SplatRight = true;
 
-  // Don't reorder if the operands where good to begin.
-  if (AllSameOpcodeRight || AllSameOpcodeLeft) {
-    Left = OrigLeft;
-    Right = OrigRight;
+  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+    Instruction *I = cast<Instruction>(VL[i]);
+    assert(I->isCommutative() && "Can only process commutative instruction");
+    // Commute to favor either a splat or maximizing having the same opcodes on
+    // one side.
+    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
+                              AllSameOpcodeRight, SplatLeft, SplatRight)) {
+      Left.push_back(I->getOperand(1));
+      Right.push_back(I->getOperand(0));
+    } else {
+      Left.push_back(I->getOperand(0));
+      Right.push_back(I->getOperand(1));
+    }
+    // Update Splat* and AllSameOpcode* after the insertion.
+    SplatRight = SplatRight && (Right[i - 1] == Right[i]);
+    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
+    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
+                        (cast<Instruction>(Left[i - 1])->getOpcode() ==
+                         cast<Instruction>(Left[i])->getOpcode());
+    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
+                         (cast<Instruction>(Right[i - 1])->getOpcode() ==
+                          cast<Instruction>(Right[i])->getOpcode());
   }
+  // If one operand ends up being broadcast, return this operand order.
+  if (SplatRight || SplatLeft)
+    return;
+
   const DataLayout &DL = F->getParent()->getDataLayout();
 
   // Finally check if we can get longer vectorizable chain by reordering
@@ -2030,7 +2100,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
 
 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  BasicBlock::iterator NextInst = VL0;
+  BasicBlock::iterator NextInst(VL0);
   ++NextInst;
   Builder.SetInsertPoint(VL0->getParent(), NextInst);
   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
@@ -2487,7 +2557,7 @@ Value *BoUpSLP::vectorizeTree() {
     scheduleBlock(BSIter.second.get());
   }
 
-  Builder.SetInsertPoint(F->getEntryBlock().begin());
+  Builder.SetInsertPoint(&F->getEntryBlock().front());
   vectorizeTree(&VectorizableTree[0]);
 
   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
@@ -2532,7 +2602,7 @@ Value *BoUpSLP::vectorizeTree() {
           User->replaceUsesOfWith(Scalar, Ex);
         }
       } else {
-        Builder.SetInsertPoint(F->getEntryBlock().begin());
+        Builder.SetInsertPoint(&F->getEntryBlock().front());
         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
         CSEBlocks.insert(&F->getEntryBlock());
         User->replaceUsesOfWith(Scalar, Ex);
@@ -2641,7 +2711,7 @@ void BoUpSLP::optimizeGatherSequence() {
     BasicBlock *BB = (*I)->getBlock();
     // For all instructions in blocks containing gather sequences:
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
-      Instruction *In = it++;
+      Instruction *In = &*it++;
 
       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
         continue;
@@ -2681,8 +2751,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   ScheduleData *Bundle = nullptr;
   bool ReSchedule = false;
   DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+
+  // Make sure that the scheduling region contains all
+  // instructions of the bundle.
+  for (Value *V : VL) {
+    if (!extendSchedulingRegion(V))
+      return false;
+  }
+
   for (Value *V : VL) {
-    extendSchedulingRegion(V);
     ScheduleData *BundleMember = getScheduleData(V);
     assert(BundleMember &&
            "no ScheduleData for bundle member (maybe not in same basic block)");
@@ -2743,7 +2820,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
       schedule(pickedSD, ReadyInsts);
     }
   }
-  return Bundle->isReady();
+  if (!Bundle->isReady()) {
+    cancelScheduling(VL);
+    return false;
+  }
+  return true;
 }
 
 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
@@ -2772,9 +2853,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
   }
 }
 
-void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
   if (getScheduleData(V))
-    return;
+    return true;
   Instruction *I = dyn_cast<Instruction>(V);
   assert(I && "bundle member must be an instruction");
   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
@@ -2785,21 +2866,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
     ScheduleEnd = I->getNextNode();
     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
     DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
-    return;
+    return true;
   }
   // Search up and down at the same time, because we don't know if the new
   // instruction is above or below the existing scheduling region.
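The alternating walk described by that comment, combined with the new size budget, can be modeled with plain indices standing in for block positions (hypothetical standalone sketch, not the pass):

    #include <cstdio>

    // Toy model: the region is [RegionBegin, RegionEnd) inside the block
    // [BlockBegin, BlockEnd), and every loop iteration costs one unit of the
    // size budget, exactly like the ++ScheduleRegionSize check in the patch.
    static bool extendRegion(int &RegionBegin, int &RegionEnd, int Target,
                             int BlockBegin, int BlockEnd, int &RegionSize,
                             int RegionSizeLimit) {
      int Up = RegionBegin - 1;  // next candidate above the region
      int Down = RegionEnd;      // next candidate below the region
      for (;;) {
        if (++RegionSize > RegionSizeLimit)
          return false;          // budget exhausted: refuse to extend
        if (Up >= BlockBegin) {
          if (Up == Target) { RegionBegin = Target; return true; }
          --Up;
        }
        if (Down < BlockEnd) {
          if (Down == Target) { RegionEnd = Target + 1; return true; }
          ++Down;
        }
        // As with the pass's assert, a target inside the block is always
        // reached; otherwise the budget check above terminates the loop.
      }
    }

    int main() {
      int Begin = 40, End = 42, Size = 2;
      bool Ok = extendRegion(Begin, End, /*Target=*/35, /*BlockBegin=*/0,
                             /*BlockEnd=*/100, Size, /*Limit=*/100);
      std::printf("%d [%d,%d) size=%d\n", Ok, Begin, End, Size);
      // prints: 1 [35,42) size=7
    }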
-  BasicBlock::reverse_iterator UpIter(ScheduleStart);
+  BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
   BasicBlock::reverse_iterator UpperEnd = BB->rend();
   BasicBlock::iterator DownIter(ScheduleEnd);
   BasicBlock::iterator LowerEnd = BB->end();
   for (;;) {
+    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+      DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+      return false;
+    }
+
     if (UpIter != UpperEnd) {
       if (&*UpIter == I) {
         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
         ScheduleStart = I;
         DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
-        return;
+        return true;
       }
       UpIter++;
     }
@@ -2810,13 +2896,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
         ScheduleEnd = I->getNextNode();
         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
         DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
-        return;
+        return true;
       }
       DownIter++;
     }
     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
            "instruction not found in block");
   }
+  return true;
 }
 
 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
@@ -2896,8 +2983,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         }
       } else {
         // I'm not sure if this can ever happen. But we need to be safe.
-        // This lets the instruction/bundle never be scheduled and eventally
-        // disable vectorization.
+        // This lets the instruction/bundle never be scheduled and
+        // eventually disables vectorization.
         BundleMember->Dependencies++;
         BundleMember->incrementUnscheduledDeps(1);
       }
@@ -3003,7 +3090,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   };
   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
 
-  // Ensure that all depencency data is updated and fill the ready-list with
+  // Ensure that all dependency data is updated and fill the ready-list with
   // initial instructions.
   int Idx = 0;
   int NumToSchedule = 0;
@@ -3035,7 +3122,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
       Instruction *pickedInst = BundleMember->Inst;
       if (LastScheduledInst->getNextNode() != pickedInst) {
         BS->BB->getInstList().remove(pickedInst);
-        BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
+        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+                                     pickedInst);
       }
       LastScheduledInst = pickedInst;
       BundleMember = BundleMember->NextInBundle;
@@ -3074,11 +3162,11 @@ struct SLPVectorizer : public FunctionPass {
     if (skipOptnoneFunction(F))
       return false;
 
-    SE = &getAnalysis<ScalarEvolution>();
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
     TLI = TLIP ?
              &TLIP->getTLI() : nullptr;
-    AA = &getAnalysis<AliasAnalysis>();
+    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -3139,13 +3227,15 @@ struct SLPVectorizer : public FunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     FunctionPass::getAnalysisUsage(AU);
     AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequired<ScalarEvolution>();
-    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
     AU.setPreservesCFG();
   }
 
@@ -3260,15 +3350,26 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
 
   // Do a quadratic search on all of the given stores and find
   // all of the pairs of stores that follow each other.
+  SmallVector<unsigned, 16> IndexQueue;
   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
-    for (unsigned j = 0; j < e; ++j) {
-      if (i == j)
-        continue;
-      const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
-      if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
-        Tails.insert(Stores[j]);
+    const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+    IndexQueue.clear();
+    // If a store has multiple consecutive store candidates, search the Stores
+    // array according to the sequence: from i+1 to e, then from i-1 to 0.
+    // This is because pairing with the immediately succeeding or preceding
+    // candidate usually creates the best chance of an SLP vectorization
+    // opportunity.
+    unsigned j = 0;
+    for (j = i + 1; j < e; ++j)
+      IndexQueue.push_back(j);
+    for (j = i; j > 0; --j)
+      IndexQueue.push_back(j - 1);
+
+    for (auto &k : IndexQueue) {
+      if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+        Tails.insert(Stores[k]);
         Heads.insert(Stores[i]);
-        ConsecutiveChain[Stores[i]] = Stores[j];
+        ConsecutiveChain[Stores[i]] = Stores[k];
+        break;
       }
     }
   }
@@ -3428,7 +3529,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     unsigned VecIdx = 0;
     for (auto &V : BuildVectorSlice) {
       IRBuilder<true, NoFolder> Builder(
-          ++BasicBlock::iterator(InsertAfter));
+          InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
       InsertElementInst *IE = cast<InsertElementInst>(V);
       Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
           VectorizedRoot, Builder.getInt32(VecIdx++)));
@@ -3552,16 +3653,17 @@ class HorizontalReduction {
   unsigned ReductionOpcode;
   /// The opcode of the values we perform a reduction on.
   unsigned ReducedValueOpcode;
-  /// The width of one full horizontal reduction operation.
-  unsigned ReduxWidth;
   /// Should we model this reduction as a pairwise reduction tree or a tree that
   /// splits the vector in halves and adds those halves.
   bool IsPairwiseReduction;
 
 public:
+  /// The width of one full horizontal reduction operation.
+  unsigned ReduxWidth;
+
   HorizontalReduction()
     : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
-    ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+    ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
 
   /// \brief Try to find a reduction tree.
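The IndexQueue fill order used above, i+1 up to e-1 first and then i-1 down to 0, is easy to check in isolation (standalone sketch; the identifiers are hypothetical stand-ins for the patch's locals):

    #include <cstdio>
    #include <vector>

    // Candidate order for store i: all following stores in increasing
    // distance, then all preceding stores in increasing distance, so an
    // immediately adjacent store is tried before distant ones.
    static std::vector<unsigned> buildIndexQueue(unsigned i, unsigned e) {
      std::vector<unsigned> Q;
      for (unsigned j = i + 1; j < e; ++j)
        Q.push_back(j);
      for (unsigned j = i; j > 0; --j)
        Q.push_back(j - 1);
      return Q;
    }

    int main() {
      for (unsigned j : buildIndexQueue(2, 5))
        std::printf("%u ", j);       // prints: 3 4 1 0
      std::printf("\n");
    }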
   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -3607,11 +3709,11 @@ public:
       return false;
 
     // Post order traverse the reduction tree starting at B. We only handle true
-    // trees containing only binary operators.
-    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+    // trees containing only binary operators or selects.
+    SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
     Stack.push_back(std::make_pair(B, 0));
     while (!Stack.empty()) {
-      BinaryOperator *TreeN = Stack.back().first;
+      Instruction *TreeN = Stack.back().first;
       unsigned EdgeToVist = Stack.back().second++;
       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
 
@@ -3647,9 +3749,10 @@ public:
 
       // Visit left or right.
       Value *NextV = TreeN->getOperand(EdgeToVist);
-      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
-      if (Next)
-        Stack.push_back(std::make_pair(Next, 0));
+      // We currently only allow BinaryOperator's and SelectInst's as reduction
+      // values in our tree.
+      if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
+        Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
       else if (NextV != Phi)
         return false;
     }
@@ -3717,9 +3820,12 @@ public:
     return VectorizedTree != nullptr;
   }
 
-private:
+  unsigned numReductionValues() const {
+    return ReducedVals.size();
+  }
 
-  /// \brief Calcuate the cost of a reduction.
+private:
+  /// \brief Calculate the cost of a reduction.
   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
     Type *ScalarTy = FirstReducedVal->getType();
     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
@@ -3825,6 +3931,82 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   return V->getType() < V2->getType();
 }
 
+/// \brief Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+                                BasicBlock *ParentBB, LoopInfo *LI) {
+  // There are situations where the reduction value is not dominated by the
+  // reduction phi. Vectorizing such cases has been reported to cause
+  // miscompiles. See PR25787.
+  auto DominatedReduxValue = [&](Value *R) {
+    return (
+        dyn_cast<Instruction>(R) &&
+        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+  };
+
+  Value *Rdx = nullptr;
+
+  // Return the incoming value if it comes from the same BB as the phi node.
+  if (P->getIncomingBlock(0) == ParentBB) {
+    Rdx = P->getIncomingValue(0);
+  } else if (P->getIncomingBlock(1) == ParentBB) {
+    Rdx = P->getIncomingValue(1);
+  }
+
+  if (Rdx && DominatedReduxValue(Rdx))
+    return Rdx;
+
+  // Otherwise, check whether we have a loop latch to look at.
+  Loop *BBL = LI->getLoopFor(ParentBB);
+  if (!BBL)
+    return nullptr;
+  BasicBlock *BBLatch = BBL->getLoopLatch();
+  if (!BBLatch)
+    return nullptr;
+
+  // There is a loop latch; return the incoming value if it comes from
+  // that. This reduction pattern occasionally turns up.
+  if (P->getIncomingBlock(0) == BBLatch) {
+    Rdx = P->getIncomingValue(0);
+  } else if (P->getIncomingBlock(1) == BBLatch) {
+    Rdx = P->getIncomingValue(1);
+  }
+
+  if (Rdx && DominatedReduxValue(Rdx))
+    return Rdx;
+
+  return nullptr;
+}
+
+/// \brief Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding
+/// the phi node P with reduction operators BI, then check if it
+/// can be done.
+/// \returns true if a horizontal reduction was matched and reduced.
+/// \returns false if a horizontal reduction was not matched.
+static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
+                                        BoUpSLP &R, TargetTransformInfo *TTI) {
+  if (!ShouldVectorizeHor)
+    return false;
+
+  HorizontalReduction HorRdx;
+  if (!HorRdx.matchAssociativeReduction(P, BI))
+    return false;
+
+  // If there is a sufficient number of reduction values, reduce
+  // to a nearby power-of-2. Can safely generate oversized
+  // vectors and rely on the backend to split them to legal sizes.
+  HorRdx.ReduxWidth =
+      std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+
+  return HorRdx.tryToReduce(R, TTI);
+}
+
 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
@@ -3881,7 +4063,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
     // We may go through BB multiple times so skip the one we have checked.
-    if (!VisitedInstrs.insert(it).second)
+    if (!VisitedInstrs.insert(&*it).second)
       continue;
 
     if (isa<DbgInfoIntrinsic>(it))
@@ -3892,20 +4074,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       // Check that the PHI is a reduction PHI.
       if (P->getNumIncomingValues() != 2)
         return Changed;
-      Value *Rdx =
-          (P->getIncomingBlock(0) == BB
-               ? (P->getIncomingValue(0))
-               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
-                                               : nullptr));
+
+      Value *Rdx = getReductionValue(DT, P, BB, LI);
+
       // Check if this is a Binary Operator.
       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
       if (!BI)
         continue;
 
       // Try to match and vectorize a horizontal reduction.
-      HorizontalReduction HorRdx;
-      if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
-          HorRdx.tryToReduce(R, TTI)) {
+      if (canMatchHorizontalReduction(P, BI, R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
@@ -3928,15 +4106,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      continue;
    }
 
-    // Try to vectorize horizontal reductions feeding into a store.
     if (ShouldStartVectorizeHorAtStore)
       if (StoreInst *SI = dyn_cast<StoreInst>(it))
         if (BinaryOperator *BinOp =
                 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
-          HorizontalReduction HorRdx;
-          if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
-                HorRdx.tryToReduce(R, TTI)) ||
-               tryToVectorize(BinOp, R))) {
+          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
+              tryToVectorize(BinOp, R)) {
             Changed = true;
             it = BB->begin();
             e = BB->end();
@@ -4037,10 +4212,10 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
 
 char SLPVectorizer::ID = 0;
 static const char lv_name[] = "SLP Vectorizer";
 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
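For reference, the ReduxWidth expression in canMatchHorizontalReduction rounds the number of reduction values down to a power of two with a floor of four lanes. A standalone sketch of that arithmetic (powerOf2Floor mimics llvm::PowerOf2Floor; the harness is hypothetical):

    #include <cstdint>
    #include <cstdio>

    // Round N down to a power of two by clearing low set bits until one
    // remains (returns 0 for N == 0, matching llvm::PowerOf2Floor).
    static uint64_t powerOf2Floor(uint64_t N) {
      while (N & (N - 1))
        N &= N - 1;
      return N;
    }

    // The patch's std::max((uint64_t)4, PowerOf2Floor(numReductionValues())):
    // oversized widths are deliberately allowed, since the backend can split
    // illegal vector types into legal ones.
    static uint64_t reduxWidth(uint64_t NumReducedVals) {
      uint64_t W = powerOf2Floor(NumReducedVals);
      return W < 4 ? 4 : W;
    }

    int main() {
      std::printf("%llu %llu %llu\n",
                  (unsigned long long)reduxWidth(3),    // 4
                  (unsigned long long)reduxWidth(13),   // 8
                  (unsigned long long)reduxWidth(32));  // 32
    }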