Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
40 files changed, 6941 insertions, 4796 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index a3eb07a9f6d6..1a3a4aadce6a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -14,20 +14,21 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "adce"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/InstIterator.h"
 using namespace llvm;

+#define DEBUG_TYPE "adce"
+
 STATISTIC(NumRemoved, "Number of instructions removed");

 namespace {
@@ -37,9 +38,9 @@ namespace {
       initializeADCEPass(*PassRegistry::getPassRegistry());
     }

-    virtual bool runOnFunction(Function& F);
+    bool runOnFunction(Function& F) override;

-    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+    void getAnalysisUsage(AnalysisUsage& AU) const override {
       AU.setPreservesCFG();
     }

@@ -50,6 +51,9 @@ char ADCE::ID = 0;
 INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false)

 bool ADCE::runOnFunction(Function& F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   SmallPtrSet<Instruction*, 128> alive;
   SmallVector<Instruction*, 128> worklist;

diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
deleted file mode 100644
index 007e9b79e20a..000000000000
--- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ /dev/null
@@ -1,2002 +0,0 @@
-//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass munges the code in the input function to better prepare it for
-// SelectionDAG-based code generation. This works around limitations in it's
-// basic-block-at-a-time approach. It should eventually be removed.
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "codegenprepare" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/ValueMap.h" -#include "llvm/Analysis/DominatorInternals.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Assembly/Writer.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/PatternMatch.h" -#include "llvm/Support/ValueHandle.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Transforms/Utils/BypassSlowDivision.h" -#include "llvm/Transforms/Utils/Local.h" -using namespace llvm; -using namespace llvm::PatternMatch; - -STATISTIC(NumBlocksElim, "Number of blocks eliminated"); -STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); -STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); -STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of " - "sunken Cmps"); -STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses " - "of sunken Casts"); -STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " - "computations were sunk"); -STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); -STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); -STATISTIC(NumRetsDup, "Number of return instructions duplicated"); -STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); -STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); - -static cl::opt<bool> DisableBranchOpts( - "disable-cgp-branch-opts", cl::Hidden, cl::init(false), - cl::desc("Disable branch optimizations in CodeGenPrepare")); - -static cl::opt<bool> DisableSelectToBranch( - "disable-cgp-select2branch", cl::Hidden, cl::init(false), - cl::desc("Disable select to branch conversion.")); - -namespace { - class CodeGenPrepare : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. - const TargetMachine *TM; - const TargetLowering *TLI; - const TargetLibraryInfo *TLInfo; - DominatorTree *DT; - - /// CurInstIterator - As we scan instructions optimizing them, this is the - /// next instruction to optimize. Xforms that can invalidate this should - /// update it. - BasicBlock::iterator CurInstIterator; - - /// Keeps track of non-local addresses that have been sunk into a block. - /// This allows us to avoid inserting duplicate code for blocks with - /// multiple load/stores of the same address. - ValueMap<Value*, Value*> SunkAddrs; - - /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to - /// be updated. - bool ModifiedDT; - - /// OptSize - True if optimizing for size. 
- bool OptSize; - - public: - static char ID; // Pass identification, replacement for typeid - explicit CodeGenPrepare(const TargetMachine *TM = 0) - : FunctionPass(ID), TM(TM), TLI(0) { - initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F); - - const char *getPassName() const { return "CodeGen Prepare"; } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved<DominatorTree>(); - AU.addRequired<TargetLibraryInfo>(); - } - - private: - bool EliminateFallThrough(Function &F); - bool EliminateMostlyEmptyBlocks(Function &F); - bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; - void EliminateMostlyEmptyBlock(BasicBlock *BB); - bool OptimizeBlock(BasicBlock &BB); - bool OptimizeInst(Instruction *I); - bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy); - bool OptimizeInlineAsmInst(CallInst *CS); - bool OptimizeCallInst(CallInst *CI); - bool MoveExtToFormExtLoad(Instruction *I); - bool OptimizeExtUses(Instruction *I); - bool OptimizeSelectInst(SelectInst *SI); - bool DupRetToEnableTailCallOpts(BasicBlock *BB); - bool PlaceDbgValues(Function &F); - }; -} - -char CodeGenPrepare::ID = 0; -INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare", - "Optimize for code generation", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", - "Optimize for code generation", false, false) - -FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { - return new CodeGenPrepare(TM); -} - -bool CodeGenPrepare::runOnFunction(Function &F) { - bool EverMadeChange = false; - - ModifiedDT = false; - if (TM) TLI = TM->getTargetLowering(); - TLInfo = &getAnalysis<TargetLibraryInfo>(); - DT = getAnalysisIfAvailable<DominatorTree>(); - OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize); - - /// This optimization identifies DIV instructions that can be - /// profitably bypassed and carried out with a shorter, faster divide. - if (!OptSize && TLI && TLI->isSlowDivBypassed()) { - const DenseMap<unsigned int, unsigned int> &BypassWidths = - TLI->getBypassSlowDivWidths(); - for (Function::iterator I = F.begin(); I != F.end(); I++) - EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); - } - - // Eliminate blocks that contain only PHI nodes and an - // unconditional branch. - EverMadeChange |= EliminateMostlyEmptyBlocks(F); - - // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not - // find a node corresponding to the value. - EverMadeChange |= PlaceDbgValues(F); - - bool MadeChange = true; - while (MadeChange) { - MadeChange = false; - for (Function::iterator I = F.begin(); I != F.end(); ) { - BasicBlock *BB = I++; - MadeChange |= OptimizeBlock(*BB); - } - EverMadeChange |= MadeChange; - } - - SunkAddrs.clear(); - - if (!DisableBranchOpts) { - MadeChange = false; - SmallPtrSet<BasicBlock*, 8> WorkList; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); - MadeChange |= ConstantFoldTerminator(BB, true); - if (!MadeChange) continue; - - for (SmallVectorImpl<BasicBlock*>::iterator - II = Successors.begin(), IE = Successors.end(); II != IE; ++II) - if (pred_begin(*II) == pred_end(*II)) - WorkList.insert(*II); - } - - // Delete the dead blocks and any of their dead successors. 
- MadeChange |= !WorkList.empty(); - while (!WorkList.empty()) { - BasicBlock *BB = *WorkList.begin(); - WorkList.erase(BB); - SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); - - DeleteDeadBlock(BB); - - for (SmallVectorImpl<BasicBlock*>::iterator - II = Successors.begin(), IE = Successors.end(); II != IE; ++II) - if (pred_begin(*II) == pred_end(*II)) - WorkList.insert(*II); - } - - // Merge pairs of basic blocks with unconditional branches, connected by - // a single edge. - if (EverMadeChange || MadeChange) - MadeChange |= EliminateFallThrough(F); - - if (MadeChange) - ModifiedDT = true; - EverMadeChange |= MadeChange; - } - - if (ModifiedDT && DT) - DT->DT->recalculate(F); - - return EverMadeChange; -} - -/// EliminateFallThrough - Merge basic blocks which are connected -/// by a single edge, where one of the basic blocks has a single successor -/// pointing to the other basic block, which has a single predecessor. -bool CodeGenPrepare::EliminateFallThrough(Function &F) { - bool Changed = false; - // Scan all of the blocks in the function, except for the entry block. - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) { - BasicBlock *BB = I++; - // If the destination block has a single pred, then this is a trivial - // edge, just collapse it. - BasicBlock *SinglePred = BB->getSinglePredecessor(); - - // Don't merge if BB's address is taken. - if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; - - BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); - if (Term && !Term->isConditional()) { - Changed = true; - DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n"); - // Remember if SinglePred was the entry block of the function. - // If so, we will need to move BB back to the entry position. - bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(BB, this); - - if (isEntry && BB != &BB->getParent()->getEntryBlock()) - BB->moveBefore(&BB->getParent()->getEntryBlock()); - - // We have erased a block. Update the iterator. - I = BB; - } - } - return Changed; -} - -/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes, -/// debug info directives, and an unconditional branch. Passes before isel -/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for -/// isel. Start by eliminating these blocks so we can split them the way we -/// want them. -bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { - bool MadeChange = false; - // Note that this intentionally skips the entry block. - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) { - BasicBlock *BB = I++; - - // If this block doesn't end with an uncond branch, ignore it. - BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); - if (!BI || !BI->isUnconditional()) - continue; - - // If the instruction before the branch (skipping debug info) isn't a phi - // node, then other stuff is happening here. - BasicBlock::iterator BBI = BI; - if (BBI != BB->begin()) { - --BBI; - while (isa<DbgInfoIntrinsic>(BBI)) { - if (BBI == BB->begin()) - break; - --BBI; - } - if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI)) - continue; - } - - // Do not break infinite loops. 
- BasicBlock *DestBB = BI->getSuccessor(0); - if (DestBB == BB) - continue; - - if (!CanMergeBlocks(BB, DestBB)) - continue; - - EliminateMostlyEmptyBlock(BB); - MadeChange = true; - } - return MadeChange; -} - -/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a -/// single uncond branch between them, and BB contains no other non-phi -/// instructions. -bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, - const BasicBlock *DestBB) const { - // We only want to eliminate blocks whose phi nodes are used by phi nodes in - // the successor. If there are more complex condition (e.g. preheaders), - // don't mess around with them. - BasicBlock::const_iterator BBI = BB->begin(); - while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) { - for (Value::const_use_iterator UI = PN->use_begin(), E = PN->use_end(); - UI != E; ++UI) { - const Instruction *User = cast<Instruction>(*UI); - if (User->getParent() != DestBB || !isa<PHINode>(User)) - return false; - // If User is inside DestBB block and it is a PHINode then check - // incoming value. If incoming value is not from BB then this is - // a complex condition (e.g. preheaders) we want to avoid here. - if (User->getParent() == DestBB) { - if (const PHINode *UPN = dyn_cast<PHINode>(User)) - for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) { - Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I)); - if (Insn && Insn->getParent() == BB && - Insn->getParent() != UPN->getIncomingBlock(I)) - return false; - } - } - } - } - - // If BB and DestBB contain any common predecessors, then the phi nodes in BB - // and DestBB may have conflicting incoming values for the block. If so, we - // can't merge the block. - const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin()); - if (!DestBBPN) return true; // no conflict. - - // Collect the preds of BB. - SmallPtrSet<const BasicBlock*, 16> BBPreds; - if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) { - // It is faster to get preds from a PHI than with pred_iterator. - for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) - BBPreds.insert(BBPN->getIncomingBlock(i)); - } else { - BBPreds.insert(pred_begin(BB), pred_end(BB)); - } - - // Walk the preds of DestBB. - for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) { - BasicBlock *Pred = DestBBPN->getIncomingBlock(i); - if (BBPreds.count(Pred)) { // Common predecessor? - BBI = DestBB->begin(); - while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) { - const Value *V1 = PN->getIncomingValueForBlock(Pred); - const Value *V2 = PN->getIncomingValueForBlock(BB); - - // If V2 is a phi node in BB, look up what the mapped value will be. - if (const PHINode *V2PN = dyn_cast<PHINode>(V2)) - if (V2PN->getParent() == BB) - V2 = V2PN->getIncomingValueForBlock(Pred); - - // If there is a conflict, bail out. - if (V1 != V2) return false; - } - } - } - - return true; -} - - -/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and -/// an unconditional branch in it. -void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { - BranchInst *BI = cast<BranchInst>(BB->getTerminator()); - BasicBlock *DestBB = BI->getSuccessor(0); - - DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB); - - // If the destination block has a single pred, then this is a trivial edge, - // just collapse it. 
- if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) { - if (SinglePred != DestBB) { - // Remember if SinglePred was the entry block of the function. If so, we - // will need to move BB back to the entry position. - bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(DestBB, this); - - if (isEntry && BB != &BB->getParent()->getEntryBlock()) - BB->moveBefore(&BB->getParent()->getEntryBlock()); - - DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); - return; - } - } - - // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB - // to handle the new incoming edges it is about to have. - PHINode *PN; - for (BasicBlock::iterator BBI = DestBB->begin(); - (PN = dyn_cast<PHINode>(BBI)); ++BBI) { - // Remove the incoming value for BB, and remember it. - Value *InVal = PN->removeIncomingValue(BB, false); - - // Two options: either the InVal is a phi node defined in BB or it is some - // value that dominates BB. - PHINode *InValPhi = dyn_cast<PHINode>(InVal); - if (InValPhi && InValPhi->getParent() == BB) { - // Add all of the input values of the input PHI as inputs of this phi. - for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i) - PN->addIncoming(InValPhi->getIncomingValue(i), - InValPhi->getIncomingBlock(i)); - } else { - // Otherwise, add one instance of the dominating value for each edge that - // we will be adding. - if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) { - for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) - PN->addIncoming(InVal, BBPN->getIncomingBlock(i)); - } else { - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - PN->addIncoming(InVal, *PI); - } - } - } - - // The PHIs are now updated, change everything that refers to BB to use - // DestBB and remove BB. - BB->replaceAllUsesWith(DestBB); - if (DT && !ModifiedDT) { - BasicBlock *BBIDom = DT->getNode(BB)->getIDom()->getBlock(); - BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock(); - BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom); - DT->changeImmediateDominator(DestBB, NewIDom); - DT->eraseNode(BB); - } - BB->eraseFromParent(); - ++NumBlocksElim; - - DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); -} - -/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop -/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC), -/// sink it into user blocks to reduce the number of virtual -/// registers that must be created and coalesced. -/// -/// Return true if any changes are made. -/// -static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ - // If this is a noop copy, - EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(CI->getType()); - - // This is an fp<->int conversion? - if (SrcVT.isInteger() != DstVT.isInteger()) - return false; - - // If this is an extension, it will be a zero or sign extension, which - // isn't a noop. - if (SrcVT.bitsLT(DstVT)) return false; - - // If these values will be promoted, find out what they will be promoted - // to. This helps us consider truncates on PPC as noop copies when they - // are. 
- if (TLI.getTypeAction(CI->getContext(), SrcVT) == - TargetLowering::TypePromoteInteger) - SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT); - if (TLI.getTypeAction(CI->getContext(), DstVT) == - TargetLowering::TypePromoteInteger) - DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT); - - // If, after promotion, these are the same types, this is a noop copy. - if (SrcVT != DstVT) - return false; - - BasicBlock *DefBB = CI->getParent(); - - /// InsertedCasts - Only insert a cast in each block once. - DenseMap<BasicBlock*, CastInst*> InsertedCasts; - - bool MadeChange = false; - for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); - UI != E; ) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI); - - // Figure out which BB this cast is used in. For PHI's this is the - // appropriate predecessor block. - BasicBlock *UserBB = User->getParent(); - if (PHINode *PN = dyn_cast<PHINode>(User)) { - UserBB = PN->getIncomingBlock(UI); - } - - // Preincrement use iterator so we don't invalidate it. - ++UI; - - // If this user is in the same block as the cast, don't change the cast. - if (UserBB == DefBB) continue; - - // If we have already inserted a cast into this block, use it. - CastInst *&InsertedCast = InsertedCasts[UserBB]; - - if (!InsertedCast) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedCast = - CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", - InsertPt); - MadeChange = true; - } - - // Replace a use of the cast with a use of the new cast. - TheUse = InsertedCast; - ++NumCastUses; - } - - // If we removed all uses, nuke the cast. - if (CI->use_empty()) { - CI->eraseFromParent(); - MadeChange = true; - } - - return MadeChange; -} - -/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce -/// the number of virtual registers that must be created and coalesced. This is -/// a clear win except on targets with multiple condition code registers -/// (PowerPC), where it might lose; some adjustment may be wanted there. -/// -/// Return true if any changes are made. -static bool OptimizeCmpExpression(CmpInst *CI) { - BasicBlock *DefBB = CI->getParent(); - - /// InsertedCmp - Only insert a cmp in each block once. - DenseMap<BasicBlock*, CmpInst*> InsertedCmps; - - bool MadeChange = false; - for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); - UI != E; ) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI); - - // Preincrement use iterator so we don't invalidate it. - ++UI; - - // Don't bother for PHI nodes. - if (isa<PHINode>(User)) - continue; - - // Figure out which BB this cmp is used in. - BasicBlock *UserBB = User->getParent(); - - // If this user is in the same block as the cmp, don't change the cmp. - if (UserBB == DefBB) continue; - - // If we have already inserted a cmp into this block, use it. - CmpInst *&InsertedCmp = InsertedCmps[UserBB]; - - if (!InsertedCmp) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedCmp = - CmpInst::Create(CI->getOpcode(), - CI->getPredicate(), CI->getOperand(0), - CI->getOperand(1), "", InsertPt); - MadeChange = true; - } - - // Replace a use of the cmp with a use of the new cmp. - TheUse = InsertedCmp; - ++NumCmpUses; - } - - // If we removed all uses, nuke the cmp. 
- if (CI->use_empty()) - CI->eraseFromParent(); - - return MadeChange; -} - -namespace { -class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls { -protected: - void replaceCall(Value *With) { - CI->replaceAllUsesWith(With); - CI->eraseFromParent(); - } - bool isFoldable(unsigned SizeCIOp, unsigned, bool) const { - if (ConstantInt *SizeCI = - dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) - return SizeCI->isAllOnesValue(); - return false; - } -}; -} // end anonymous namespace - -bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { - BasicBlock *BB = CI->getParent(); - - // Lower inline assembly if we can. - // If we found an inline asm expession, and if the target knows how to - // lower it to normal LLVM code, do so now. - if (TLI && isa<InlineAsm>(CI->getCalledValue())) { - if (TLI->ExpandInlineAsm(CI)) { - // Avoid invalidating the iterator. - CurInstIterator = BB->begin(); - // Avoid processing instructions out of order, which could cause - // reuse before a value is defined. - SunkAddrs.clear(); - return true; - } - // Sink address computing for memory operands into the block. - if (OptimizeInlineAsmInst(CI)) - return true; - } - - // Lower all uses of llvm.objectsize.* - IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); - if (II && II->getIntrinsicID() == Intrinsic::objectsize) { - bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); - Type *ReturnTy = CI->getType(); - Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); - - // Substituting this can cause recursive simplifications, which can - // invalidate our iterator. Use a WeakVH to hold onto it in case this - // happens. - WeakVH IterHandle(CurInstIterator); - - replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0, - TLInfo, ModifiedDT ? 0 : DT); - - // If the iterator instruction was recursively deleted, start over at the - // start of the block. - if (IterHandle != CurInstIterator) { - CurInstIterator = BB->begin(); - SunkAddrs.clear(); - } - return true; - } - - if (II && TLI) { - SmallVector<Value*, 2> PtrOps; - Type *AccessTy; - if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) - while (!PtrOps.empty()) - if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) - return true; - } - - // From here on out we're working with named functions. - if (CI->getCalledFunction() == 0) return false; - - // We'll need DataLayout from here on out. - const DataLayout *TD = TLI ? TLI->getDataLayout() : 0; - if (!TD) return false; - - // Lower all default uses of _chk calls. This is very similar - // to what InstCombineCalls does, but here we are only lowering calls - // that have the default "don't know" as the objectsize. Anything else - // should be left alone. - CodeGenPrepareFortifiedLibCalls Simplifier; - return Simplifier.fold(CI, TD, TLInfo); -} - -/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return -/// instructions to the predecessor to enable tail call optimizations. 
The -/// case it is currently looking for is: -/// @code -/// bb0: -/// %tmp0 = tail call i32 @f0() -/// br label %return -/// bb1: -/// %tmp1 = tail call i32 @f1() -/// br label %return -/// bb2: -/// %tmp2 = tail call i32 @f2() -/// br label %return -/// return: -/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] -/// ret i32 %retval -/// @endcode -/// -/// => -/// -/// @code -/// bb0: -/// %tmp0 = tail call i32 @f0() -/// ret i32 %tmp0 -/// bb1: -/// %tmp1 = tail call i32 @f1() -/// ret i32 %tmp1 -/// bb2: -/// %tmp2 = tail call i32 @f2() -/// ret i32 %tmp2 -/// @endcode -bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { - if (!TLI) - return false; - - ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()); - if (!RI) - return false; - - PHINode *PN = 0; - BitCastInst *BCI = 0; - Value *V = RI->getReturnValue(); - if (V) { - BCI = dyn_cast<BitCastInst>(V); - if (BCI) - V = BCI->getOperand(0); - - PN = dyn_cast<PHINode>(V); - if (!PN) - return false; - } - - if (PN && PN->getParent() != BB) - return false; - - // It's not safe to eliminate the sign / zero extension of the return value. - // See llvm::isInTailCallPosition(). - const Function *F = BB->getParent(); - AttributeSet CallerAttrs = F->getAttributes(); - if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || - CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) - return false; - - // Make sure there are no instructions between the PHI and return, or that the - // return is the first instruction in the block. - if (PN) { - BasicBlock::iterator BI = BB->begin(); - do { ++BI; } while (isa<DbgInfoIntrinsic>(BI)); - if (&*BI == BCI) - // Also skip over the bitcast. - ++BI; - if (&*BI != RI) - return false; - } else { - BasicBlock::iterator BI = BB->begin(); - while (isa<DbgInfoIntrinsic>(BI)) ++BI; - if (&*BI != RI) - return false; - } - - /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail - /// call. - SmallVector<CallInst*, 4> TailCalls; - if (PN) { - for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { - CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I)); - // Make sure the phi value is indeed produced by the tail call. - if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) && - TLI->mayBeEmittedAsTailCall(CI)) - TailCalls.push_back(CI); - } - } else { - SmallPtrSet<BasicBlock*, 4> VisitedBBs; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (!VisitedBBs.insert(*PI)) - continue; - - BasicBlock::InstListType &InstList = (*PI)->getInstList(); - BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin(); - BasicBlock::InstListType::reverse_iterator RE = InstList.rend(); - do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI)); - if (RI == RE) - continue; - - CallInst *CI = dyn_cast<CallInst>(&*RI); - if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI)) - TailCalls.push_back(CI); - } - } - - bool Changed = false; - for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) { - CallInst *CI = TailCalls[i]; - CallSite CS(CI); - - // Conservatively require the attributes of the call to match those of the - // return. Ignore noalias because it doesn't affect the call sequence. - AttributeSet CalleeAttrs = CS.getAttributes(); - if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias) != - AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). 
- removeAttribute(Attribute::NoAlias)) - continue; - - // Make sure the call instruction is followed by an unconditional branch to - // the return block. - BasicBlock *CallBB = CI->getParent(); - BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator()); - if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB) - continue; - - // Duplicate the return into CallBB. - (void)FoldReturnIntoUncondBranch(RI, BB, CallBB); - ModifiedDT = Changed = true; - ++NumRetsDup; - } - - // If we eliminated all predecessors of the block, delete the block now. - if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) - BB->eraseFromParent(); - - return Changed; -} - -//===----------------------------------------------------------------------===// -// Memory Optimization -//===----------------------------------------------------------------------===// - -namespace { - -/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode -/// which holds actual Value*'s for register values. -struct ExtAddrMode : public TargetLowering::AddrMode { - Value *BaseReg; - Value *ScaledReg; - ExtAddrMode() : BaseReg(0), ScaledReg(0) {} - void print(raw_ostream &OS) const; - void dump() const; - - bool operator==(const ExtAddrMode& O) const { - return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && - (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && - (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale); - } -}; - -#ifndef NDEBUG -static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { - AM.print(OS); - return OS; -} -#endif - -void ExtAddrMode::print(raw_ostream &OS) const { - bool NeedPlus = false; - OS << "["; - if (BaseGV) { - OS << (NeedPlus ? " + " : "") - << "GV:"; - WriteAsOperand(OS, BaseGV, /*PrintType=*/false); - NeedPlus = true; - } - - if (BaseOffs) - OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true; - - if (BaseReg) { - OS << (NeedPlus ? " + " : "") - << "Base:"; - WriteAsOperand(OS, BaseReg, /*PrintType=*/false); - NeedPlus = true; - } - if (Scale) { - OS << (NeedPlus ? " + " : "") - << Scale << "*"; - WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); - } - - OS << ']'; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void ExtAddrMode::dump() const { - print(dbgs()); - dbgs() << '\n'; -} -#endif - - -/// \brief A helper class for matching addressing modes. -/// -/// This encapsulates the logic for matching the target-legal addressing modes. -class AddressingModeMatcher { - SmallVectorImpl<Instruction*> &AddrModeInsts; - const TargetLowering &TLI; - - /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and - /// the memory instruction that we're computing this address for. - Type *AccessTy; - Instruction *MemoryInst; - - /// AddrMode - This is the addressing mode that we're building up. This is - /// part of the return value of this addressing mode matching stuff. - ExtAddrMode &AddrMode; - - /// IgnoreProfitability - This is set to true when we should not do - /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode - /// always returns true. - bool IgnoreProfitability; - - AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI, - const TargetLowering &T, Type *AT, - Instruction *MI, ExtAddrMode &AM) - : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM) { - IgnoreProfitability = false; - } -public: - - /// Match - Find the maximal addressing mode that a load/store of V can fold, - /// give an access type of AccessTy. 
This returns a list of involved - /// instructions in AddrModeInsts. - static ExtAddrMode Match(Value *V, Type *AccessTy, - Instruction *MemoryInst, - SmallVectorImpl<Instruction*> &AddrModeInsts, - const TargetLowering &TLI) { - ExtAddrMode Result; - - bool Success = - AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, - MemoryInst, Result).MatchAddr(V, 0); - (void)Success; assert(Success && "Couldn't select *anything*?"); - return Result; - } -private: - bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); - bool MatchAddr(Value *V, unsigned Depth); - bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth); - bool IsProfitableToFoldIntoAddressingMode(Instruction *I, - ExtAddrMode &AMBefore, - ExtAddrMode &AMAfter); - bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); -}; - -/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. -/// Return true and update AddrMode if this addr mode is legal for the target, -/// false if not. -bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, - unsigned Depth) { - // If Scale is 1, then this is the same as adding ScaleReg to the addressing - // mode. Just process that directly. - if (Scale == 1) - return MatchAddr(ScaleReg, Depth); - - // If the scale is 0, it takes nothing to add this. - if (Scale == 0) - return true; - - // If we already have a scale of this value, we can add to it, otherwise, we - // need an available scale field. - if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) - return false; - - ExtAddrMode TestAddrMode = AddrMode; - - // Add scale to turn X*4+X*3 -> X*7. This could also do things like - // [A+B + A*7] -> [B+A*8]. - TestAddrMode.Scale += Scale; - TestAddrMode.ScaledReg = ScaleReg; - - // If the new address isn't legal, bail out. - if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) - return false; - - // It was legal, so commit it. - AddrMode = TestAddrMode; - - // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now - // to see if ScaleReg is actually X+C. If so, we can turn this into adding - // X*Scale + C*Scale to addr mode. - ConstantInt *CI = 0; Value *AddLHS = 0; - if (isa<Instruction>(ScaleReg) && // not a constant expr. - match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { - TestAddrMode.ScaledReg = AddLHS; - TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; - - // If this addressing mode is legal, commit it and remember that we folded - // this instruction. - if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { - AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); - AddrMode = TestAddrMode; - return true; - } - } - - // Otherwise, not (x+c)*scale, just return what we have. - return true; -} - -/// MightBeFoldableInst - This is a little filter, which returns true if an -/// addressing computation involving I might be folded into a load/store -/// accessing it. This doesn't need to be perfect, but needs to accept at least -/// the set of instructions that MatchOperationAddr can. -static bool MightBeFoldableInst(Instruction *I) { - switch (I->getOpcode()) { - case Instruction::BitCast: - // Don't touch identity bitcasts. - if (I->getType() == I->getOperand(0)->getType()) - return false; - return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); - case Instruction::PtrToInt: - // PtrToInt is always a noop, as we know that the int type is pointer sized. 
- return true; - case Instruction::IntToPtr: - // We know the input is intptr_t, so this is foldable. - return true; - case Instruction::Add: - return true; - case Instruction::Mul: - case Instruction::Shl: - // Can only handle X*C and X << C. - return isa<ConstantInt>(I->getOperand(1)); - case Instruction::GetElementPtr: - return true; - default: - return false; - } -} - -/// MatchOperationAddr - Given an instruction or constant expr, see if we can -/// fold the operation into the addressing mode. If so, update the addressing -/// mode and return true, otherwise return false without modifying AddrMode. -bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, - unsigned Depth) { - // Avoid exponential behavior on extremely deep expression trees. - if (Depth >= 5) return false; - - switch (Opcode) { - case Instruction::PtrToInt: - // PtrToInt is always a noop, as we know that the int type is pointer sized. - return MatchAddr(AddrInst->getOperand(0), Depth); - case Instruction::IntToPtr: - // This inttoptr is a no-op if the integer type is pointer sized. - if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == - TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace())) - return MatchAddr(AddrInst->getOperand(0), Depth); - return false; - case Instruction::BitCast: - // BitCast is always a noop, and we can handle it as long as it is - // int->int or pointer->pointer (we don't want int<->fp or something). - if ((AddrInst->getOperand(0)->getType()->isPointerTy() || - AddrInst->getOperand(0)->getType()->isIntegerTy()) && - // Don't touch identity bitcasts. These were probably put here by LSR, - // and we don't want to mess around with them. Assume it knows what it - // is doing. - AddrInst->getOperand(0)->getType() != AddrInst->getType()) - return MatchAddr(AddrInst->getOperand(0), Depth); - return false; - case Instruction::Add: { - // Check to see if we can merge in the RHS then the LHS. If so, we win. - ExtAddrMode BackupAddrMode = AddrMode; - unsigned OldSize = AddrModeInsts.size(); - if (MatchAddr(AddrInst->getOperand(1), Depth+1) && - MatchAddr(AddrInst->getOperand(0), Depth+1)) - return true; - - // Restore the old addr mode info. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - - // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. - if (MatchAddr(AddrInst->getOperand(0), Depth+1) && - MatchAddr(AddrInst->getOperand(1), Depth+1)) - return true; - - // Otherwise we definitely can't merge the ADD in. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - break; - } - //case Instruction::Or: - // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. - //break; - case Instruction::Mul: - case Instruction::Shl: { - // Can only handle X*C and X << C. - ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); - if (!RHS) return false; - int64_t Scale = RHS->getSExtValue(); - if (Opcode == Instruction::Shl) - Scale = 1LL << Scale; - - return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); - } - case Instruction::GetElementPtr: { - // Scan the GEP. We check it if it contains constant offsets and at most - // one variable offset. 
- int VariableOperand = -1; - unsigned VariableScale = 0; - - int64_t ConstantOffset = 0; - const DataLayout *TD = TLI.getDataLayout(); - gep_type_iterator GTI = gep_type_begin(AddrInst); - for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - const StructLayout *SL = TD->getStructLayout(STy); - unsigned Idx = - cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); - ConstantOffset += SL->getElementOffset(Idx); - } else { - uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { - ConstantOffset += CI->getSExtValue()*TypeSize; - } else if (TypeSize) { // Scales of zero don't do anything. - // We only allow one variable index at the moment. - if (VariableOperand != -1) - return false; - - // Remember the variable index. - VariableOperand = i; - VariableScale = TypeSize; - } - } - } - - // A common case is for the GEP to only do a constant offset. In this case, - // just add it to the disp field and check validity. - if (VariableOperand == -1) { - AddrMode.BaseOffs += ConstantOffset; - if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){ - // Check to see if we can fold the base pointer in too. - if (MatchAddr(AddrInst->getOperand(0), Depth+1)) - return true; - } - AddrMode.BaseOffs -= ConstantOffset; - return false; - } - - // Save the valid addressing mode in case we can't match. - ExtAddrMode BackupAddrMode = AddrMode; - unsigned OldSize = AddrModeInsts.size(); - - // See if the scale and offset amount is valid for this target. - AddrMode.BaseOffs += ConstantOffset; - - // Match the base operand of the GEP. - if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) { - // If it couldn't be matched, just stuff the value in a register. - if (AddrMode.HasBaseReg) { - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - return false; - } - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = AddrInst->getOperand(0); - } - - // Match the remaining variable portion of the GEP. - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, - Depth)) { - // If it couldn't be matched, try stuffing the base into a register - // instead of matching it, and retrying the match of the scale. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - if (AddrMode.HasBaseReg) - return false; - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = AddrInst->getOperand(0); - AddrMode.BaseOffs += ConstantOffset; - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), - VariableScale, Depth)) { - // If even that didn't work, bail. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - return false; - } - } - - return true; - } - } - return false; -} - -/// MatchAddr - If we can, try to add the value of 'Addr' into the current -/// addressing mode. If Addr can't be added to AddrMode this returns false and -/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type -/// or intptr_t for the target. -/// -bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { - // Fold in immediates if legal for the target. - AddrMode.BaseOffs += CI->getSExtValue(); - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.BaseOffs -= CI->getSExtValue(); - } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { - // If this is a global variable, try to fold it into the addressing mode. 
- if (AddrMode.BaseGV == 0) { - AddrMode.BaseGV = GV; - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.BaseGV = 0; - } - } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { - ExtAddrMode BackupAddrMode = AddrMode; - unsigned OldSize = AddrModeInsts.size(); - - // Check to see if it is possible to fold this operation. - if (MatchOperationAddr(I, I->getOpcode(), Depth)) { - // Okay, it's possible to fold this. Check to see if it is actually - // *profitable* to do so. We use a simple cost model to avoid increasing - // register pressure too much. - if (I->hasOneUse() || - IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { - AddrModeInsts.push_back(I); - return true; - } - - // It isn't profitable to do this, roll back. - //cerr << "NOT FOLDING: " << *I; - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - } - } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { - if (MatchOperationAddr(CE, CE->getOpcode(), Depth)) - return true; - } else if (isa<ConstantPointerNull>(Addr)) { - // Null pointer gets folded without affecting the addressing mode. - return true; - } - - // Worse case, the target should support [reg] addressing modes. :) - if (!AddrMode.HasBaseReg) { - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = Addr; - // Still check for legality in case the target supports [imm] but not [i+r]. - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.HasBaseReg = false; - AddrMode.BaseReg = 0; - } - - // If the base register is already taken, see if we can do [r+r]. - if (AddrMode.Scale == 0) { - AddrMode.Scale = 1; - AddrMode.ScaledReg = Addr; - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.Scale = 0; - AddrMode.ScaledReg = 0; - } - // Couldn't match. - return false; -} - -/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified -/// inline asm call are due to memory operands. If so, return true, otherwise -/// return false. -static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, - const TargetLowering &TLI) { - TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - - // Compute the constraint code and ConstraintType to use. - TLI.ComputeConstraintToUse(OpInfo, SDValue()); - - // If this asm operand is our Value*, and if it isn't an indirect memory - // operand, we can't fold it! - if (OpInfo.CallOperandVal == OpVal && - (OpInfo.ConstraintType != TargetLowering::C_Memory || - !OpInfo.isIndirect)) - return false; - } - - return true; -} - -/// FindAllMemoryUses - Recursively walk all the uses of I until we find a -/// memory use. If we find an obviously non-foldable instruction, return true. -/// Add the ultimately found memory instructions to MemoryUses. -static bool FindAllMemoryUses(Instruction *I, - SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses, - SmallPtrSet<Instruction*, 16> &ConsideredInsts, - const TargetLowering &TLI) { - // If we already considered this instruction, we're done. - if (!ConsideredInsts.insert(I)) - return false; - - // If this is an obviously unfoldable instruction, bail out. - if (!MightBeFoldableInst(I)) - return true; - - // Loop over all the uses, recursively processing them. 
- for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) { - User *U = *UI; - - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - unsigned opNo = UI.getOperandNo(); - if (opNo == 0) return true; // Storing addr, not into addr. - MemoryUses.push_back(std::make_pair(SI, opNo)); - continue; - } - - if (CallInst *CI = dyn_cast<CallInst>(U)) { - InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); - if (!IA) return true; - - // If this is a memory operand, we're cool, otherwise bail out. - if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) - return true; - continue; - } - - if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, - TLI)) - return true; - } - - return false; -} - -/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at -/// the use site that we're folding it into. If so, there is no cost to -/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values -/// that we know are live at the instruction already. -bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, - Value *KnownLive2) { - // If Val is either of the known-live values, we know it is live! - if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) - return true; - - // All values other than instructions and arguments (e.g. constants) are live. - if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; - - // If Val is a constant sized alloca in the entry block, it is live, this is - // true because it is just a reference to the stack/frame pointer, which is - // live for the whole function. - if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) - if (AI->isStaticAlloca()) - return true; - - // Check to see if this value is already used in the memory instruction's - // block. If so, it's already live into the block at the very least, so we - // can reasonably fold it. - return Val->isUsedInBasicBlock(MemoryInst->getParent()); -} - -/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing -/// mode of the machine to fold the specified instruction into a load or store -/// that ultimately uses it. However, the specified instruction has multiple -/// uses. Given this, it may actually increase register pressure to fold it -/// into the load. For example, consider this code: -/// -/// X = ... -/// Y = X+1 -/// use(Y) -> nonload/store -/// Z = Y+1 -/// load Z -/// -/// In this case, Y has multiple uses, and can be folded into the load of Z -/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to -/// be live at the use(Y) line. If we don't fold Y into load Z, we use one -/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the -/// number of computations either. -/// -/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If -/// X was live across 'load Z' for other reasons, we actually *would* want to -/// fold the addressing mode in the Z case. This would make Y die earlier. -bool AddressingModeMatcher:: -IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, - ExtAddrMode &AMAfter) { - if (IgnoreProfitability) return true; - - // AMBefore is the addressing mode before this instruction was folded into it, - // and AMAfter is the addressing mode after the instruction was folded. 
Get - // the set of registers referenced by AMAfter and subtract out those - // referenced by AMBefore: this is the set of values which folding in this - // address extends the lifetime of. - // - // Note that there are only two potential values being referenced here, - // BaseReg and ScaleReg (global addresses are always available, as are any - // folded immediates). - Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; - - // If the BaseReg or ScaledReg was referenced by the previous addrmode, their - // lifetime wasn't extended by adding this instruction. - if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) - BaseReg = 0; - if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) - ScaledReg = 0; - - // If folding this instruction (and it's subexprs) didn't extend any live - // ranges, we're ok with it. - if (BaseReg == 0 && ScaledReg == 0) - return true; - - // If all uses of this instruction are ultimately load/store/inlineasm's, - // check to see if their addressing modes will include this instruction. If - // so, we can fold it into all uses, so it doesn't matter if it has multiple - // uses. - SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; - SmallPtrSet<Instruction*, 16> ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) - return false; // Has a non-memory, non-foldable use! - - // Now that we know that all uses of this instruction are part of a chain of - // computation involving only operations that could theoretically be folded - // into a memory use, loop over each of these uses and see if they could - // *actually* fold the instruction. - SmallVector<Instruction*, 32> MatchedAddrModeInsts; - for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { - Instruction *User = MemoryUses[i].first; - unsigned OpNo = MemoryUses[i].second; - - // Get the access type of this use. If the use isn't a pointer, we don't - // know what it accesses. - Value *Address = User->getOperand(OpNo); - if (!Address->getType()->isPointerTy()) - return false; - Type *AddressAccessTy = Address->getType()->getPointerElementType(); - - // Do a match against the root of this address, ignoring profitability. This - // will tell us if the addressing mode for the memory operation will - // *actually* cover the shared instruction. - ExtAddrMode Result; - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy, - MemoryInst, Result); - Matcher.IgnoreProfitability = true; - bool Success = Matcher.MatchAddr(Address, 0); - (void)Success; assert(Success && "Couldn't select *anything*?"); - - // If the match didn't cover I, then it won't be shared by it. - if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), - I) == MatchedAddrModeInsts.end()) - return false; - - MatchedAddrModeInsts.clear(); - } - - return true; -} - -} // end anonymous namespace - -/// IsNonLocalValue - Return true if the specified values are defined in a -/// different basic block than BB. -static bool IsNonLocalValue(Value *V, BasicBlock *BB) { - if (Instruction *I = dyn_cast<Instruction>(V)) - return I->getParent() != BB; - return false; -} - -/// OptimizeMemoryInst - Load and Store Instructions often have -/// addressing modes that can do significant amounts of computation. As such, -/// instruction selection will try to get the load or store to do as much -/// computation as possible for the program. The problem is that isel can only -/// see within a single block. 
As such, we sink as much legal addressing mode -/// stuff into the block as possible. -/// -/// This method is used to optimize both load/store and inline asms with memory -/// operands. -bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, - Type *AccessTy) { - Value *Repl = Addr; - - // Try to collapse single-value PHI nodes. This is necessary to undo - // unprofitable PRE transformations. - SmallVector<Value*, 8> worklist; - SmallPtrSet<Value*, 16> Visited; - worklist.push_back(Addr); - - // Use a worklist to iteratively look through PHI nodes, and ensure that - // the addressing mode obtained from the non-PHI roots of the graph - // are equivalent. - Value *Consensus = 0; - unsigned NumUsesConsensus = 0; - bool IsNumUsesConsensusValid = false; - SmallVector<Instruction*, 16> AddrModeInsts; - ExtAddrMode AddrMode; - while (!worklist.empty()) { - Value *V = worklist.back(); - worklist.pop_back(); - - // Break use-def graph loops. - if (!Visited.insert(V)) { - Consensus = 0; - break; - } - - // For a PHI node, push all of its incoming values. - if (PHINode *P = dyn_cast<PHINode>(V)) { - for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) - worklist.push_back(P->getIncomingValue(i)); - continue; - } - - // For non-PHIs, determine the addressing mode being computed. - SmallVector<Instruction*, 16> NewAddrModeInsts; - ExtAddrMode NewAddrMode = - AddressingModeMatcher::Match(V, AccessTy, MemoryInst, - NewAddrModeInsts, *TLI); - - // This check is broken into two cases with very similar code to avoid using - // getNumUses() as much as possible. Some values have a lot of uses, so - // calling getNumUses() unconditionally caused a significant compile-time - // regression. - if (!Consensus) { - Consensus = V; - AddrMode = NewAddrMode; - AddrModeInsts = NewAddrModeInsts; - continue; - } else if (NewAddrMode == AddrMode) { - if (!IsNumUsesConsensusValid) { - NumUsesConsensus = Consensus->getNumUses(); - IsNumUsesConsensusValid = true; - } - - // Ensure that the obtained addressing mode is equivalent to that obtained - // for all other roots of the PHI traversal. Also, when choosing one - // such root as representative, select the one with the most uses in order - // to keep the cost modeling heuristics in AddressingModeMatcher - // applicable. - unsigned NumUses = V->getNumUses(); - if (NumUses > NumUsesConsensus) { - Consensus = V; - NumUsesConsensus = NumUses; - AddrModeInsts = NewAddrModeInsts; - } - continue; - } - - Consensus = 0; - break; - } - - // If the addressing mode couldn't be determined, or if multiple different - // ones were determined, bail out now. - if (!Consensus) return false; - - // Check to see if any of the instructions supersumed by this addr mode are - // non-local to I's BB. - bool AnyNonLocal = false; - for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) { - if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) { - AnyNonLocal = true; - break; - } - } - - // If all the instructions matched are already in this BB, don't do anything. - if (!AnyNonLocal) { - DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n"); - return false; - } - - // Insert this computation right after this user. Since our caller is - // scanning from the top of the BB to the bottom, reuse of the expr are - // guaranteed to happen later. - IRBuilder<> Builder(MemoryInst); - - // Now that we determined the addressing expression we want to use and know - // that we have to sink it into this block. 
Check to see if we have already - // done this for some other load/store instr in this block. If so, reuse the - // computation. - Value *&SunkAddr = SunkAddrs[Addr]; - if (SunkAddr) { - DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " - << *MemoryInst); - if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); - } else { - DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " - << *MemoryInst); - Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); - Value *Result = 0; - - // Start with the base register. Do this first so that subsequent address - // matching finds it last, which will prevent it from trying to match it - // as the scaled value in case it happens to be a mul. That would be - // problematic if we've sunk a different mul for the scale, because then - // we'd end up sinking both muls. - if (AddrMode.BaseReg) { - Value *V = AddrMode.BaseReg; - if (V->getType()->isPointerTy()) - V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); - if (V->getType() != IntPtrTy) - V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); - Result = V; - } - - // Add the scale value. - if (AddrMode.Scale) { - Value *V = AddrMode.ScaledReg; - if (V->getType() == IntPtrTy) { - // done. - } else if (V->getType()->isPointerTy()) { - V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); - } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < - cast<IntegerType>(V->getType())->getBitWidth()) { - V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); - } else { - V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr"); - } - if (AddrMode.Scale != 1) - V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), - "sunkaddr"); - if (Result) - Result = Builder.CreateAdd(Result, V, "sunkaddr"); - else - Result = V; - } - - // Add in the BaseGV if present. - if (AddrMode.BaseGV) { - Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); - if (Result) - Result = Builder.CreateAdd(Result, V, "sunkaddr"); - else - Result = V; - } - - // Add in the Base Offset if present. - if (AddrMode.BaseOffs) { - Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); - if (Result) - Result = Builder.CreateAdd(Result, V, "sunkaddr"); - else - Result = V; - } - - if (Result == 0) - SunkAddr = Constant::getNullValue(Addr->getType()); - else - SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); - } - - MemoryInst->replaceUsesOfWith(Repl, SunkAddr); - - // If we have no uses, recursively delete the value and all dead instructions - // using it. - if (Repl->use_empty()) { - // This can cause recursive deletion, which can invalidate our iterator. - // Use a WeakVH to hold onto it in case this happens. - WeakVH IterHandle(CurInstIterator); - BasicBlock *BB = CurInstIterator->getParent(); - - RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); - - if (IterHandle != CurInstIterator) { - // If the iterator instruction was recursively deleted, start over at the - // start of the block. - CurInstIterator = BB->begin(); - SunkAddrs.clear(); - } - } - ++NumMemoryInsts; - return true; -} - -/// OptimizeInlineAsmInst - If there are any memory operands, use -/// OptimizeMemoryInst to sink their address computing into the block when -/// possible / profitable. 
-bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { - bool MadeChange = false; - - TargetLowering::AsmOperandInfoVector - TargetConstraints = TLI->ParseConstraints(CS); - unsigned ArgNo = 0; - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - - // Compute the constraint code and ConstraintType to use. - TLI->ComputeConstraintToUse(OpInfo, SDValue()); - - if (OpInfo.ConstraintType == TargetLowering::C_Memory && - OpInfo.isIndirect) { - Value *OpVal = CS->getArgOperand(ArgNo++); - MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType()); - } else if (OpInfo.Type == InlineAsm::isInput) - ArgNo++; - } - - return MadeChange; -} - -/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same -/// basic block as the load, unless conditions are unfavorable. This allows -/// SelectionDAG to fold the extend into the load. -/// -bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { - // Look for a load being extended. - LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0)); - if (!LI) return false; - - // If they're already in the same block, there's nothing to do. - if (LI->getParent() == I->getParent()) - return false; - - // If the load has other users and the truncate is not free, this probably - // isn't worthwhile. - if (!LI->hasOneUse() && - TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) || - !TLI->isTypeLegal(TLI->getValueType(I->getType()))) && - !TLI->isTruncateFree(I->getType(), LI->getType())) - return false; - - // Check whether the target supports casts folded into loads. - unsigned LType; - if (isa<ZExtInst>(I)) - LType = ISD::ZEXTLOAD; - else { - assert(isa<SExtInst>(I) && "Unexpected ext type!"); - LType = ISD::SEXTLOAD; - } - if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType()))) - return false; - - // Move the extend into the same block as the load, so that SelectionDAG - // can fold it. - I->removeFromParent(); - I->insertAfter(LI); - ++NumExtsMoved; - return true; -} - -bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { - BasicBlock *DefBB = I->getParent(); - - // If the result of a {s|z}ext and its source are both live out, rewrite all - // other uses of the source with result of extension. - Value *Src = I->getOperand(0); - if (Src->hasOneUse()) - return false; - - // Only do this xform if truncating is free. - if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType())) - return false; - - // Only safe to perform the optimization if the source is also defined in - // this block. - if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent()) - return false; - - bool DefIsLiveOut = false; - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - // Figure out which BB this ext is used in. - BasicBlock *UserBB = User->getParent(); - if (UserBB == DefBB) continue; - DefIsLiveOut = true; - break; - } - if (!DefIsLiveOut) - return false; - - // Make sure none of the uses are PHI nodes. - for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - BasicBlock *UserBB = User->getParent(); - if (UserBB == DefBB) continue; - // Be conservative. We don't want this xform to end up introducing - // reloads just before load / store instructions. 
- if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User)) - return false; - } - - // InsertedTruncs - Only insert one trunc in each block once. - DenseMap<BasicBlock*, Instruction*> InsertedTruncs; - - bool MadeChange = false; - for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end(); - UI != E; ++UI) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI); - - // Figure out which BB this ext is used in. - BasicBlock *UserBB = User->getParent(); - if (UserBB == DefBB) continue; - - // Both src and def are live in this block. Rewrite the use. - Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; - - if (!InsertedTrunc) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); - } - - // Replace a use of the {s|z}ext source with a use of the result. - TheUse = InsertedTrunc; - ++NumExtUses; - MadeChange = true; - } - - return MadeChange; -} - -/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be -/// turned into an explicit branch. -static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { - // FIXME: This should use the same heuristics as IfConversion to determine - // whether a select is better represented as a branch. This requires that - // branch probability metadata is preserved for the select, which is not the - // case currently. - - CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); - - // If the branch is predicted right, an out of order CPU can avoid blocking on - // the compare. Emit cmovs on compares with a memory operand as branches to - // avoid stalls on the load from memory. If the compare has more than one use - // there's probably another cmov or setcc around so it's not worth emitting a - // branch. - if (!Cmp) - return false; - - Value *CmpOp0 = Cmp->getOperand(0); - Value *CmpOp1 = Cmp->getOperand(1); - - // We check that the memory operand has one use to avoid uses of the loaded - // value directly after the compare, making branches unprofitable. - return Cmp->hasOneUse() && - ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || - (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())); -} - - -/// If we have a SelectInst that will likely profit from branch prediction, -/// turn it into a branch. -bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { - bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); - - // Can we convert the 'select' to CF ? - if (DisableSelectToBranch || OptSize || !TLI || VectorCond) - return false; - - TargetLowering::SelectSupportKind SelectKind; - if (VectorCond) - SelectKind = TargetLowering::VectorMaskSelect; - else if (SI->getType()->isVectorTy()) - SelectKind = TargetLowering::ScalarCondVectorVal; - else - SelectKind = TargetLowering::ScalarValSelect; - - // Do we have efficient codegen support for this kind of 'selects' ? - if (TLI->isSelectSupported(SelectKind)) { - // We have efficient codegen support for the select instruction. - // Check if it is profitable to keep this 'select'. - if (!TLI->isPredictableSelectExpensive() || - !isFormingBranchFromSelectProfitable(SI)) - return false; - } - - ModifiedDT = true; - - // First, we split the block containing the select into 2 blocks. - BasicBlock *StartBlock = SI->getParent(); - BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); - BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); - - // Create a new block serving as the landing pad for the branch. 
- BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid", - NextBlock->getParent(), NextBlock); - - // Move the unconditional branch from the block with the select in it into our - // landing pad block. - StartBlock->getTerminator()->eraseFromParent(); - BranchInst::Create(NextBlock, SmallBlock); - - // Insert the real conditional branch based on the original condition. - BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI); - - // The select itself is replaced with a PHI Node. - PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin()); - PN->takeName(SI); - PN->addIncoming(SI->getTrueValue(), StartBlock); - PN->addIncoming(SI->getFalseValue(), SmallBlock); - SI->replaceAllUsesWith(PN); - SI->eraseFromParent(); - - // Instruct OptimizeBlock to skip to the next block. - CurInstIterator = StartBlock->end(); - ++NumSelectsExpanded; - return true; -} - -bool CodeGenPrepare::OptimizeInst(Instruction *I) { - if (PHINode *P = dyn_cast<PHINode>(I)) { - // It is possible for very late stage optimizations (such as SimplifyCFG) - // to introduce PHI nodes too late to be cleaned up. If we detect such a - // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0, - TLInfo, DT)) { - P->replaceAllUsesWith(V); - P->eraseFromParent(); - ++NumPHIsElim; - return true; - } - return false; - } - - if (CastInst *CI = dyn_cast<CastInst>(I)) { - // If the source of the cast is a constant, then this should have - // already been constant folded. The only reason NOT to constant fold - // it is if something (e.g. LSR) was careful to place the constant - // evaluation in a block other than then one that uses it (e.g. to hoist - // the address of globals out of a loop). If this is the case, we don't - // want to forward-subst the cast. - if (isa<Constant>(CI->getOperand(0))) - return false; - - if (TLI && OptimizeNoopCopyExpression(CI, *TLI)) - return true; - - if (isa<ZExtInst>(I) || isa<SExtInst>(I)) { - bool MadeChange = MoveExtToFormExtLoad(I); - return MadeChange | OptimizeExtUses(I); - } - return false; - } - - if (CmpInst *CI = dyn_cast<CmpInst>(I)) - return OptimizeCmpExpression(CI); - - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - if (TLI) - return OptimizeMemoryInst(I, I->getOperand(0), LI->getType()); - return false; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (TLI) - return OptimizeMemoryInst(I, SI->getOperand(1), - SI->getOperand(0)->getType()); - return false; - } - - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { - if (GEPI->hasAllZeroIndices()) { - /// The GEP operand must be a pointer, so must its result -> BitCast - Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), - GEPI->getName(), GEPI); - GEPI->replaceAllUsesWith(NC); - GEPI->eraseFromParent(); - ++NumGEPsElim; - OptimizeInst(NC); - return true; - } - return false; - } - - if (CallInst *CI = dyn_cast<CallInst>(I)) - return OptimizeCallInst(CI); - - if (SelectInst *SI = dyn_cast<SelectInst>(I)) - return OptimizeSelectInst(SI); - - return false; -} - -// In this pass we look for GEP and cast instructions that are used -// across basic blocks and rewrite them to improve basic-block-at-a-time -// selection. 
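// Illustrative sketch (editorial, not from the LLVM sources): at the source
// level, the select expansion performed by OptimizeSelectInst above is the
// familiar rewrite of a conditional move into explicit control flow. The
// merge variable below plays the role of the PHI inserted in "select.end",
// and the else branch corresponds to the "select.mid" landing-pad block.
static int selectAsBranch(bool Cond, int TrueVal, int FalseVal) {
  int Merged;          // becomes the PHI node in "select.end"
  if (Cond)
    Merged = TrueVal;  // incoming value from the original (start) block
  else
    Merged = FalseVal; // incoming value from "select.mid"
  return Merged;
}
// This rewrite only pays off when the branch is predictable or the compare
// feeds from memory, which is what isFormingBranchFromSelectProfitable
// above is checking.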
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { - SunkAddrs.clear(); - bool MadeChange = false; - - CurInstIterator = BB.begin(); - while (CurInstIterator != BB.end()) - MadeChange |= OptimizeInst(CurInstIterator++); - - MadeChange |= DupRetToEnableTailCallOpts(&BB); - - return MadeChange; -} - -// llvm.dbg.value is far away from the value then iSel may not be able -// handle it properly. iSel will drop llvm.dbg.value if it can not -// find a node corresponding to the value. -bool CodeGenPrepare::PlaceDbgValues(Function &F) { - bool MadeChange = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - Instruction *PrevNonDbgInst = NULL; - for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { - Instruction *Insn = BI; ++BI; - DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); - if (!DVI) { - PrevNonDbgInst = Insn; - continue; - } - - Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); - if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { - DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); - DVI->removeFromParent(); - if (isa<PHINode>(VI)) - DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); - else - DVI->insertAfter(VI); - MadeChange = true; - ++NumDbgValueMoved; - } - } - } - return MadeChange; -} diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp new file mode 100644 index 000000000000..763d02b9fcd6 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -0,0 +1,602 @@ +//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies expensive constants to hoist and coalesces them to +// better prepare it for SelectionDAG-based code generation. This works around +// the limitations of the basic-block-at-a-time approach. +// +// First it scans all instructions for integer constants and calculates its +// cost. If the constant can be folded into the instruction (the cost is +// TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't +// consider it expensive and leave it alone. This is the default behavior and +// the default implementation of getIntImmCost will always return TCC_Free. +// +// If the cost is more than TCC_BASIC, then the integer constant can't be folded +// into the instruction and it might be beneficial to hoist the constant. +// Similar constants are coalesced to reduce register pressure and +// materialization code. +// +// When a constant is hoisted, it is also hidden behind a bitcast to force it to +// be live-out of the basic block. Otherwise the constant would be just +// duplicated and each basic block would have its own copy in the SelectionDAG. +// The SelectionDAG recognizes such constants as opaque and doesn't perform +// certain transformations on them, which would create a new expensive constant. +// +// This optimization is only applied to integer constants in instructions and +// simple (this means not nested) constant cast expressions. 
For example: +// %0 = load i64* inttoptr (i64 big_constant to i64*) +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include <tuple> + +using namespace llvm; + +#define DEBUG_TYPE "consthoist" + +STATISTIC(NumConstantsHoisted, "Number of constants hoisted"); +STATISTIC(NumConstantsRebased, "Number of constants rebased"); + +namespace { +struct ConstantUser; +struct RebasedConstantInfo; + +typedef SmallVector<ConstantUser, 8> ConstantUseListType; +typedef SmallVector<RebasedConstantInfo, 4> RebasedConstantListType; + +/// \brief Keeps track of the user of a constant and the operand index where the +/// constant is used. +struct ConstantUser { + Instruction *Inst; + unsigned OpndIdx; + + ConstantUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) { } +}; + +/// \brief Keeps track of a constant candidate and its uses. +struct ConstantCandidate { + ConstantUseListType Uses; + ConstantInt *ConstInt; + unsigned CumulativeCost; + + ConstantCandidate(ConstantInt *ConstInt) + : ConstInt(ConstInt), CumulativeCost(0) { } + + /// \brief Add the user to the use list and update the cost. + void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) { + CumulativeCost += Cost; + Uses.push_back(ConstantUser(Inst, Idx)); + } +}; + +/// \brief This represents a constant that has been rebased with respect to a +/// base constant. The difference to the base constant is recorded in Offset. +struct RebasedConstantInfo { + ConstantUseListType Uses; + Constant *Offset; + + RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset) + : Uses(Uses), Offset(Offset) { } +}; + +/// \brief A base constant and all its rebased constants. +struct ConstantInfo { + ConstantInt *BaseConstant; + RebasedConstantListType RebasedConstants; +}; + +/// \brief The constant hoisting pass. +class ConstantHoisting : public FunctionPass { + typedef DenseMap<ConstantInt *, unsigned> ConstCandMapType; + typedef std::vector<ConstantCandidate> ConstCandVecType; + + const TargetTransformInfo *TTI; + DominatorTree *DT; + BasicBlock *Entry; + + /// Keeps track of constant candidates found in the function. + ConstCandVecType ConstCandVec; + + /// Keep track of cast instructions we already cloned. + SmallDenseMap<Instruction *, Instruction *> ClonedCastMap; + + /// These are the final constants we decided to hoist. + SmallVector<ConstantInfo, 8> ConstantVec; +public: + static char ID; // Pass identification, replacement for typeid + ConstantHoisting() : FunctionPass(ID), TTI(nullptr), DT(nullptr), + Entry(nullptr) { + initializeConstantHoistingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + const char *getPassName() const override { return "Constant Hoisting"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfo>(); + } + +private: + /// \brief Initialize the pass. + void setup(Function &Fn) { + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + TTI = &getAnalysis<TargetTransformInfo>(); + Entry = &Fn.getEntryBlock(); + } + + /// \brief Cleanup. 
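// Illustrative sketch (editorial, not from the LLVM sources): the rebasing
// described in the file header above, modelled on plain integers. Two
// expensive 64-bit immediates that differ by a small amount are expressed
// as one hoisted base constant plus a cheap add-immediate; the constants
// here are arbitrary examples.
#include <cstdint>

std::uint64_t useRebasedConstants(std::uint64_t X) {
  const std::uint64_t Base = 0x1234567890ABCDEFull; // hoisted base constant
  std::uint64_t A = X ^ Base;
  // 0x1234567890ABCDFF is not materialized separately; it is rebased as
  // Base + 0x10.
  std::uint64_t B = X & (Base + 0x10);
  return A + B;
}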
+ void cleanup() { + ConstantVec.clear(); + ClonedCastMap.clear(); + ConstCandVec.clear(); + + TTI = nullptr; + DT = nullptr; + Entry = nullptr; + } + + Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const; + Instruction *findConstantInsertionPoint(const ConstantInfo &ConstInfo) const; + void collectConstantCandidates(ConstCandMapType &ConstCandMap, + Instruction *Inst, unsigned Idx, + ConstantInt *ConstInt); + void collectConstantCandidates(ConstCandMapType &ConstCandMap, + Instruction *Inst); + void collectConstantCandidates(Function &Fn); + void findAndMakeBaseConstant(ConstCandVecType::iterator S, + ConstCandVecType::iterator E); + void findBaseConstants(); + void emitBaseConstants(Instruction *Base, Constant *Offset, + const ConstantUser &ConstUser); + bool emitBaseConstants(); + void deleteDeadCastInst() const; + bool optimizeConstants(Function &Fn); +}; +} + +char ConstantHoisting::ID = 0; +INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting", + false, false) + +FunctionPass *llvm::createConstantHoistingPass() { + return new ConstantHoisting(); +} + +/// \brief Perform the constant hoisting optimization for the given function. +bool ConstantHoisting::runOnFunction(Function &Fn) { + DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); + DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); + + setup(Fn); + + bool MadeChange = optimizeConstants(Fn); + + if (MadeChange) { + DEBUG(dbgs() << "********** Function after Constant Hoisting: " + << Fn.getName() << '\n'); + DEBUG(dbgs() << Fn); + } + DEBUG(dbgs() << "********** End Constant Hoisting **********\n"); + + cleanup(); + + return MadeChange; +} + + +/// \brief Find the constant materialization insertion point. +Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, + unsigned Idx) const { + // If the operand is a cast instruction, then we have to materialize the + // constant before the cast instruction. + if (Idx != ~0U) { + Value *Opnd = Inst->getOperand(Idx); + if (auto CastInst = dyn_cast<Instruction>(Opnd)) + if (CastInst->isCast()) + return CastInst; + } + + // The simple and common case. This also includes constant expressions. + if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst)) + return Inst; + + // We can't insert directly before a phi node or landing pad. Insert before + // the terminator of the incoming or dominating block. + assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!"); + if (Idx != ~0U && isa<PHINode>(Inst)) + return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator(); + + BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock(); + return IDom->getTerminator(); +} + +/// \brief Find an insertion point that dominates all uses. +Instruction *ConstantHoisting:: +findConstantInsertionPoint(const ConstantInfo &ConstInfo) const { + assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); + // Collect all basic blocks. 
+ SmallPtrSet<BasicBlock *, 8> BBs; + for (auto const &RCI : ConstInfo.RebasedConstants) + for (auto const &U : RCI.Uses) + BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); + + if (BBs.count(Entry)) + return &Entry->front(); + + while (BBs.size() >= 2) { + BasicBlock *BB, *BB1, *BB2; + BB1 = *BBs.begin(); + BB2 = *std::next(BBs.begin()); + BB = DT->findNearestCommonDominator(BB1, BB2); + if (BB == Entry) + return &Entry->front(); + BBs.erase(BB1); + BBs.erase(BB2); + BBs.insert(BB); + } + assert((BBs.size() == 1) && "Expected only one element."); + Instruction &FirstInst = (*BBs.begin())->front(); + return findMatInsertPt(&FirstInst); +} + + +/// \brief Record constant integer ConstInt for instruction Inst at operand +/// index Idx. +/// +/// The operand at index Idx is not necessarily the constant integer itself. It +/// could also be a cast instruction or a constant expression that uses the +// constant integer. +void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, + Instruction *Inst, + unsigned Idx, + ConstantInt *ConstInt) { + unsigned Cost; + // Ask the target about the cost of materializing the constant for the given + // instruction and operand index. + if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst)) + Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx, + ConstInt->getValue(), ConstInt->getType()); + else + Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(), + ConstInt->getType()); + + // Ignore cheap integer constants. + if (Cost > TargetTransformInfo::TCC_Basic) { + ConstCandMapType::iterator Itr; + bool Inserted; + std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(ConstInt, 0)); + if (Inserted) { + ConstCandVec.push_back(ConstantCandidate(ConstInt)); + Itr->second = ConstCandVec.size() - 1; + } + ConstCandVec[Itr->second].addUser(Inst, Idx, Cost); + DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) + dbgs() << "Collect constant " << *ConstInt << " from " << *Inst + << " with cost " << Cost << '\n'; + else + dbgs() << "Collect constant " << *ConstInt << " indirectly from " + << *Inst << " via " << *Inst->getOperand(Idx) << " with cost " + << Cost << '\n'; + ); + } +} + +/// \brief Scan the instruction for expensive integer constants and record them +/// in the constant candidate vector. +void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, + Instruction *Inst) { + // Skip all cast instructions. They are visited indirectly later on. + if (Inst->isCast()) + return; + + // Can't handle inline asm. Skip it. + if (auto Call = dyn_cast<CallInst>(Inst)) + if (isa<InlineAsm>(Call->getCalledValue())) + return; + + // Scan all operands. + for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { + Value *Opnd = Inst->getOperand(Idx); + + // Visit constant integers. + if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) { + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + continue; + } + + // Visit cast instructions that have constant integers. + if (auto CastInst = dyn_cast<Instruction>(Opnd)) { + // Only visit cast instructions, which have been skipped. All other + // instructions should have already been visited. + if (!CastInst->isCast()) + continue; + + if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the cast instruction. 
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + continue; + } + } + + // Visit constant expressions that have constant integers. + if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { + // Only visit constant cast expressions. + if (!ConstExpr->isCast()) + continue; + + if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the constant expression. + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + continue; + } + } + } // end of for all operands +} + +/// \brief Collect all integer constants in the function that cannot be folded +/// into an instruction itself. +void ConstantHoisting::collectConstantCandidates(Function &Fn) { + ConstCandMapType ConstCandMap; + for (Function::iterator BB : Fn) + for (BasicBlock::iterator Inst : *BB) + collectConstantCandidates(ConstCandMap, Inst); +} + +/// \brief Find the base constant within the given range and rebase all other +/// constants with respect to the base constant. +void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S, + ConstCandVecType::iterator E) { + auto MaxCostItr = S; + unsigned NumUses = 0; + // Use the constant that has the maximum cost as base constant. + for (auto ConstCand = S; ConstCand != E; ++ConstCand) { + NumUses += ConstCand->Uses.size(); + if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) + MaxCostItr = ConstCand; + } + + // Don't hoist constants that have only one use. + if (NumUses <= 1) + return; + + ConstantInfo ConstInfo; + ConstInfo.BaseConstant = MaxCostItr->ConstInt; + Type *Ty = ConstInfo.BaseConstant->getType(); + + // Rebase the constants with respect to the base constant. + for (auto ConstCand = S; ConstCand != E; ++ConstCand) { + APInt Diff = ConstCand->ConstInt->getValue() - + ConstInfo.BaseConstant->getValue(); + Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff); + ConstInfo.RebasedConstants.push_back( + RebasedConstantInfo(std::move(ConstCand->Uses), Offset)); + } + ConstantVec.push_back(ConstInfo); +} + +/// \brief Finds and combines constant candidates that can be easily +/// rematerialized with an add from a common base constant. +void ConstantHoisting::findBaseConstants() { + // Sort the constants by value and type. This invalidates the mapping! + std::sort(ConstCandVec.begin(), ConstCandVec.end(), + [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) { + if (LHS.ConstInt->getType() != RHS.ConstInt->getType()) + return LHS.ConstInt->getType()->getBitWidth() < + RHS.ConstInt->getType()->getBitWidth(); + return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue()); + }); + + // Simple linear scan through the sorted constant candidate vector for viable + // merge candidates. + auto MinValItr = ConstCandVec.begin(); + for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end(); + CC != E; ++CC) { + if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) { + // Check if the constant is in range of an add with immediate. + APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue(); + if ((Diff.getBitWidth() <= 64) && + TTI->isLegalAddImmediate(Diff.getSExtValue())) + continue; + } + // We either have now a different constant type or the constant is not in + // range of an add with immediate anymore. + findAndMakeBaseConstant(MinValItr, CC); + // Start a new base constant search. + MinValItr = CC; + } + // Finalize the last base constant search. 
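// Illustrative sketch (editorial, not from the LLVM sources): the linear
// grouping scan in findBaseConstants above, modelled on plain integers.
// After sorting, a new group (base constant) is started whenever the
// distance to the group's minimum no longer fits the target's add-immediate
// range; the 12-bit range below is a hypothetical stand-in for
// TTI->isLegalAddImmediate(), and the type/bit-width checks are omitted.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

static bool fitsAddImmediate(std::int64_t Diff) {
  return Diff >= -2048 && Diff < 2048; // assumed legal immediate range
}

std::vector<std::vector<std::uint64_t>>
groupRebasableConstants(std::vector<std::uint64_t> Consts) {
  std::vector<std::vector<std::uint64_t>> Groups;
  if (Consts.empty())
    return Groups;
  std::sort(Consts.begin(), Consts.end());
  std::uint64_t Min = Consts.front();
  Groups.push_back({Min});
  for (std::size_t I = 1, E = Consts.size(); I != E; ++I) {
    std::int64_t Diff = static_cast<std::int64_t>(Consts[I] - Min);
    if (fitsAddImmediate(Diff)) {
      Groups.back().push_back(Consts[I]); // rebased as Min + Diff
    } else {
      Min = Consts[I];                    // start a new base constant search
      Groups.push_back({Min});
    }
  }
  return Groups;
}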
+ findAndMakeBaseConstant(MinValItr, ConstCandVec.end()); +} + +/// \brief Updates the operand at Idx in instruction Inst with the result of +/// instruction Mat. If the instruction is a PHI node then special +/// handling for duplicate values form the same incomming basic block is +/// required. +/// \return The update will always succeed, but the return value indicated if +/// Mat was used for the update or not. +static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) { + if (auto PHI = dyn_cast<PHINode>(Inst)) { + // Check if any previous operand of the PHI node has the same incoming basic + // block. This is a very odd case that happens when the incoming basic block + // has a switch statement. In this case use the same value as the previous + // operand(s), otherwise we will fail verification due to different values. + // The values are actually the same, but the variable names are different + // and the verifier doesn't like that. + BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx); + for (unsigned i = 0; i < Idx; ++i) { + if (PHI->getIncomingBlock(i) == IncomingBB) { + Value *IncomingVal = PHI->getIncomingValue(i); + Inst->setOperand(Idx, IncomingVal); + return false; + } + } + } + + Inst->setOperand(Idx, Mat); + return true; +} + +/// \brief Emit materialization code for all rebased constants and update their +/// users. +void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset, + const ConstantUser &ConstUser) { + Instruction *Mat = Base; + if (Offset) { + Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst, + ConstUser.OpndIdx); + Mat = BinaryOperator::Create(Instruction::Add, Base, Offset, + "const_mat", InsertionPt); + + DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0) + << " + " << *Offset << ") in BB " + << Mat->getParent()->getName() << '\n' << *Mat << '\n'); + Mat->setDebugLoc(ConstUser.Inst->getDebugLoc()); + } + Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx); + + // Visit constant integer. + if (isa<ConstantInt>(Opnd)) { + DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n'); + if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset) + Mat->eraseFromParent(); + DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n'); + return; + } + + // Visit cast instruction. + if (auto CastInst = dyn_cast<Instruction>(Opnd)) { + assert(CastInst->isCast() && "Expected an cast instruction!"); + // Check if we already have visited this cast instruction before to avoid + // unnecessary cloning. + Instruction *&ClonedCastInst = ClonedCastMap[CastInst]; + if (!ClonedCastInst) { + ClonedCastInst = CastInst->clone(); + ClonedCastInst->setOperand(0, Mat); + ClonedCastInst->insertAfter(CastInst); + // Use the same debug location as the original cast instruction. + ClonedCastInst->setDebugLoc(CastInst->getDebugLoc()); + DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n' + << "To : " << *ClonedCastInst << '\n'); + } + + DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n'); + updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst); + DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n'); + return; + } + + // Visit constant expression. + if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { + Instruction *ConstExprInst = ConstExpr->getAsInstruction(); + ConstExprInst->setOperand(0, Mat); + ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst, + ConstUser.OpndIdx)); + + // Use the same debug location as the instruction we are about to update. 
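// Illustrative sketch (editorial, not from the LLVM sources): the PHI
// special case in updateOperand above, modelled with parallel vectors. When
// several PHI operands share the same incoming block (as can happen with a
// switch), a later operand must reuse the value already chosen for that
// block, otherwise the verifier rejects the PHI.
#include <cstddef>
#include <vector>

struct PhiModel {
  std::vector<int> IncomingBlock;  // block id per operand
  std::vector<long> IncomingValue; // value per operand
};

// Returns true only if NewVal was actually installed at operand Idx.
bool updatePhiOperand(PhiModel &Phi, std::size_t Idx, long NewVal) {
  for (std::size_t I = 0; I != Idx; ++I)
    if (Phi.IncomingBlock[I] == Phi.IncomingBlock[Idx]) {
      Phi.IncomingValue[Idx] = Phi.IncomingValue[I]; // reuse earlier value
      return false;
    }
  Phi.IncomingValue[Idx] = NewVal;
  return true;
}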
+ ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc()); + + DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n' + << "From : " << *ConstExpr << '\n'); + DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n'); + if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) { + ConstExprInst->eraseFromParent(); + if (Offset) + Mat->eraseFromParent(); + } + DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n'); + return; + } +} + +/// \brief Hoist and hide the base constant behind a bitcast and emit +/// materialization code for derived constants. +bool ConstantHoisting::emitBaseConstants() { + bool MadeChange = false; + for (auto const &ConstInfo : ConstantVec) { + // Hoist and hide the base constant behind a bitcast. + Instruction *IP = findConstantInsertionPoint(ConstInfo); + IntegerType *Ty = ConstInfo.BaseConstant->getType(); + Instruction *Base = + new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); + DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB " + << IP->getParent()->getName() << '\n' << *Base << '\n'); + NumConstantsHoisted++; + + // Emit materialization code for all rebased constants. + for (auto const &RCI : ConstInfo.RebasedConstants) { + NumConstantsRebased++; + for (auto const &U : RCI.Uses) + emitBaseConstants(Base, RCI.Offset, U); + } + + // Use the same debug location as the last user of the constant. + assert(!Base->use_empty() && "The use list is empty!?"); + assert(isa<Instruction>(Base->user_back()) && + "All uses should be instructions."); + Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc()); + + // Correct for base constant, which we counted above too. + NumConstantsRebased--; + MadeChange = true; + } + return MadeChange; +} + +/// \brief Check all cast instructions we made a copy of and remove them if they +/// have no more users. +void ConstantHoisting::deleteDeadCastInst() const { + for (auto const &I : ClonedCastMap) + if (I.first->use_empty()) + I.first->eraseFromParent(); +} + +/// \brief Optimize expensive integer constants in the given function. +bool ConstantHoisting::optimizeConstants(Function &Fn) { + // Collect all constant candidates. + collectConstantCandidates(Fn); + + // There are no constant candidates to worry about. + if (ConstCandVec.empty()) + return false; + + // Combine constants that can be easily materialized with an add from a common + // base constant. + findBaseConstants(); + + // There are no constants to emit. + if (ConstantVec.empty()) + return false; + + // Finally hoist the base constant and emit materialization code for dependent + // constants. + bool MadeChange = emitBaseConstants(); + + // Cleanup dead instructions. 
+ deleteDeadCastInst(); + + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp index d5a96eceb993..dd51ce1bc28c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -18,19 +18,20 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "constprop" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Support/InstIterator.h" #include "llvm/Target/TargetLibraryInfo.h" #include <set> using namespace llvm; +#define DEBUG_TYPE "constprop" + STATISTIC(NumInstKilled, "Number of instructions killed"); namespace { @@ -40,9 +41,9 @@ namespace { initializeConstantPropagationPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfo>(); } @@ -67,7 +68,8 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; - DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); while (!WorkList.empty()) { @@ -75,12 +77,11 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.erase(WorkList.begin()); // Get an element from the worklist... if (!I->use_empty()) // Don't muck with dead instructions... - if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) { + if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { // Add all of the users of this instruction to the worklist, they might // be constant propagatable now... - for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) - WorkList.insert(cast<Instruction>(*UI)); + for (User *U : I->users()) + WorkList.insert(cast<Instruction>(U)); // Replace all of the uses of a variable with uses of the constant. 
I->replaceAllUsesWith(C); diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 995782e1bc6b..082946229b35 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -11,21 +11,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "correlated-value-propagation" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "correlated-value-propagation" + STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); @@ -48,9 +49,9 @@ namespace { initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); } }; @@ -138,7 +139,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { } bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { - Value *Pointer = 0; + Value *Pointer = nullptr; if (LoadInst *L = dyn_cast<LoadInst>(I)) Pointer = L->getPointerOperand(); else @@ -281,6 +282,9 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { } bool CorrelatedValuePropagation::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + LVI = &getAnalysis<LazyValueInfo>(); bool FnChanged = false; diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index e8a090af40c3..99fac751df8e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -16,16 +16,17 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Support/InstIterator.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "dce" + STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); STATISTIC(DCEEliminated, "Number of insts removed"); @@ -38,7 +39,9 @@ namespace { DeadInstElimination() : BasicBlockPass(ID) { initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnBasicBlock(BasicBlock &BB) { + bool runOnBasicBlock(BasicBlock &BB) override { + if (skipOptnoneFunction(BB)) + return false; TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { @@ -52,7 +55,7 @@ namespace { return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } }; @@ -77,9 
+80,9 @@ namespace { initializeDCEPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } }; @@ -89,6 +92,9 @@ char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) bool DCE::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); // Start out with all of the instructions in the worklist... diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 57432c7d71d8..3af8ee7546fb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -15,19 +15,18 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dse" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" @@ -38,6 +37,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "dse" + STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -49,14 +50,17 @@ namespace { const TargetLibraryInfo *TLI; static char ID; // Pass identification, replacement for typeid - DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { + DSE() : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr) { initializeDSEPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = AA->getTargetLibraryInfo(); bool Changed = false; @@ -66,7 +70,7 @@ namespace { if (DT->isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - AA = 0; MD = 0; DT = 0; + AA = nullptr; MD = nullptr; DT = nullptr; return Changed; } @@ -76,13 +80,13 @@ namespace { void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, SmallSetVector<Value*, 16> &DeadStackObjects); - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<AliasAnalysis>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } }; @@ -90,7 +94,7 @@ namespace { char DSE::ID = 0; INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) @@ -108,9 +112,9 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// If ValueSet is non-null, remove any deleted instructions from it as well. /// static void DeleteDeadInstruction(Instruction *I, - MemoryDependenceAnalysis &MD, - const TargetLibraryInfo *TLI, - SmallSetVector<Value*, 16> *ValueSet = 0) { + MemoryDependenceAnalysis &MD, + const TargetLibraryInfo *TLI, + SmallSetVector<Value*, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -128,7 +132,7 @@ static void DeleteDeadInstruction(Instruction *I, for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, 0); + DeadInst->setOperand(op, nullptr); // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; @@ -190,6 +194,7 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { /// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { + const DataLayout *DL = AA.getDataLayout(); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); @@ -199,13 +204,13 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0) + if (Loc.Size == AliasAnalysis::UnknownSize && DL == nullptr) return AliasAnalysis::Location(); return Loc; } IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); - if (II == 0) return AliasAnalysis::Location(); + if (!II) return AliasAnalysis::Location(); switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. @@ -213,7 +218,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. - if (AA.getDataLayout() == 0) return AliasAnalysis::Location(); + if (!DL) return AliasAnalysis::Location(); // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. @@ -341,6 +346,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, AliasAnalysis &AA, int64_t &EarlierOff, int64_t &LaterOff) { + const DataLayout *DL = AA.getDataLayout(); const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -354,8 +360,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // If we have no DataLayout information around, then the size of the store // is inferrable from the pointee type. If they are the same type, then // we know that the store is safe. 
- if (AA.getDataLayout() == 0 && - Later.Ptr->getType() == Earlier.Ptr->getType()) + if (DL == nullptr && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; return OverwriteUnknown; @@ -369,17 +374,14 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || - Earlier.Size == AliasAnalysis::UnknownSize || - AA.getDataLayout() == 0) + Earlier.Size == AliasAnalysis::UnknownSize || DL == nullptr) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, - // an alloca, or a byval argument). If so, then it clearly overwrites any - // other store to the same object. - const DataLayout *TD = AA.getDataLayout(); - - const Value *UO1 = GetUnderlyingObject(P1, TD), - *UO2 = GetUnderlyingObject(P2, TD); + // an alloca, or a byval/inalloca argument). If so, then it clearly + // overwrites any other store to the same object. + const Value *UO1 = GetUnderlyingObject(P1, DL), + *UO2 = GetUnderlyingObject(P2, DL); // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. @@ -397,8 +399,8 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // pointers are equal, then we can reason about the two stores. EarlierOff = 0; LaterOff = 0; - const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); - const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); + const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL); + const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL); // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) @@ -460,7 +462,7 @@ static bool isPossibleSelfRead(Instruction *Inst, // Self reads can only happen for instructions that read memory. Get the // location read. AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); - if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. + if (!InstReadLoc.Ptr) return false; // Not a reading instruction. // If the read and written loc obviously don't alias, it isn't a read. if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; @@ -527,7 +529,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { DeleteDeadInstruction(SI, *MD, TLI); - if (NextInst == 0) // Next instruction deleted. + if (!NextInst) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. --BBI; @@ -542,7 +544,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. - if (Loc.Ptr == 0) + if (!Loc.Ptr) continue; while (InstDep.isDef() || InstDep.isClobber()) { @@ -556,7 +558,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { Instruction *DepWrite = InstDep.getInst(); AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); // If we didn't get a useful location, or if it isn't a size, bail out. 
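// Illustrative sketch (editorial, not from the LLVM sources): once
// isOverwrite above has reduced both pointers to a common base plus the
// constant offsets EarlierOff/LaterOff, deciding that the later store makes
// the earlier one dead is, roughly, an interval-containment test over the
// written byte ranges.
#include <cstdint>

bool laterCompletelyOverwrites(std::int64_t EarlierOff, std::int64_t EarlierSize,
                               std::int64_t LaterOff, std::int64_t LaterSize) {
  // Every byte written by the earlier store must be rewritten by the later
  // store to the same underlying object.
  return LaterOff <= EarlierOff &&
         EarlierOff + EarlierSize <= LaterOff + LaterSize;
}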
- if (DepLoc.Ptr == 0) + if (!DepLoc.Ptr) break; // If we find a write that is a) removable (i.e., non-volatile), b) is @@ -679,7 +681,7 @@ bool DSE::HandleFree(CallInst *F) { if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - Instruction *Next = llvm::next(BasicBlock::iterator(Dependency)); + Instruction *Next = std::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD, TLI); @@ -701,22 +703,6 @@ bool DSE::HandleFree(CallInst *F) { return MadeChange; } -namespace { - struct CouldRef { - typedef Value *argument_type; - const CallSite CS; - AliasAnalysis *AA; - - bool operator()(Value *I) { - // See if the call site touches the value. - AliasAnalysis::ModRefResult A = - AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); - - return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; - } - }; -} - /// handleEndBlock - Remove dead stores to stack-allocated locations in the /// function end block. Ex: /// %A = alloca i32 @@ -742,11 +728,11 @@ bool DSE::handleEndBlock(BasicBlock &BB) { DeadStackObjects.insert(I); } - // Treat byval arguments the same, stores to them are dead at the end of the - // function. + // Treat byval or inalloca arguments the same, stores to them are dead at the + // end of the function. for (Function::arg_iterator AI = BB.getParent()->arg_begin(), AE = BB.getParent()->arg_end(); AI != AE; ++AI) - if (AI->hasByValAttr()) + if (AI->hasByValOrInAllocaAttr()) DeadStackObjects.insert(AI); // Scan the basic block backwards @@ -776,7 +762,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), E = Pointers.end(); I != E; ++I) { dbgs() << **I; - if (llvm::next(I) != E) + if (std::next(I) != E) dbgs() << ", "; } dbgs() << '\n'); @@ -818,8 +804,13 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. - CouldRef Pred = { CS, AA }; - DeadStackObjects.remove_if(Pred); + DeadStackObjects.remove_if([&](Value *I) { + // See if the call site touches the value. + AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); + + return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + }); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. @@ -862,20 +853,6 @@ bool DSE::handleEndBlock(BasicBlock &BB) { return MadeChange; } -namespace { - struct CouldAlias { - typedef Value *argument_type; - const AliasAnalysis::Location &LoadedLoc; - AliasAnalysis *AA; - - bool operator()(Value *I) { - // See if the loaded location could alias the stack location. - AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); - return !AA->isNoAlias(StackLoc, LoadedLoc); - } - }; -} - /// RemoveAccessedObjects - Check to see if the specified location may alias any /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. @@ -895,6 +872,9 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, } // Remove objects that could alias LoadedLoc. - CouldAlias Pred = { LoadedLoc, AA }; - DeadStackObjects.remove_if(Pred); + DeadStackObjects.remove_if([&](Value *I) { + // See if the loaded location could alias the stack location. 
+ AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); + return !AA->isNoAlias(StackLoc, LoadedLoc); + }); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 5266894bc34c..735f5c194cb5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -12,23 +12,24 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "early-cse" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" -#include <deque> +#include <vector> using namespace llvm; +#define DEBUG_TYPE "early-cse" + STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd"); STATISTIC(NumCSE, "Number of instructions CSE'd"); STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); @@ -207,7 +208,7 @@ namespace { return false; CallInst *CI = dyn_cast<CallInst>(Inst); - if (CI == 0 || !CI->onlyReadsMemory()) + if (!CI || !CI->onlyReadsMemory()) return false; return true; } @@ -262,7 +263,7 @@ namespace { /// cases. class EarlyCSE : public FunctionPass { public: - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; DominatorTree *DT; typedef RecyclingAllocator<BumpPtrAllocator, @@ -303,7 +304,7 @@ public: initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; private: @@ -376,8 +377,8 @@ private: bool processNode(DomTreeNode *Node); // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); AU.setPreservesCFG(); } @@ -392,7 +393,7 @@ FunctionPass *llvm::createEarlyCSEPass() { } INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) @@ -405,14 +406,14 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // have invalidated the live-out memory values of our parent value. For now, // just be conservative and invalidate memory if this block has multiple // predecessors. - if (BB->getSinglePredecessor() == 0) + if (!BB->getSinglePredecessor()) ++CurrentGeneration; /// LastStore - Keep track of the last non-volatile store that we saw... for /// as long as there in no instruction that reads memory. If we see a store /// to the same location, we delete the dead store. This zaps trivial dead /// stores which can occur in bitfield code among other things. - StoreInst *LastStore = 0; + StoreInst *LastStore = nullptr; bool Changed = false; @@ -432,7 +433,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. 
- if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) { + if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -462,7 +463,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Ignore volatile loads. if (!LI->isSimple()) { - LastStore = 0; + LastStore = nullptr; continue; } @@ -470,7 +471,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // generation, replace this instruction. std::pair<Value*, unsigned> InVal = AvailableLoads->lookup(Inst->getOperand(0)); - if (InVal.first != 0 && InVal.second == CurrentGeneration) { + if (InVal.first != nullptr && InVal.second == CurrentGeneration) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " << *InVal.first << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); @@ -483,20 +484,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. AvailableLoads->insert(Inst->getOperand(0), std::pair<Value*, unsigned>(Inst, CurrentGeneration)); - LastStore = 0; + LastStore = nullptr; continue; } // If this instruction may read from memory, forget LastStore. if (Inst->mayReadFromMemory()) - LastStore = 0; + LastStore = nullptr; // If this is a read-only call, process it. if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right // generation, replace this instruction. std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); - if (InVal.first != 0 && InVal.second == CurrentGeneration) { + if (InVal.first != nullptr && InVal.second == CurrentGeneration) { DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " << *InVal.first << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); @@ -528,7 +529,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LastStore->eraseFromParent(); Changed = true; ++NumDSE; - LastStore = 0; + LastStore = nullptr; continue; } @@ -552,11 +553,15 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { bool EarlyCSE::runOnFunction(Function &F) { - std::deque<StackNode *> nodesToProcess; + if (skipOptnoneFunction(F)) + return false; + + std::vector<StackNode *> nodesToProcess; - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); // Tables that the pass uses when walking the domtree. ScopedHTType AVTable; @@ -570,7 +575,7 @@ bool EarlyCSE::runOnFunction(Function &F) { bool Changed = false; // Process the root node. - nodesToProcess.push_front( + nodesToProcess.push_back( new StackNode(AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration, DT->getRootNode(), DT->getRootNode()->begin(), @@ -583,7 +588,7 @@ bool EarlyCSE::runOnFunction(Function &F) { while (!nodesToProcess.empty()) { // Grab the first item off the stack. Set the current generation, remove // the node from the stack, and process it. - StackNode *NodeToProcess = nodesToProcess.front(); + StackNode *NodeToProcess = nodesToProcess.back(); // Initialize class members. CurrentGeneration = NodeToProcess->currentGeneration(); @@ -597,7 +602,7 @@ bool EarlyCSE::runOnFunction(Function &F) { } else if (NodeToProcess->childIter() != NodeToProcess->end()) { // Push the next child onto the stack. 
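// Illustrative sketch (editorial, not from the LLVM sources): the container
// change in EarlyCSE::runOnFunction above. The node list is only ever used
// as a LIFO stack for an explicit depth-first walk of the dominator tree,
// so std::deque with push_front/pop_front can become std::vector with
// push_back/pop_back; both behave as stacks, so the visit order is
// unchanged while the allocation pattern is cheaper.
#include <cstddef>
#include <vector>

struct DomNodeModel {
  std::vector<DomNodeModel *> Children; // hypothetical stand-in for DomTreeNode
};

std::size_t countDomNodes(DomNodeModel *Root) {
  std::size_t Count = 0;
  std::vector<DomNodeModel *> Stack;
  Stack.push_back(Root);              // push_front before the change
  while (!Stack.empty()) {
    DomNodeModel *N = Stack.back();   // front() before the change
    Stack.pop_back();                 // pop_front() before the change
    ++Count;
    for (DomNodeModel *C : N->Children)
      Stack.push_back(C);
  }
  return Count;
}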
DomTreeNode *child = NodeToProcess->nextChild(); - nodesToProcess.push_front( + nodesToProcess.push_back( new StackNode(AvailableValues, AvailableLoads, AvailableCalls, @@ -607,7 +612,7 @@ bool EarlyCSE::runOnFunction(Function &F) { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. delete NodeToProcess; - nodesToProcess.pop_front(); + nodesToProcess.pop_back(); } } // while (!nodes...) diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index e7de07f246db..0430c1898c8d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,14 +11,15 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "flattencfg" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/CFG.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "flattencfg" + namespace { struct FlattenCFGPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid @@ -26,9 +27,9 @@ public: FlattenCFGPass() : FunctionPass(ID) { initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AliasAnalysis>(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 6af269dfed32..106eba099ca0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -15,35 +15,34 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Assembly/Writer.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/PatternMatch.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -51,6 +50,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "gvn" + STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); @@ -214,13 +215,13 @@ 
Expression ValueTable::create_cmp_expression(unsigned Opcode, } Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { - assert(EI != 0 && "Not an ExtractValueInst?"); + assert(EI && "Not an ExtractValueInst?"); Expression e; e.type = EI->getType(); e.opcode = 0; IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand()); - if (I != 0 && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) { + if (I != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) { // EI might be an extract from one of our recognised intrinsics. If it // is we'll synthesize a semantically equivalent expression instead on // an extract value expression. @@ -328,7 +329,7 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { const MemoryDependenceAnalysis::NonLocalDepInfo &deps = MD->getNonLocalCallDependency(CallSite(C)); // FIXME: Move the checking logic to MemDep! - CallInst* cdep = 0; + CallInst* cdep = nullptr; // Check to see if we have a single dominating call instruction that is // identical to C. @@ -339,8 +340,8 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { // We don't handle non-definitions. If we already have a call, reject // instruction dependencies. - if (!I->getResult().isDef() || cdep != 0) { - cdep = 0; + if (!I->getResult().isDef() || cdep != nullptr) { + cdep = nullptr; break; } @@ -351,7 +352,7 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { continue; } - cdep = 0; + cdep = nullptr; break; } @@ -552,7 +553,7 @@ namespace { static AvailableValueInBlock getUndef(BasicBlock *BB) { AvailableValueInBlock Res; Res.BB = BB; - Res.Val.setPointer(0); + Res.Val.setPointer(nullptr); Res.Val.setInt(UndefVal); Res.Offset = 0; return Res; @@ -587,7 +588,7 @@ namespace { bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; SetVector<BasicBlock *> DeadBlocks; @@ -612,11 +613,11 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) - : FunctionPass(ID), NoLoads(noloads), MD(0) { + : FunctionPass(ID), NoLoads(noloads), MD(nullptr) { initializeGVNPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; /// markInstructionForDeletion - This removes the specified instruction from /// our various maps and marks it for deletion. @@ -625,7 +626,7 @@ namespace { InstrsToErase.push_back(I); } - const DataLayout *getDataLayout() const { return TD; } + const DataLayout *getDataLayout() const { return DL; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } @@ -650,7 +651,7 @@ namespace { /// removeFromLeaderTable - Scan the list of values corresponding to a given /// value number, and remove the given instruction if encountered. 
void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { - LeaderTableEntry* Prev = 0; + LeaderTableEntry* Prev = nullptr; LeaderTableEntry* Curr = &LeaderTable[N]; while (Curr->Val != I || Curr->BB != BB) { @@ -662,8 +663,8 @@ namespace { Prev->Next = Curr->Next; } else { if (!Curr->Next) { - Curr->Val = 0; - Curr->BB = 0; + Curr->Val = nullptr; + Curr->BB = nullptr; } else { LeaderTableEntry* Next = Curr->Next; Curr->Val = Next->Val; @@ -677,14 +678,14 @@ namespace { SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit; // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AliasAnalysis>(); } @@ -727,7 +728,7 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) { INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) @@ -818,8 +819,7 @@ SpeculationFailure: // Mark as unavailable. EntryVal = 0; - for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I) - BBWorklist.push_back(*I); + BBWorklist.append(succ_begin(Entry), succ_end(Entry)); } while (!BBWorklist.empty()); return false; @@ -830,7 +830,7 @@ SpeculationFailure: /// CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, - const DataLayout &TD) { + const DataLayout &DL) { // If the loaded or stored value is an first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy() || @@ -839,8 +839,8 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return false; // The store has to be at least as big as the load. - if (TD.getTypeSizeInBits(StoredVal->getType()) < - TD.getTypeSizeInBits(LoadTy)) + if (DL.getTypeSizeInBits(StoredVal->getType()) < + DL.getTypeSizeInBits(LoadTy)) return false; return true; @@ -855,15 +855,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, - const DataLayout &TD) { - if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) - return 0; + const DataLayout &DL) { + if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL)) + return nullptr; // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); - uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); + uint64_t StoreSize = DL.getTypeSizeInBits(StoredValTy); + uint64_t LoadSize = DL.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { @@ -874,13 +874,13 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // Convert source pointers to integers, which can be bitcast. 
if (StoredValTy->getScalarType()->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy); + StoredValTy = DL.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } Type *TypeToCastTo = LoadedTy; if (TypeToCastTo->getScalarType()->isPointerTy()) - TypeToCastTo = TD.getIntPtrType(TypeToCastTo); + TypeToCastTo = DL.getIntPtrType(TypeToCastTo); if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); @@ -899,7 +899,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // Convert source pointers to integers, which can be manipulated. if (StoredValTy->getScalarType()->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy); + StoredValTy = DL.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } @@ -911,7 +911,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If this is a big-endian system, we need to shift the value down to the low // bits so that a truncate will work. - if (TD.isBigEndian()) { + if (DL.isBigEndian()) { Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize); StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt); } @@ -942,15 +942,15 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, - const DataLayout &TD) { + const DataLayout &DL) { // If the loaded or stored value is a first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy()) return -1; int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&TD); - Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &TD); + Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&DL); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &DL); if (StoreBase != LoadBase) return -1; @@ -972,7 +972,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, // If the load and store don't overlap at all, the store doesn't provide // anything to the load. In this case, they really don't alias at all, AA // must have gotten confused. - uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy); + uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy); if ((WriteSizeInBits & 7) | (LoadSize & 7)) return -1; @@ -1015,61 +1015,61 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being a clobbering store. static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, - const DataLayout &TD) { + const DataLayout &DL) { // Cannot handle reading from store of first-class aggregate yet. if (DepSI->getValueOperand()->getType()->isStructTy() || DepSI->getValueOperand()->getType()->isArrayTy()) return -1; Value *StorePtr = DepSI->getPointerOperand(); - uint64_t StoreSize =TD.getTypeSizeInBits(DepSI->getValueOperand()->getType()); + uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()); return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, - StorePtr, StoreSize, TD); + StorePtr, StoreSize, DL); } /// AnalyzeLoadFromClobberingLoad - This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. 
See if /// the other load can feed into the second load. static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, - LoadInst *DepLI, const DataLayout &TD){ + LoadInst *DepLI, const DataLayout &DL){ // Cannot handle reading from store of first-class aggregate yet. if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) return -1; Value *DepPtr = DepLI->getPointerOperand(); - uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType()); - int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD); + uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()); + int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL); if (R != -1) return R; // If we have a load/load clobber an DepLI can be widened to cover this load, // then we should widen it! int64_t LoadOffs = 0; const Value *LoadBase = - GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &TD); - unsigned LoadSize = TD.getTypeStoreSize(LoadTy); + GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &DL); + unsigned LoadSize = DL.getTypeStoreSize(LoadTy); unsigned Size = MemoryDependenceAnalysis:: - getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD); + getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, DL); if (Size == 0) return -1; - return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD); + return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL); } static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, - const DataLayout &TD) { + const DataLayout &DL) { // If the mem operation is a non-constant size, we can't handle it. ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength()); - if (SizeCst == 0) return -1; + if (!SizeCst) return -1; uint64_t MemSizeInBits = SizeCst->getZExtValue()*8; // If this is memset, we just need to see if the offset is valid in the size // of the memset.. if (MI->getIntrinsicID() == Intrinsic::memset) return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), - MemSizeInBits, TD); + MemSizeInBits, DL); // If we have a memcpy/memmove, the only case we can handle is if this is a // copy from constant memory. In that case, we can read directly from the @@ -1077,14 +1077,14 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemTransferInst *MTI = cast<MemTransferInst>(MI); Constant *Src = dyn_cast<Constant>(MTI->getSource()); - if (Src == 0) return -1; + if (!Src) return -1; - GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD)); - if (GV == 0 || !GV->isConstant()) return -1; + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &DL)); + if (!GV || !GV->isConstant()) return -1; // See if the access is within the bounds of the transfer. int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, - MI->getDest(), MemSizeInBits, TD); + MI->getDest(), MemSizeInBits, DL); if (Offset == -1) return Offset; @@ -1097,7 +1097,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - if (ConstantFoldLoadFromConstPtr(Src, &TD)) + if (ConstantFoldLoadFromConstPtr(Src, &DL)) return Offset; return -1; } @@ -1110,11 +1110,11 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, /// before we give up. 
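As a minimal standalone sketch of the coercion logic in GetStoreValueForLoad just below: plain integers stand in for IR values, sizes are in bytes, and all names are hypothetical rather than LLVM API.

#include <cstdint>

// Extract the bytes a narrow load wants from a wider stored integer that
// fully covers it, mirroring the endianness-aware shift-and-truncate below.
uint64_t extractCoveredLoad(uint64_t StoreVal, unsigned Offset,
                            unsigned StoreSize, unsigned LoadSize,
                            bool IsLittleEndian) {
  // Shift the wanted bytes down to the least-significant end.
  unsigned ShiftAmt = IsLittleEndian ? Offset * 8
                                     : (StoreSize - LoadSize - Offset) * 8;
  uint64_t V = StoreVal >> ShiftAmt;
  // Truncate to the load's width (guard the shift when LoadSize is 8 bytes).
  if (LoadSize < 8)
    V &= (UINT64_C(1) << (LoadSize * 8)) - 1;
  return V;
}

For example, a 1-byte load at offset 1 of a little-endian 4-byte store of 0x11223344 yields 0x33, matching what the real routine synthesizes with an lshr followed by a trunc.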
static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, - Instruction *InsertPt, const DataLayout &TD){ + Instruction *InsertPt, const DataLayout &DL){ LLVMContext &Ctx = SrcVal->getType()->getContext(); - uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; - uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8; + uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; + uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8; IRBuilder<> Builder(InsertPt->getParent(), InsertPt); @@ -1122,13 +1122,13 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // to an integer type to start with. if (SrcVal->getType()->getScalarType()->isPointerTy()) SrcVal = Builder.CreatePtrToInt(SrcVal, - TD.getIntPtrType(SrcVal->getType())); + DL.getIntPtrType(SrcVal->getType())); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; - if (TD.isLittleEndian()) + if (DL.isLittleEndian()) ShiftAmt = Offset*8; else ShiftAmt = (StoreSize-LoadSize-Offset)*8; @@ -1139,7 +1139,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, if (LoadSize != StoreSize) SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); - return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); + return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL); } /// GetLoadValueForLoad - This function is called when we have a @@ -1150,11 +1150,11 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, GVN &gvn) { - const DataLayout &TD = *gvn.getDataLayout(); + const DataLayout &DL = *gvn.getDataLayout(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. - unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); - unsigned LoadSize = TD.getTypeStoreSize(LoadTy); + unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()); + unsigned LoadSize = DL.getTypeStoreSize(LoadTy); if (Offset+LoadSize > SrcValSize) { assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); @@ -1186,7 +1186,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, // Replace uses of the original load with the wider load. On a big endian // system, we need to shift down to get the relevant bits. Value *RV = NewLoad; - if (TD.isBigEndian()) + if (DL.isBigEndian()) RV = Builder.CreateLShr(RV, NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits()); RV = Builder.CreateTrunc(RV, SrcVal->getType()); @@ -1201,7 +1201,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, SrcVal = NewLoad; } - return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD); + return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL); } @@ -1209,9 +1209,9 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, /// memdep query of a load that ends up being a clobbering mem intrinsic. 
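A minimal sketch of the memset case GetMemInstValueForLoad handles below, again with plain integers standing in for IR values; the helper name is hypothetical.

#include <cstdint>

// A load entirely covered by memset(p, B, n) sees the fill byte B replicated
// across the loaded width; endianness does not matter for a splatted byte.
uint64_t splatMemsetByte(uint8_t B, unsigned LoadSizeInBytes) {
  uint64_t Val = 0;
  for (unsigned I = 0; I != LoadSizeInBytes; ++I)
    Val |= uint64_t(B) << (I * 8);   // replicate the fill byte into each lane
  return Val;                        // e.g. B = 0xAB, 4 bytes -> 0xABABABAB
}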
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, Instruction *InsertPt, - const DataLayout &TD){ + const DataLayout &DL){ LLVMContext &Ctx = LoadTy->getContext(); - uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; + uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8; IRBuilder<> Builder(InsertPt->getParent(), InsertPt); @@ -1242,7 +1242,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, ++NumBytesSet; } - return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD); + return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, DL); } // Otherwise, this is a memcpy/memmove from a constant global. @@ -1258,7 +1258,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - return ConstantFoldLoadFromConstPtr(Src, &TD); + return ConstantFoldLoadFromConstPtr(Src, &DL); } @@ -1324,10 +1324,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); + const DataLayout *DL = gvn.getDataLayout(); + assert(DL && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *TD); + *DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' @@ -1346,10 +1346,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c << *Res << '\n' << "\n\n\n"); } } else if (isMemIntrinValue()) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); + const DataLayout *DL = gvn.getDataLayout(); + assert(DL && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *TD); + LoadTy, BB->getTerminator(), *DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1402,9 +1402,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // read by the load, we can extract the bits we need for the load from the // stored value. if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { - if (TD && Address) { + if (DL && Address) { int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address, - DepSI, *TD); + DepSI, *DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, DepSI->getValueOperand(), @@ -1421,10 +1421,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { // If this is a clobber and L is the first instruction in its block, then // we have the first instruction in the entry block. 
- if (DepLI != LI && Address && TD) { - int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), - LI->getPointerOperand(), - DepLI, *TD); + if (DepLI != LI && Address && DL) { + int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), Address, + DepLI, *DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, @@ -1437,9 +1436,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // If the clobbering value is a memset/memcpy/memmove, see if we can // forward a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { - if (TD && Address) { + if (DL && Address) { int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, - DepMI, *TD); + DepMI, *DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, Offset)); @@ -1465,14 +1464,21 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, continue; } + // Loading from calloc (which zero initializes memory) -> zero + if (isCallocLikeFn(DepInst, TLI)) { + ValuesPerBlock.push_back(AvailableValueInBlock::get( + DepBB, Constant::getNullValue(LI->getType()))); + continue; + } + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. if (S->getValueOperand()->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), - LI->getType(), *TD)) { + if (!DL || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), *DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1488,7 +1494,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LD->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ + if (!DL || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1541,7 +1547,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check to see how many predecessors have the loaded value fully // available. - DenseMap<BasicBlock*, Value*> PredLoads; + MapVector<BasicBlock *, Value *> PredLoads; DenseMap<BasicBlock*, char> FullyAvailableBlocks; for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; @@ -1555,7 +1561,6 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } - PredLoads[Pred] = 0; if (Pred->getTerminator()->getNumSuccessors() != 1) { if (isa<IndirectBrInst>(Pred->getTerminator())) { @@ -1572,11 +1577,14 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, } CriticalEdgePred.push_back(Pred); + } else { + // Only add the predecessors that will not be split for now. + PredLoads[Pred] = nullptr; } } // Decide whether PRE is profitable for this load. - unsigned NumUnavailablePreds = PredLoads.size(); + unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size(); assert(NumUnavailablePreds != 0 && "Fully available value should already be eliminated!"); @@ -1588,12 +1596,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; // Split critical edges, and update the unavailable predecessors accordingly. 
- for (SmallVectorImpl<BasicBlock *>::iterator I = CriticalEdgePred.begin(), - E = CriticalEdgePred.end(); I != E; I++) { - BasicBlock *OrigPred = *I; + for (BasicBlock *OrigPred : CriticalEdgePred) { BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); - PredLoads.erase(OrigPred); - PredLoads[NewPred] = 0; + assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!"); + PredLoads[NewPred] = nullptr; DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" << LoadBB->getName() << '\n'); } @@ -1601,9 +1607,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector<Instruction*, 8> NewInsts; - for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), - E = PredLoads.end(); I != E; ++I) { - BasicBlock *UnavailablePred = I->first; + for (auto &PredLoad : PredLoads) { + BasicBlock *UnavailablePred = PredLoad.first; // Do PHI translation to get its value in the predecessor if necessary. The // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. @@ -1611,21 +1616,21 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. - PHITransAddr Address(LI->getPointerOperand(), TD); - Value *LoadPtr = 0; + PHITransAddr Address(LI->getPointerOperand(), DL); + Value *LoadPtr = nullptr; LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, NewInsts); // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. - if (LoadPtr == 0) { + if (!LoadPtr) { DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " << *LI->getPointerOperand() << "\n"); CanDoPRE = false; break; } - I->second = LoadPtr; + PredLoad.second = LoadPtr; } if (!CanDoPRE) { @@ -1634,8 +1639,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - // HINT:Don't revert the edge-splitting as following transformation may - // also need to split these critial edges. + // HINT: Don't revert the edge-splitting as following transformation may + // also need to split these critical edges. 
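For orientation, a rough source-level shape of the partial redundancy that PerformLoadPRE is removing here; the function and variable names are made up for illustration.

// *p is loaded on one path into the join but not the other, so PRE inserts a
// copy of the load in the "unavailable" predecessor (splitting a critical
// edge first if necessary) and the original load becomes a PHI.
int partiallyRedundant(int *p, bool c) {
  int x = 0;
  if (c)
    x = *p;      // load available on this predecessor
  int y = *p;    // partially redundant: after PRE, y is a PHI of x and a
                 // load inserted on the other incoming edge
  return x + y;
}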
return !CriticalEdgePred.empty(); } @@ -1656,10 +1661,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, VN.lookup_or_add(NewInsts[i]); } - for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), - E = PredLoads.end(); I != E; ++I) { - BasicBlock *UnavailablePred = I->first; - Value *LoadPtr = I->second; + for (const auto &PredLoad : PredLoads) { + BasicBlock *UnavailablePred = PredLoad.first; + Value *LoadPtr = PredLoad.second; Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, LI->getAlignment(), @@ -1712,7 +1716,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { DEBUG( dbgs() << "GVN: non-local load "; - WriteAsOperand(dbgs(), LI); + LI->printAsOperand(dbgs()); dbgs() << " has unknown dependencies\n"; ); return false; @@ -1778,7 +1782,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { MDNode *ReplMD = Metadata[i].second; switch(Kind) { default: - ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata + ReplInst->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); @@ -1789,11 +1793,15 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD)); break; case LLVMContext::MD_prof: - llvm_unreachable("MD_prof in a non terminator instruction"); + llvm_unreachable("MD_prof in a non-terminator instruction"); break; case LLVMContext::MD_fpmath: ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); break; + case LLVMContext::MD_invariant_load: + // Only set the !invariant.load if it is present in both instructions. + ReplInst->setMetadata(Kind, IMD); + break; } } } @@ -1823,7 +1831,7 @@ bool GVN::processLoad(LoadInst *L) { // If we have a clobber and target data is around, see if this is a clobber // that we can fix up through code synthesis. - if (Dep.isClobber() && TD) { + if (Dep.isClobber() && DL) { // Check to see if we have something like this: // store i32 123, i32* %P // %A = bitcast i32* %P to i8* @@ -1834,14 +1842,14 @@ bool GVN::processLoad(LoadInst *L) { // a common base + constant offset, and if the previous store (or memset) // completely covers this load. This sort of thing can happen in bitfield // access code. 
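Roughly the kind of source that lowers to the pattern described in the comment above; this is an illustrative sketch that assumes a little-endian target, not code from the tree.

#include <cstdint>
#include <cstring>

struct Bytes { unsigned char a, b, c, d; };

// A wide store followed by a narrow load of some of its bytes: GVN can
// satisfy the load by extracting bits from the stored value instead of
// re-reading memory.
unsigned char readSecondByte(Bytes *F) {
  std::uint32_t Word = 123;             // wide i32 store ...
  std::memcpy(F, &Word, sizeof(Word));
  return F->b;                          // ... narrow i8 load at offset 1;
                                        // little-endian: (123 >> 8) & 0xff == 0
}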
- Value *AvailVal = 0; + Value *AvailVal = nullptr; if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) { int Offset = AnalyzeLoadFromClobberingStore(L->getType(), L->getPointerOperand(), - DepSI, *TD); + DepSI, *DL); if (Offset != -1) AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, - L->getType(), L, *TD); + L->getType(), L, *DL); } // Check to see if we have something like this: @@ -1856,7 +1864,7 @@ bool GVN::processLoad(LoadInst *L) { int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), L->getPointerOperand(), - DepLI, *TD); + DepLI, *DL); if (Offset != -1) AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); } @@ -1866,9 +1874,9 @@ bool GVN::processLoad(LoadInst *L) { if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), L->getPointerOperand(), - DepMI, *TD); + DepMI, *DL); if (Offset != -1) - AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD); + AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *DL); } if (AvailVal) { @@ -1890,7 +1898,7 @@ bool GVN::processLoad(LoadInst *L) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; - WriteAsOperand(dbgs(), L); + L->printAsOperand(dbgs()); Instruction *I = Dep.getInst(); dbgs() << " is clobbered by " << *I << '\n'; ); @@ -1905,7 +1913,7 @@ bool GVN::processLoad(LoadInst *L) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; - WriteAsOperand(dbgs(), L); + L->printAsOperand(dbgs()); dbgs() << " has unknown dependence\n"; ); return false; @@ -1919,10 +1927,10 @@ bool GVN::processLoad(LoadInst *L) { // actually have the same type. See if we know how to reuse the stored // value (depending on its type). if (StoredVal->getType() != L->getType()) { - if (TD) { + if (DL) { StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(), - L, *TD); - if (StoredVal == 0) + L, *DL); + if (!StoredVal) return false; DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal @@ -1948,10 +1956,10 @@ bool GVN::processLoad(LoadInst *L) { // the same type. See if we know how to reuse the previously loaded value // (depending on its type). if (DepLI->getType() != L->getType()) { - if (TD) { + if (DL) { AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), - L, *TD); - if (AvailableVal == 0) + L, *DL); + if (!AvailableVal) return false; DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal @@ -1991,6 +1999,15 @@ bool GVN::processLoad(LoadInst *L) { } } + // If this load follows a calloc (which zero initializes memory), + // then the loaded value is zero + if (isCallocLikeFn(DepInst, TLI)) { + L->replaceAllUsesWith(Constant::getNullValue(L->getType())); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + return false; } @@ -2001,9 +2018,9 @@ bool GVN::processLoad(LoadInst *L) { // a few comparisons of DFS numbers. 
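A small illustration of the calloc forwarding added just above, assuming nothing has stored to the buffer before the load; the example is hypothetical.

#include <cstdlib>

// calloc returns zero-initialized memory, so a load with no intervening
// store is known to be zero and the load itself can be deleted.
int firstElement() {
  int *P = static_cast<int *>(std::calloc(4, sizeof(int)));
  int V = P ? P[0] : -1;   // P[0] folds to 0: calloc zero-fills the buffer
  std::free(P);
  return V;
}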
Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { LeaderTableEntry Vals = LeaderTable[num]; - if (!Vals.Val) return 0; + if (!Vals.Val) return nullptr; - Value *Val = 0; + Value *Val = nullptr; if (DT->dominates(Vals.BB, BB)) { Val = Vals.Val; if (isa<Constant>(Val)) return Val; @@ -2030,7 +2047,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ) { - Use &U = (UI++).getUse(); + Use &U = *UI++; if (DT->dominates(Root, U)) { U.set(To); @@ -2054,7 +2071,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, const BasicBlock *Src = E.getStart(); assert((!Pred || Pred == Src) && "No edge between these basic blocks!"); (void)Src; - return Pred != 0; + return Pred != nullptr; } /// propagateEquality - The given values are known to be equal in every block @@ -2202,7 +2219,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. - if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { + if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) { I->replaceAllUsesWith(V); if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); @@ -2298,7 +2315,7 @@ bool GVN::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. Value *repl = findLeader(I->getParent(), Num); - if (repl == 0) { + if (!repl) { // Failure, just remember this instance for future use. addToLeaderTable(Num, I, I->getParent()); return false; @@ -2314,10 +2331,14 @@ bool GVN::processInstruction(Instruction *I) { /// runOnFunction - This is the main transformation entry point for a function. bool GVN::runOnFunction(Function& F) { + if (skipOptnoneFunction(F)) + return false; + if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); - DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); @@ -2419,10 +2440,7 @@ bool GVN::processBlock(BasicBlock *BB) { bool GVN::performPRE(Function &F) { bool Changed = false; SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; - for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), - DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { - BasicBlock *CurrentBlock = *DI; - + for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { // Nothing to PRE in the entry block. if (CurrentBlock == &F.getEntryBlock()) continue; @@ -2462,7 +2480,7 @@ bool GVN::performPRE(Function &F) { // more complicated to get right. 
unsigned NumWith = 0; unsigned NumWithout = 0; - BasicBlock *PREPred = 0; + BasicBlock *PREPred = nullptr; predMap.clear(); for (pred_iterator PI = pred_begin(CurrentBlock), @@ -2480,8 +2498,8 @@ bool GVN::performPRE(Function &F) { } Value* predV = findLeader(P, ValNo); - if (predV == 0) { - predMap.push_back(std::make_pair(static_cast<Value *>(0), P)); + if (!predV) { + predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; ++NumWithout; } else if (predV == CurInst) { @@ -2635,9 +2653,8 @@ bool GVN::iterateOnFunction(Function &F) { // std::vector<BasicBlock *> BBVect; BBVect.reserve(256); - for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), - DE = df_end(DT->getRootNode()); DI != DE; ++DI) - BBVect.push_back(DI->getBlock()); + for (DomTreeNode *x : depth_first(DT->getRootNode())) + BBVect.push_back(x->getBlock()); for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); I != E; I++) diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp deleted file mode 100644 index 954e5459810d..000000000000 --- a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp +++ /dev/null @@ -1,310 +0,0 @@ -//===-- GlobalMerge.cpp - Internal globals merging -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This pass merges globals with internal linkage into one. This way all the -// globals which were merged into a biggest one can be addressed using offsets -// from the same base pointer (no need for separate base pointer for each of the -// global). Such a transformation can significantly reduce the register pressure -// when many globals are involved. -// -// For example, consider the code which touches several global variables at -// once: -// -// static int foo[N], bar[N], baz[N]; -// -// for (i = 0; i < N; ++i) { -// foo[i] = bar[i] * baz[i]; -// } -// -// On ARM the addresses of 3 arrays should be kept in the registers, thus -// this code has quite large register pressure (loop body): -// -// ldr r1, [r5], #4 -// ldr r2, [r6], #4 -// mul r1, r2, r1 -// str r1, [r0], #4 -// -// Pass converts the code to something like: -// -// static struct { -// int foo[N]; -// int bar[N]; -// int baz[N]; -// } merged; -// -// for (i = 0; i < N; ++i) { -// merged.foo[i] = merged.bar[i] * merged.baz[i]; -// } -// -// and in ARM code this becomes: -// -// ldr r0, [r5, #40] -// ldr r1, [r5, #80] -// mul r0, r1, r0 -// str r0, [r5], #4 -// -// note that we saved 2 registers here almostly "for free". 
-// ===---------------------------------------------------------------------===// - -#define DEBUG_TYPE "global-merge" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -using namespace llvm; - -static cl::opt<bool> -EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, - cl::desc("Enable global merge pass on constants"), - cl::init(false)); - -STATISTIC(NumMerged , "Number of globals merged"); -namespace { - class GlobalMerge : public FunctionPass { - const TargetMachine *TM; - - bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst, unsigned AddrSpace) const; - - /// \brief Check if the given variable has been identified as must keep - /// \pre setMustKeepGlobalVariables must have been called on the Module that - /// contains GV - bool isMustKeepGlobalVariable(const GlobalVariable *GV) const { - return MustKeepGlobalVariables.count(GV); - } - - /// Collect every variables marked as "used" or used in a landing pad - /// instruction for this Module. - void setMustKeepGlobalVariables(Module &M); - - /// Collect every variables marked as "used" - void collectUsedGlobalVariables(Module &M); - - /// Keep track of the GlobalVariable that must not be merged away - SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables; - - public: - static char ID; // Pass identification, replacement for typeid. 
- explicit GlobalMerge(const TargetMachine *TM = 0) - : FunctionPass(ID), TM(TM) { - initializeGlobalMergePass(*PassRegistry::getPassRegistry()); - } - - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual bool doFinalization(Module &M); - - const char *getPassName() const { - return "Merge internal globals"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - FunctionPass::getAnalysisUsage(AU); - } - - struct GlobalCmp { - const DataLayout *TD; - - GlobalCmp(const DataLayout *td) : TD(td) { } - - bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { - Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); - Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); - - return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); - } - }; - }; -} // end anonymous namespace - -char GlobalMerge::ID = 0; -INITIALIZE_PASS(GlobalMerge, "global-merge", - "Global Merge", false, false) - - -bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst, unsigned AddrSpace) const { - const TargetLowering *TLI = TM->getTargetLowering(); - const DataLayout *TD = TLI->getDataLayout(); - - // FIXME: Infer the maximum possible offset depending on the actual users - // (these max offsets are different for the users inside Thumb or ARM - // functions) - unsigned MaxOffset = TLI->getMaximalGlobalOffset(); - - // FIXME: Find better heuristics - std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); - - Type *Int32Ty = Type::getInt32Ty(M.getContext()); - - for (size_t i = 0, e = Globals.size(); i != e; ) { - size_t j = 0; - uint64_t MergedSize = 0; - std::vector<Type*> Tys; - std::vector<Constant*> Inits; - for (j = i; j != e; ++j) { - Type *Ty = Globals[j]->getType()->getElementType(); - MergedSize += TD->getTypeAllocSize(Ty); - if (MergedSize > MaxOffset) { - break; - } - Tys.push_back(Ty); - Inits.push_back(Globals[j]->getInitializer()); - } - - StructType *MergedTy = StructType::get(M.getContext(), Tys); - Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); - GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, - GlobalValue::InternalLinkage, - MergedInit, "_MergedGlobals", - 0, GlobalVariable::NotThreadLocal, - AddrSpace); - for (size_t k = i; k < j; ++k) { - Constant *Idx[2] = { - ConstantInt::get(Int32Ty, 0), - ConstantInt::get(Int32Ty, k-i) - }; - Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); - Globals[k]->replaceAllUsesWith(GEP); - Globals[k]->eraseFromParent(); - NumMerged++; - } - i = j; - } - - return true; -} - -void GlobalMerge::collectUsedGlobalVariables(Module &M) { - // Extract global variables from llvm.used array - const GlobalVariable *GV = M.getGlobalVariable("llvm.used"); - if (!GV || !GV->hasInitializer()) return; - - // Should be an array of 'i8*'. 
- const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer()); - - for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) - if (const GlobalVariable *G = - dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts())) - MustKeepGlobalVariables.insert(G); -} - -void GlobalMerge::setMustKeepGlobalVariables(Module &M) { - collectUsedGlobalVariables(M); - - for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn; - ++IFn) { - for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end(); - IBB != IEndBB; ++IBB) { - // Follow the inwoke link to find the landing pad instruction - const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator()); - if (!II) continue; - - const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst(); - // Look for globals in the clauses of the landing pad instruction - for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses(); - Idx != NumClauses; ++Idx) - if (const GlobalVariable *GV = - dyn_cast<GlobalVariable>(LPInst->getClause(Idx) - ->stripPointerCasts())) - MustKeepGlobalVariables.insert(GV); - } - } -} - -bool GlobalMerge::doInitialization(Module &M) { - DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, - BSSGlobals; - const TargetLowering *TLI = TM->getTargetLowering(); - const DataLayout *TD = TLI->getDataLayout(); - unsigned MaxOffset = TLI->getMaximalGlobalOffset(); - bool Changed = false; - setMustKeepGlobalVariables(M); - - // Grab all non-const globals. - for (Module::global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { - // Merge is safe for "normal" internal globals only - if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) - continue; - - PointerType *PT = dyn_cast<PointerType>(I->getType()); - assert(PT && "Global variable is not a pointer!"); - - unsigned AddressSpace = PT->getAddressSpace(); - - // Ignore fancy-aligned globals for now. - unsigned Alignment = TD->getPreferredAlignment(I); - Type *Ty = I->getType()->getElementType(); - if (Alignment > TD->getABITypeAlignment(Ty)) - continue; - - // Ignore all 'special' globals. 
- if (I->getName().startswith("llvm.") || - I->getName().startswith(".llvm.")) - continue; - - // Ignore all "required" globals: - if (isMustKeepGlobalVariable(I)) - continue; - - if (TD->getTypeAllocSize(Ty) < MaxOffset) { - if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) - .isBSSLocal()) - BSSGlobals[AddressSpace].push_back(I); - else if (I->isConstant()) - ConstGlobals[AddressSpace].push_back(I); - else - Globals[AddressSpace].push_back(I); - } - } - - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = Globals.begin(), E = Globals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); - - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); - - if (EnableGlobalMergeOnConst) - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, true, I->first); - - return Changed; -} - -bool GlobalMerge::runOnFunction(Function &F) { - return false; -} - -bool GlobalMerge::doFinalization(Module &M) { - MustKeepGlobalVariables.clear(); - return false; -} - -Pass *llvm::createGlobalMergePass(const TargetMachine *TM) { - return new GlobalMerge(TM); -} diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 235aaaa6f801..e83a5c421b47 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -24,23 +24,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "indvars" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -50,6 +49,8 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; +#define DEBUG_TYPE "indvars" + STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); @@ -63,12 +64,15 @@ static cl::opt<bool> VerifyIndvars( "verify-indvars", cl::Hidden, cl::desc("Verify the ScalarEvolution result after running indvars")); +static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden, + cl::desc("Reduce live induction variables.")); + namespace { class IndVarSimplify : public LoopPass { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; - DataLayout *TD; + const DataLayout *DL; TargetLibraryInfo *TLI; SmallVector<WeakVH, 16> DeadInsts; @@ -76,15 +80,15 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0), - Changed(false) { + 
IndVarSimplify() : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), + DL(nullptr), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); @@ -96,7 +100,7 @@ namespace { } private: - virtual void releaseMemory() { + void releaseMemory() override { DeadInsts.clear(); } @@ -119,7 +123,7 @@ namespace { char IndVarSimplify::ID = 0; INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -193,7 +197,7 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, if (!PHI) return User; - Instruction *InsertPt = 0; + Instruction *InsertPt = nullptr; for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { if (PHI->getIncomingValue(i) != Def) continue; @@ -254,34 +258,34 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // an add or increment value can not be represented by an integer. BinaryOperator *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); - if (Incr == 0 || Incr->getOpcode() != Instruction::FAdd) return; + if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp // is not an integer, bail out. ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); int64_t IncValue; - if (IncValueVal == 0 || Incr->getOperand(0) != PN || + if (IncValueVal == nullptr || Incr->getOperand(0) != PN || !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) return; // Check Incr uses. One user is PN and the other user is an exit condition // used by the conditional terminator. - Value::use_iterator IncrUse = Incr->use_begin(); + Value::user_iterator IncrUse = Incr->user_begin(); Instruction *U1 = cast<Instruction>(*IncrUse++); - if (IncrUse == Incr->use_end()) return; + if (IncrUse == Incr->user_end()) return; Instruction *U2 = cast<Instruction>(*IncrUse++); - if (IncrUse != Incr->use_end()) return; + if (IncrUse != Incr->user_end()) return; // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't // only used by a branch, we can't transform it. FCmpInst *Compare = dyn_cast<FCmpInst>(U1); if (!Compare) Compare = dyn_cast<FCmpInst>(U2); - if (Compare == 0 || !Compare->hasOneUse() || - !isa<BranchInst>(Compare->use_back())) + if (!Compare || !Compare->hasOneUse() || + !isa<BranchInst>(Compare->user_back())) return; - BranchInst *TheBr = cast<BranchInst>(Compare->use_back()); + BranchInst *TheBr = cast<BranchInst>(Compare->user_back()); // We need to verify that the branch actually controls the iteration count // of the loop. If not, the new IV can overflow and no one will notice. @@ -298,7 +302,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // transform it. 
ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); int64_t ExitValue; - if (ExitValueVal == 0 || + if (ExitValueVal == nullptr || !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) return; @@ -494,6 +498,21 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { unsigned NumPreds = PN->getNumIncomingValues(); + // We would like to be able to RAUW single-incoming value PHI nodes. We + // have to be certain this is safe even when this is an LCSSA PHI node. + // While the computed exit value is no longer varying in *this* loop, the + // exit block may be an exit block for an outer containing loop as well, + // the exit value may be varying in the outer loop, and thus it may still + // require an LCSSA PHI node. The safe case is when this is + // single-predecessor PHI node (LCSSA) and the exit block containing it is + // part of the enclosing loop, or this is the outer most loop of the nest. + // In either case the exit value could (at most) be varying in the same + // loop body as the phi node itself. Thus if it is in turn used outside of + // an enclosing loop it will only be via a separate LCSSA node. + bool LCSSASafePhiForRAUW = + NumPreds == 1 && + (!L->getParentLoop() || L->getParentLoop() == LI->getLoopFor(ExitBB)); + // Iterate over all of the PHI nodes. BasicBlock::iterator BBI = ExitBB->begin(); while ((PN = dyn_cast<PHINode>(BBI++))) { @@ -545,8 +564,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { unsigned NumHardInternalUses = 0; unsigned NumSoftExternalUses = 0; unsigned NumUses = 0; - for (Value::use_iterator IB=Inst->use_begin(), IE=Inst->use_end(); - IB!=IE && NumUses<=6 ; ++IB) { + for (auto IB = Inst->user_begin(), IE = Inst->user_end(); + IB != IE && NumUses <= 6; ++IB) { Instruction *UseInstr = cast<Instruction>(*IB); unsigned Opc = UseInstr->getOpcode(); NumUses++; @@ -558,9 +577,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Do not count the Phi as a use. LCSSA may have inserted // plenty of trivial ones. NumUses--; - for (Value::use_iterator PB=UseInstr->use_begin(), - PE=UseInstr->use_end(); - PB!=PE && NumUses<=6 ; ++PB, ++NumUses) { + for (auto PB = UseInstr->user_begin(), + PE = UseInstr->user_end(); + PB != PE && NumUses <= 6; ++PB, ++NumUses) { unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode(); if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret) NumSoftExternalUses++; @@ -594,17 +613,18 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { if (isInstructionTriviallyDead(Inst, TLI)) DeadInsts.push_back(Inst); - if (NumPreds == 1) { - // Completely replace a single-pred PHI. This is safe, because the - // NewVal won't be variant in the loop, so we don't need an LCSSA phi - // node anymore. + // If we determined that this PHI is safe to replace even if an LCSSA + // PHI, do so. + if (LCSSASafePhiForRAUW) { PN->replaceAllUsesWith(ExitVal); PN->eraseFromParent(); } } - if (NumPreds != 1) { - // Clone the PHI and delete the original one. This lets IVUsers and - // any other maps purge the original user from their records. + + // If we were unable to completely replace the PHI node, clone the PHI + // and delete the original one. This lets IVUsers and any other maps + // purge the original user from their records. 
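For context, a rough source-level example of the exit-value rewriting that RewriteLoopExitValues performs, with a made-up function name.

// The value of 'i' after the loop is computable by ScalarEvolution as a
// closed-form expression of 'n', so the loop-closed PHI for 'i' can be
// replaced by that expression, subject to the LCSSA safety check above.
int exitValue(int n) {
  int i = 0;
  while (i < n)
    ++i;         // induction variable with a computable trip count
  return i;      // loop-closed use: rewritten to, roughly, max(n, 0)
}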
+ if (!LCSSASafePhiForRAUW) { PHINode *NewPN = cast<PHINode>(PN->clone()); NewPN->takeName(PN); NewPN->insertBefore(PN); @@ -632,36 +652,23 @@ namespace { Type *WidestNativeType; // Widest integer type created [sz]ext bool IsSigned; // Was an sext user seen before a zext? - WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} - }; - - class WideIVVisitor : public IVVisitor { - ScalarEvolution *SE; - const DataLayout *TD; - - public: - WideIVInfo WI; - - WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, - const DataLayout *TData) : - SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } - - // Implement the interface used by simplifyUsersOfIV. - virtual void visitCast(CastInst *Cast); + WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), + IsSigned(false) {} }; } /// visitCast - Update information about the induction variable that is /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. -void WideIVVisitor::visitCast(CastInst *Cast) { +static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, + const DataLayout *DL) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) return; Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); - if (TD && !TD->isLegalInteger(Width)) + if (DL && !DL->isLegalInteger(Width)) return; if (!WI.WidestNativeType) { @@ -688,7 +695,7 @@ struct NarrowIVDefUse { Instruction *NarrowUse; Instruction *WideDef; - NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {} + NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} @@ -731,9 +738,9 @@ public: L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree), - WidePhi(0), - WideInc(0), - WideIncExpr(0), + WidePhi(nullptr), + WideInc(nullptr), + WideIncExpr(nullptr), DeadInsts(DI) { assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); } @@ -788,7 +795,7 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: - return 0; + return nullptr; case Instruction::Add: case Instruction::Mul: case Instruction::UDiv: @@ -833,14 +840,14 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { // Handle the common case of add<nsw/nuw> if (DU.NarrowUse->getOpcode() != Instruction::Add) - return 0; + return nullptr; // One operand (NarrowDef) has already been extended to WideDef. Now determine // if extending the other will lead to a recurrence. unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0; assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); - const SCEV *ExtendOperExpr = 0; + const SCEV *ExtendOperExpr = nullptr; const OverflowingBinaryOperator *OBO = cast<OverflowingBinaryOperator>(DU.NarrowUse); if (IsSigned && OBO->hasNoSignedWrap()) @@ -850,7 +857,7 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { ExtendOperExpr = SE->getZeroExtendExpr( SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); else - return 0; + return nullptr; // When creating this AddExpr, don't apply the current operations NSW or NUW // flags. 
This instruction may be guarded by control flow that the no-wrap @@ -861,7 +868,7 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr)); if (!AddRec || AddRec->getLoop() != L) - return 0; + return nullptr; return AddRec; } @@ -872,14 +879,14 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { /// recurrence. Otherwise return NULL. const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) - return 0; + return nullptr; const SCEV *NarrowExpr = SE->getSCEV(NarrowUse); if (SE->getTypeSizeInBits(NarrowExpr->getType()) >= SE->getTypeSizeInBits(WideType)) { // NarrowUse implicitly widens its operand. e.g. a gep with a narrow // index. So don't follow this use. - return 0; + return nullptr; } const SCEV *WideExpr = IsSigned ? @@ -887,19 +894,47 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { SE->getZeroExtendExpr(NarrowExpr, WideType); const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr); if (!AddRec || AddRec->getLoop() != L) - return 0; + return nullptr; return AddRec; } +/// This IV user cannot be widen. Replace this use of the original narrow IV +/// with a truncation of the new wide IV to isolate and eliminate the narrow IV. +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { + DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef + << " for user " << *DU.NarrowUse << "\n"); + IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); + DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); +} + /// WidenIVUse - Determine whether an individual user of the narrow IV can be /// widened. If so, return the wide clone of the user. Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. - if (isa<PHINode>(DU.NarrowUse) && - LI->getLoopFor(DU.NarrowUse->getParent()) != L) - return 0; - + if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) { + if (LI->getLoopFor(UsePhi->getParent()) != L) { + // For LCSSA phis, sink the truncate outside the loop. + // After SimplifyCFG most loop exit targets have a single predecessor. + // Otherwise fall back to a truncate within the loop. + if (UsePhi->getNumOperands() != 1) + truncateIVUse(DU, DT); + else { + PHINode *WidePhi = + PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", + UsePhi); + WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); + IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt()); + Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); + UsePhi->replaceAllUsesWith(Trunc); + DeadInsts.push_back(UsePhi); + DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi + << " to " << *WidePhi << "\n"); + } + return nullptr; + } + } // Our raison d'etre! Eliminate sign and zero extension. if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) { Value *NewDef = DU.WideDef; @@ -935,7 +970,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // push the uses of WideDef here. // No further widening is needed. The deceased [sz]ext had done it for us. - return 0; + return nullptr; } // Does this user itself evaluate to a recurrence after widening? 
@@ -947,10 +982,8 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); - Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); - DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); - return 0; + truncateIVUse(DU, DT); + return nullptr; } // Assume block terminators cannot evaluate to a recurrence. We can't to // insert a Trunc after a terminator if there happens to be a critical edge. @@ -959,14 +992,14 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. - Instruction *WideUse = 0; + Instruction *WideUse = nullptr; if (WideAddRec == WideIncExpr && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { WideUse = CloneIVUser(DU); if (!WideUse) - return 0; + return nullptr; } // Evaluation of WideAddRec ensured that the narrow expression could be // extended outside the loop without overflow. This suggests that the wide use @@ -977,7 +1010,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n"); DeadInsts.push_back(WideUse); - return 0; + return nullptr; } // Returning WideUse pushes it on the worklist. @@ -987,15 +1020,14 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { /// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. /// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { - for (Value::use_iterator UI = NarrowDef->use_begin(), - UE = NarrowDef->use_end(); UI != UE; ++UI) { - Instruction *NarrowUse = cast<Instruction>(*UI); + for (User *U : NarrowDef->users()) { + Instruction *NarrowUser = cast<Instruction>(U); // Handle data flow merges and bizarre phi cycles. - if (!Widened.insert(NarrowUse)) + if (!Widened.insert(NarrowUser)) continue; - NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef)); + NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); } } @@ -1013,7 +1045,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); if (!AddRec) - return NULL; + return nullptr; // Widen the induction variable expression. const SCEV *WideIVExpr = IsSigned ? @@ -1026,7 +1058,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Can the IV be extended outside the loop without overflow? AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr); if (!AddRec || AddRec->getLoop() != L) - return NULL; + return nullptr; // An AddRec must have loop-invariant operands. Since this AddRec is // materialized by a loop header phi, the expression cannot have any post-loop @@ -1080,9 +1112,36 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { } //===----------------------------------------------------------------------===// +// Live IV Reduction - Minimize IVs live across the loop. 
+//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// // Simplification of IV users based on SCEV evaluation. //===----------------------------------------------------------------------===// +namespace { + class IndVarSimplifyVisitor : public IVVisitor { + ScalarEvolution *SE; + const DataLayout *DL; + PHINode *IVPhi; + + public: + WideIVInfo WI; + + IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, + const DataLayout *DL, const DominatorTree *DTree): + SE(SCEV), DL(DL), IVPhi(IV) { + DT = DTree; + WI.NarrowIV = IVPhi; + if (ReduceLiveIVs) + setSplitOverflowIntrinsics(); + } + + // Implement the interface used by simplifyUsersOfIV. + void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, DL); } + }; +} /// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV /// users. Each successive simplification may push more users which may @@ -1114,12 +1173,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. - WideIVVisitor WIV(CurrIV, SE, TD); + IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV); + Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); - if (WIV.WI.WidestNativeType) { - WideIVs.push_back(WIV.WI); + if (Visitor.WI.WidestNativeType) { + WideIVs.push_back(Visitor.WI); } } while(!LoopPhis.empty()); @@ -1225,7 +1284,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { Instruction *IncI = dyn_cast<Instruction>(IncV); if (!IncI) - return 0; + return nullptr; switch (IncI->getOpcode()) { case Instruction::Add: @@ -1236,17 +1295,17 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { if (IncI->getNumOperands() == 2) break; default: - return 0; + return nullptr; } PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); if (Phi && Phi->getParent() == L->getHeader()) { if (isLoopInvariant(IncI->getOperand(1), L, DT)) return Phi; - return 0; + return nullptr; } if (IncI->getOpcode() == Instruction::GetElementPtr) - return 0; + return nullptr; // Allow add/sub to be commuted. Phi = dyn_cast<PHINode>(IncI->getOperand(1)); @@ -1254,7 +1313,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { if (isLoopInvariant(IncI->getOperand(0), L, DT)) return Phi; } - return 0; + return nullptr; } /// Return the compare guarding the loop latch, or NULL for unrecognized tests. @@ -1264,7 +1323,7 @@ static ICmpInst *getLoopTest(Loop *L) { BasicBlock *LatchBlock = L->getLoopLatch(); // Don't bother with LFTR if the loop is not properly simplified. 
if (!LatchBlock) - return 0; + return nullptr; BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); assert(BI && "expected exit branch"); @@ -1359,15 +1418,11 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); Value *IncV = Phi->getIncomingValue(LatchIdx); - for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end(); - UI != UE; ++UI) { - if (*UI != Cond && *UI != IncV) return false; - } + for (User *U : Phi->users()) + if (U != Cond && U != IncV) return false; - for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end(); - UI != UE; ++UI) { - if (*UI != Cond && *UI != Phi) return false; - } + for (User *U : IncV->users()) + if (U != Cond && U != Phi) return false; return true; } @@ -1386,15 +1441,15 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// could at least handle constant BECounts. static PHINode * FindLoopCounter(Loop *L, const SCEV *BECount, - ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) { + ScalarEvolution *SE, DominatorTree *DT, const DataLayout *DL) { uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); Value *Cond = cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition(); // Loop over all of the PHI nodes, looking for a simple counter. - PHINode *BestPhi = 0; - const SCEV *BestInit = 0; + PHINode *BestPhi = nullptr; + const SCEV *BestInit = nullptr; BasicBlock *LatchBlock = L->getLoopLatch(); assert(LatchBlock && "needsLFTR should guarantee a loop latch"); @@ -1415,7 +1470,7 @@ FindLoopCounter(Loop *L, const SCEV *BECount, // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); - if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth))) + if (PhiWidth < BCWidth || (DL && !DL->isLegalInteger(PhiWidth))) continue; const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); @@ -1518,7 +1573,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // IVInit integer and IVCount pointer would only occur if a canonical IV // were generated on top of case #2, which is not expected. - const SCEV *IVLimit = 0; + const SCEV *IVLimit = nullptr; // For unit stride, IVCount = Start + BECount with 2's complement overflow. // For non-zero Start, compute IVCount here. if (AR->getStart()->isZero()) @@ -1697,13 +1752,12 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { // Determine if there is a use in or before the loop (direct or // otherwise). 
bool UsedInLoop = false; - for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) { - User *U = *UI; - BasicBlock *UseBB = cast<Instruction>(U)->getParent(); - if (PHINode *P = dyn_cast<PHINode>(U)) { + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + BasicBlock *UseBB = User->getParent(); + if (PHINode *P = dyn_cast<PHINode>(User)) { unsigned i = - PHINode::getIncomingValueNumForOperand(UI.getOperandNo()); + PHINode::getIncomingValueNumForOperand(U.getOperandNo()); UseBB = P->getIncomingBlock(i); } if (UseBB == Preheader || L->contains(UseBB)) { @@ -1743,6 +1797,9 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { //===----------------------------------------------------------------------===// bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + // If LoopSimplify form is not available, stay out of trouble. Some notes: // - LSR currently only supports LoopSimplify-form loops. Indvars' // canonicalization can be a pessimization without LSR to "clean up" @@ -1756,8 +1813,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); - DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); DeadInsts.clear(); @@ -1799,13 +1857,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { - PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, DL); if (IndVar) { // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead any // pass that uses the SCEVExpander must do it. This does not work well for - // loop passes because SCEVExpander makes assumptions about all loops, while - // LoopPassManager only forces the current loop to be simplified. + // loop passes because SCEVExpander makes assumptions about all loops, + // while LoopPassManager only forces the current loop to be simplified. // // FIXME: SCEV expansion has no way to bail out, so the caller must // explicitly check any assumptions made by SCEV. Brittle. 
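A recurring change in the IndVarSimplify hunks above is the move from explicit Value::use_iterator loops to the range-based users()/uses() accessors. The following is a minimal sketch of the two patterns, not part of the diff; it assumes only the Value/Use/User and PHINode interfaces already visible in the hunks, and the visitUsers helper itself is hypothetical.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
using namespace llvm;

// Hypothetical helper: walk the users of a definition the way the updated
// pass code does.
static void visitUsers(Instruction *Def) {
  // users() yields the using Value directly; suitable when only the user
  // instruction matters (cf. pushNarrowIVUsers and AlmostDeadIV above).
  for (User *U : Def->users()) {
    Instruction *UserInst = cast<Instruction>(U);
    (void)UserInst; // inspect or enqueue the user here
  }

  // uses() yields the Use edge itself; needed when the operand slot matters,
  // e.g. mapping a PHI operand to its incoming block (cf. SinkUnusedInvariants).
  for (Use &U : Def->uses()) {
    if (PHINode *PN = dyn_cast<PHINode>(U.getUser())) {
      unsigned Idx = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
      (void)PN->getIncomingBlock(Idx);
    }
  }
}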
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b3ec2fc84c03..21f80385cf46 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jump-threading" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -27,10 +26,10 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -38,6 +37,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "jump-threading" + STATISTIC(NumThreads, "Number of jumps threaded"); STATISTIC(NumFolds, "Number of terminators folded"); STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); @@ -76,7 +77,7 @@ namespace { /// revectored to the false side of the second if. /// class JumpThreading : public FunctionPass { - DataLayout *TD; + const DataLayout *DL; TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG @@ -105,9 +106,9 @@ namespace { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); AU.addRequired<TargetLibraryInfo>(); @@ -148,11 +149,24 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } /// runOnFunction - Top level algorithm. /// bool JumpThreading::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); LVI = &getAnalysis<LazyValueInfo>(); + // Remove unreachable blocks from function as they may result in infinite + // loop. We do threading if we found something profitable. Jump threading a + // branch can create other opportunities. If these opportunities form a cycle + // i.e. if any jump treading is undoing previous threading in the path, then + // we will loop forever. We take care of this issue by not jump threading for + // back edges. This works for normal cases but not for unreachable blocks as + // they may have cycle with no back edge. + removeUnreachableBlocks(F); + FindLoopHeaders(F); bool Changed, EverChanged = false; @@ -251,7 +265,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->hasFnAttr(Attribute::NoDuplicate)) + if (CI->cannotDuplicate()) // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated. 
return ~0U; @@ -304,7 +318,7 @@ void JumpThreading::FindLoopHeaders(Function &F) { /// Returns null if Val is null or not an appropriate constant. static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { if (!Val) - return 0; + return nullptr; // Undef is "known" enough. if (UndefValue *U = dyn_cast<UndefValue>(Val)) @@ -348,7 +362,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // If V is a non-instruction value, or an instruction in a different block, // then it can't be derived from a PHI. Instruction *I = dyn_cast<Instruction>(V); - if (I == 0 || I->getParent() != BB) { + if (!I || I->getParent() != BB) { // Okay, if this is a live-in value, see if it has a known value at the end // of any of our predecessors. @@ -490,8 +504,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, Value *LHS = PN->getIncomingValue(i); Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); - Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD); - if (Res == 0) { + Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL); + if (!Res) { if (!isa<Constant>(RHS)) continue; @@ -577,7 +591,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // Either operand will do, so be sure to pick the one that's a known // constant. // FIXME: Do this more cleverly if both values are known constants? - KnownCond = (TrueVal != 0); + KnownCond = (TrueVal != nullptr); } // See if the select has a known constant value for this predecessor. @@ -655,14 +669,9 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { if (LoopHeaders.erase(SinglePred)) LoopHeaders.insert(BB); - // Remember if SinglePred was the entry block of the function. If so, we - // will need to move BB back to the entry position. - bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB); - if (isEntry && BB != &BB->getParent()->getEntryBlock()) - BB->moveBefore(&BB->getParent()->getEntryBlock()); return true; } } @@ -692,7 +701,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // Run constant folding to see if we can reduce the condition to a simple // constant. if (Instruction *I = dyn_cast<Instruction>(Condition)) { - Value *SimpleVal = ConstantFoldInstruction(I, TD, TLI); + Value *SimpleVal = ConstantFoldInstruction(I, DL, TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); I->eraseFromParent(); @@ -733,7 +742,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { Instruction *CondInst = dyn_cast<Instruction>(Condition); // All the rest of our checks depend on the condition being an instruction. - if (CondInst == 0) { + if (!CondInst) { // FIXME: Unify this with code below. if (ProcessThreadableEdges(Condition, BB, Preference)) return true; @@ -886,7 +895,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; AvailablePredsTy AvailablePreds; - BasicBlock *OneUnavailablePred = 0; + BasicBlock *OneUnavailablePred = nullptr; // If we got here, the loaded value is transparent through to the start of the // block. Check to see if it is available in any of the predecessor blocks. @@ -900,16 +909,16 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. 
BBIt = PredBB->end(); - MDNode *ThisTBAATag = 0; + MDNode *ThisTBAATag = nullptr; Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, - 0, &ThisTBAATag); + nullptr, &ThisTBAATag); if (!PredAvailable) { OneUnavailablePred = PredBB; continue; } // If tbaa tags disagree or are not present, forget about them. - if (TBAATag != ThisTBAATag) TBAATag = 0; + if (TBAATag != ThisTBAATag) TBAATag = nullptr; // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. @@ -925,7 +934,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // predecessor, we want to insert a merge block for those common predecessors. // This ensures that we only have to insert one reload, thus not increasing // code size. - BasicBlock *UnavailablePred = 0; + BasicBlock *UnavailablePred = nullptr; // If there is exactly one predecessor where the value is unavailable, the // already computed 'OneUnavailablePred' block is it. If it ends in an @@ -992,7 +1001,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { BasicBlock *P = *PI; AvailablePredsTy::iterator I = std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(), - std::make_pair(P, (Value*)0)); + std::make_pair(P, (Value*)nullptr)); assert(I != AvailablePreds.end() && I->first == P && "Didn't find entry for predecessor!"); @@ -1099,7 +1108,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, SmallPtrSet<BasicBlock*, 16> SeenPreds; SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList; - BasicBlock *OnlyDest = 0; + BasicBlock *OnlyDest = nullptr; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { @@ -1116,7 +1125,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, BasicBlock *DestBB; if (isa<UndefValue>(Val)) - DestBB = 0; + DestBB = nullptr; else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero()); else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { @@ -1167,7 +1176,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, // If the threadable edges are branching on an undefined value, we get to pick // the destination that these predecessors should get to. - if (MostPopularDest == 0) + if (!MostPopularDest) MostPopularDest = BB->getTerminator()-> getSuccessor(GetBestDestForJumpOnUndef(BB)); @@ -1269,7 +1278,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { } // Determine which value to split on, true, false, or undef if neither. - ConstantInt *SplitVal = 0; + ConstantInt *SplitVal = nullptr; if (NumTrue > NumFalse) SplitVal = ConstantInt::getTrue(BB->getContext()); else if (NumTrue != 0 || NumFalse != 0) @@ -1290,7 +1299,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // help us. However, we can just replace the LHS or RHS with the constant. if (BlocksToFoldInto.size() == cast<PHINode>(BB->front()).getNumIncomingValues()) { - if (SplitVal == 0) { + if (!SplitVal) { // If all preds provide undef, just nuke the xor, because it is undef too. BO->replaceAllUsesWith(UndefValue::get(BO->getType())); BO->eraseFromParent(); @@ -1431,16 +1440,15 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. 
- for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; - ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(UI) == BB) + if (UserPN->getIncomingBlock(U) == BB) continue; } else if (User->getParent() == BB) continue; - UsesToRename.push_back(&UI.getUse()); + UsesToRename.push_back(&U); } // If there are no uses outside the block, we're done with this instruction. @@ -1475,7 +1483,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. - SimplifyInstructionsInBlock(NewBB, TD, TLI); + SimplifyInstructionsInBlock(NewBB, DL, TLI); // Threaded an edge! ++NumThreads; @@ -1528,7 +1536,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // can just clone the bits from BB into the end of the new PredBB. BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); - if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) { + if (!OldPredBranch || !OldPredBranch->isUnconditional()) { PredBB = SplitEdge(PredBB, BB, this); OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); } @@ -1557,7 +1565,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. - if (Value *IV = SimplifyInstruction(New, TD)) { + if (Value *IV = SimplifyInstruction(New, DL)) { delete New; ValueMapping[BI] = IV; } else { @@ -1585,16 +1593,15 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; - ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(UI) == BB) + if (UserPN->getIncomingBlock(U) == BB) continue; } else if (User->getParent() == BB) continue; - UsesToRename.push_back(&UI.getUse()); + UsesToRename.push_back(&U); } // If there are no uses outside the block, we're done with this instruction. 
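The JumpThreading hunks, like the other passes touched in this revision, add the same two pieces of entry-point boilerplate: an early bail-out for `optnone` functions and an indirect DataLayout lookup through DataLayoutPass. The sketch below is illustrative only; the pass name is hypothetical, while the individual calls are the ones used in the hunks, and any real pass would also register itself and do actual work where the comment indicates.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
// Hypothetical pass showing the updated runOnFunction boilerplate.
struct ExamplePass : public FunctionPass {
  static char ID;
  const DataLayout *DL;

  ExamplePass() : FunctionPass(ID), DL(nullptr) {}

  bool runOnFunction(Function &F) override {
    // Respect the `optnone` attribute, as the updated passes now do.
    if (skipOptnoneFunction(F))
      return false;

    // DataLayout is now reached through DataLayoutPass and may be absent,
    // so every later query must tolerate DL == nullptr.
    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
    DL = DLP ? &DLP->getDataLayout() : nullptr;

    // ... the actual transformation would go here ...
    return false;
  }
};
}

char ExamplePass::ID = 0;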
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index f94cd2a073ef..abcceb20050a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -30,33 +30,37 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "licm" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" -#include "llvm/Support/CFG.h" +#include "llvm/IR/PredIteratorCache.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "licm" + STATISTIC(NumSunk , "Number of instructions sunk out of loop"); STATISTIC(NumHoisted , "Number of instructions hoisted out of loop"); STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk"); @@ -74,26 +78,28 @@ namespace { initializeLICMPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved("scalar-evolution"); - AU.addPreservedID(LoopSimplifyID); + AU.addPreserved<ScalarEvolution>(); AU.addRequired<TargetLibraryInfo>(); } using llvm::Pass::doFinalization; - bool doFinalization() { + bool doFinalization() override { assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets"); return false; } @@ -103,7 +109,7 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. - DataLayout *TD; // DataLayout for constant folding. + const DataLayout *DL; // DataLayout for constant folding. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. // State that is updated as we process loops. @@ -117,11 +123,12 @@ namespace { DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. - void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L); + void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, + Loop *L) override; /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias /// set. 
- void deleteAnalysisValue(Value *V, Loop *L); + void deleteAnalysisValue(Value *V, Loop *L) override; /// SinkRegion - Walk the specified region of the CFG (defined by all blocks /// dominated by the specified block, and that are in the current loop) in @@ -183,15 +190,26 @@ namespace { void PromoteAliasSet(AliasSet &AS, SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts); + SmallVectorImpl<Instruction*> &InsertPts, + PredIteratorCache &PIC); + + /// \brief Create a copy of the instruction in the exit block and patch up + /// SSA. + /// PN is a user of I in ExitBlock that can be used to get the number and + /// list of predecessors fast. + Instruction *CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN); }; } char LICM::ID = 0; INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) @@ -203,16 +221,22 @@ Pass *llvm::createLICMPass() { return new LICM(); } /// times on one loop. /// bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + Changed = false; // Get our Loop and Alias Analysis information... LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); + assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + CurAST = new AliasSetTracker(*AA); // Collect Alias info from subloops. for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end(); @@ -272,19 +296,36 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. - if (!DisablePromotion && Preheader && L->hasDedicatedExits()) { + if (!DisablePromotion && (Preheader || L->hasDedicatedExits())) { SmallVector<BasicBlock *, 8> ExitBlocks; SmallVector<Instruction *, 8> InsertPts; + PredIteratorCache PIC; // Loop over all of the alias sets in the tracker object. for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); I != E; ++I) - PromoteAliasSet(*I, ExitBlocks, InsertPts); + PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC); + + // Once we have promoted values across the loop body we have to recursively + // reform LCSSA as any nested loop may now have values defined within the + // loop used in the outer loop. + // FIXME: This is really heavy handed. It would be a bit better to use an + // SSAUpdater strategy during promotion that was LCSSA aware and reformed + // it as it went. + if (Changed) + formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>()); } + // Check that neither this loop nor its parent have had LCSSA broken. LICM is + // specifically moving instructions across the loop boundary and so it is + // especially in need of sanity checking here. 
+ assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!"); + assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) && + "Parent loop not left in LCSSA form after LICM!"); + // Clear out loops state information for the next iteration - CurLoop = 0; - Preheader = 0; + CurLoop = nullptr; + Preheader = nullptr; // If this loop is nested inside of another one, save the alias information // for when we process the outer loop. @@ -302,7 +343,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { /// iteration. /// void LICM::SinkRegion(DomTreeNode *N) { - assert(N != 0 && "Null dominator tree node?"); + assert(N != nullptr && "Null dominator tree node?"); BasicBlock *BB = N->getBlock(); // If this subregion is not in the top level loop at all, exit. @@ -349,7 +390,7 @@ void LICM::SinkRegion(DomTreeNode *N) { /// before uses, allowing us to hoist a loop body in one pass without iteration. /// void LICM::HoistRegion(DomTreeNode *N) { - assert(N != 0 && "Null dominator tree node?"); + assert(N != nullptr && "Null dominator tree node?"); BasicBlock *BB = N->getBlock(); // If this subregion is not in the top level loop at all, exit. @@ -364,7 +405,7 @@ void LICM::HoistRegion(DomTreeNode *N) { // Try constant folding this instruction. If all the operands are // constants, it is technically hoistable, but it would be better to just // fold it. - if (Constant *C = ConstantFoldInstruction(&I, TD, TLI)) { + if (Constant *C = ConstantFoldInstruction(&I, DL, TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); CurAST->deleteValue(&I); @@ -450,26 +491,82 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { return isSafeToExecuteUnconditionally(I); } +/// \brief Returns true if a PHINode is a trivially replaceable with an +/// Instruction. +/// +/// This is true when all incoming values are that instruction. This pattern +/// occurs most often with LCSSA PHI nodes. +static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) { + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + if (PN.getIncomingValue(i) != &I) + return false; + + return true; +} + /// isNotUsedInLoop - Return true if the only users of this instruction are /// outside of the loop. If this is true, we can sink the instruction to the /// exit blocks of the loop. /// bool LICM::isNotUsedInLoop(Instruction &I) { - for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (PHINode *PN = dyn_cast<PHINode>(User)) { - // PHI node uses occur in predecessor blocks! + for (User *U : I.users()) { + Instruction *UI = cast<Instruction>(U); + if (PHINode *PN = dyn_cast<PHINode>(UI)) { + // A PHI node where all of the incoming values are this instruction are + // special -- they can just be RAUW'ed with the instruction and thus + // don't require a use in the predecessor. This is a particular important + // special case because it is the pattern found in LCSSA form. + if (isTriviallyReplacablePHI(*PN, I)) { + if (CurLoop->contains(PN)) + return false; + else + continue; + } + + // Otherwise, PHI node uses occur in predecessor blocks if the incoming + // values. Check for such a use being inside the loop. 
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == &I) if (CurLoop->contains(PN->getIncomingBlock(i))) return false; - } else if (CurLoop->contains(User)) { - return false; + + continue; } + + if (CurLoop->contains(UI)) + return false; } return true; } +Instruction *LICM::CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN) { + Instruction *New = I.clone(); + ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New); + if (!I.getName().empty()) New->setName(I.getName() + ".le"); + + // Build LCSSA PHI nodes for any in-loop operands. Note that this is + // particularly cheap because we can rip off the PHI node that we're + // replacing for the number and blocks of the predecessors. + // OPT: If this shows up in a profile, we can instead finish sinking all + // invariant instructions, and then walk their operands to re-establish + // LCSSA. That will eliminate creating PHI nodes just to nuke them when + // sinking bottom-up. + for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE; + ++OI) + if (Instruction *OInst = dyn_cast<Instruction>(*OI)) + if (Loop *OLoop = LI->getLoopFor(OInst->getParent())) + if (!OLoop->contains(&PN)) { + PHINode *OpPN = + PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), + OInst->getName() + ".lcssa", ExitBlock.begin()); + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); + *OI = OpPN; + } + return New; +} /// sink - When an instruction is found to only be used outside of the loop, /// this function moves it to the exit blocks and patches up SSA form as needed. @@ -479,119 +576,45 @@ bool LICM::isNotUsedInLoop(Instruction &I) { void LICM::sink(Instruction &I) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); - if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumSunk; Changed = true; - // The case where there is only a single exit node of this loop is common - // enough that we handle it as a special (more efficient) case. It is more - // efficient to handle because there are no PHI nodes that need to be placed. - if (ExitBlocks.size() == 1) { - if (!DT->dominates(I.getParent(), ExitBlocks[0])) { - // Instruction is not used, just delete it. - CurAST->deleteValue(&I); - // If I has users in unreachable blocks, eliminate. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. - if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); - I.eraseFromParent(); - } else { - // Move the instruction to the start of the exit block, after any PHI - // nodes in it. - I.moveBefore(ExitBlocks[0]->getFirstInsertionPt()); - - // This instruction is no longer in the AST for the current loop, because - // we just sunk it out of the loop. If we just sunk it into an outer - // loop, we will rediscover the operation when we process it. - CurAST->deleteValue(&I); - } - return; - } - - if (ExitBlocks.empty()) { - // The instruction is actually dead if there ARE NO exit blocks. - CurAST->deleteValue(&I); - // If I has users in unreachable blocks, eliminate. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. 
- if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); - I.eraseFromParent(); - return; - } - - // Otherwise, if we have multiple exits, use the SSAUpdater to do all of the - // hard work of inserting PHI nodes as necessary. - SmallVector<PHINode*, 8> NewPHIs; - SSAUpdater SSA(&NewPHIs); - - if (!I.use_empty()) - SSA.Initialize(I.getType(), I.getName()); - - // Insert a copy of the instruction in each exit block of the loop that is - // dominated by the instruction. Each exit block is known to only be in the - // ExitBlocks list once. - BasicBlock *InstOrigBB = I.getParent(); - unsigned NumInserted = 0; +#ifndef NDEBUG + SmallVector<BasicBlock *, 32> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); +#endif - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { - BasicBlock *ExitBlock = ExitBlocks[i]; + // Clones of this instruction. Don't create more than one per exit block! + SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies; - if (!DT->dominates(InstOrigBB, ExitBlock)) - continue; + // If this instruction is only used outside of the loop, then all users are + // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of + // the instruction. + while (!I.use_empty()) { + // The user must be a PHI node. + PHINode *PN = cast<PHINode>(I.user_back()); - // Insert the code after the last PHI node. - BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); + BasicBlock *ExitBlock = PN->getParent(); + assert(ExitBlockSet.count(ExitBlock) && + "The LCSSA PHI is not in an exit block!"); - // If this is the first exit block processed, just move the original - // instruction, otherwise clone the original instruction and insert - // the copy. Instruction *New; - if (NumInserted++ == 0) { - I.moveBefore(InsertPt); - New = &I; - } else { - New = I.clone(); - if (!I.getName().empty()) - New->setName(I.getName()+".le"); - ExitBlock->getInstList().insert(InsertPt, New); - } - - // Now that we have inserted the instruction, inform SSAUpdater. - if (!I.use_empty()) - SSA.AddAvailableValue(ExitBlock, New); - } - - // If the instruction doesn't dominate any exit blocks, it must be dead. - if (NumInserted == 0) { - CurAST->deleteValue(&I); - if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); - I.eraseFromParent(); - return; - } - - // Next, rewrite uses of the instruction, inserting PHI nodes as needed. - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ) { - // Grab the use before incrementing the iterator. - Use &U = UI.getUse(); - // Increment the iterator before removing the use from the list. - ++UI; - SSA.RewriteUseAfterInsertions(U); + auto It = SunkCopies.find(ExitBlock); + if (It != SunkCopies.end()) + New = It->second; + else + New = SunkCopies[ExitBlock] = + CloneInstructionInExitBlock(I, *ExitBlock, *PN); + + PN->replaceAllUsesWith(New); + PN->eraseFromParent(); } - // Update CurAST for NewPHIs if I had pointer type. - if (I.getType()->isPointerTy()) - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) - CurAST->copyValue(&I, NewPHIs[i]); - - // Finally, remove the instruction from CurAST. It is no longer in the loop. 
CurAST->deleteValue(&I); + I.eraseFromParent(); } /// hoist - When an instruction is found to only use loop invariant operands @@ -616,7 +639,7 @@ void LICM::hoist(Instruction &I) { /// bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { // If it is not a trapping instruction, it is always safe to hoist. - if (isSafeToSpeculativelyExecute(&Inst)) + if (isSafeToSpeculativelyExecute(&Inst, DL)) return true; return isGuaranteedToExecute(Inst); @@ -662,24 +685,42 @@ namespace { SmallPtrSet<Value*, 4> &PointerMustAliases; SmallVectorImpl<BasicBlock*> &LoopExitBlocks; SmallVectorImpl<Instruction*> &LoopInsertPts; + PredIteratorCache &PredCache; AliasSetTracker &AST; + LoopInfo &LI; DebugLoc DL; int Alignment; MDNode *TBAATag; + + Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (Loop *L = LI.getLoopFor(I->getParent())) + if (!L->contains(BB)) { + // We need to create an LCSSA PHI node for the incoming value and + // store that. + PHINode *PN = PHINode::Create( + I->getType(), PredCache.GetNumPreds(BB), + I->getName() + ".lcssa", BB->begin()); + for (BasicBlock **PI = PredCache.GetPreds(BB); *PI; ++PI) + PN->addIncoming(I, *PI); + return PN; + } + return V; + } + public: - LoopPromoter(Value *SP, - const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, - SmallPtrSet<Value*, 4> &PMA, - SmallVectorImpl<BasicBlock*> &LEB, - SmallVectorImpl<Instruction*> &LIP, - AliasSetTracker &ast, DebugLoc dl, int alignment, + LoopPromoter(Value *SP, const SmallVectorImpl<Instruction *> &Insts, + SSAUpdater &S, SmallPtrSet<Value *, 4> &PMA, + SmallVectorImpl<BasicBlock *> &LEB, + SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, + AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, MDNode *TBAATag) - : LoadAndStorePromoter(Insts, S), SomePtr(SP), - PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), - AST(ast), DL(dl), Alignment(alignment), TBAATag(TBAATag) {} + : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), + LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), + LI(li), DL(dl), Alignment(alignment), TBAATag(TBAATag) {} - virtual bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &) const { + bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &) const override { Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) Ptr = LI->getOperand(0); @@ -688,7 +729,7 @@ namespace { return PointerMustAliases.count(Ptr); } - virtual void doExtraRewritesBeforeFinalDeletion() const { + void doExtraRewritesBeforeFinalDeletion() const override { // Insert stores after in the loop exit blocks. Each exit block gets a // store of the live-out values that feed them. 
Since we've already told // the SSA updater about the defs in the loop and the preheader @@ -696,19 +737,21 @@ namespace { for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock); + Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock); Instruction *InsertPos = LoopInsertPts[i]; - StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); + StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag); } } - virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const { + void replaceLoadWithValue(LoadInst *LI, Value *V) const override { // Update alias analysis. AST.copyValue(LI, V); } - virtual void instructionDeleted(Instruction *I) const { + void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); } }; @@ -721,7 +764,8 @@ namespace { /// void LICM::PromoteAliasSet(AliasSet &AS, SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts) { + SmallVectorImpl<Instruction*> &InsertPts, + PredIteratorCache &PIC) { // We can promote this alias set if it has a store, if it is a "Must" alias // set, if the pointer is loop invariant, and if we are not eliminating any // volatile loads or stores. @@ -754,7 +798,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; - MDNode *TBAATag = 0; + MDNode *TBAATag = nullptr; // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in @@ -769,23 +813,22 @@ void LICM::PromoteAliasSet(AliasSet &AS, if (SomePtr->getType() != ASIV->getType()) return; - for (Value::use_iterator UI = ASIV->use_begin(), UE = ASIV->use_end(); - UI != UE; ++UI) { + for (User *U : ASIV->users()) { // Ignore instructions that are outside the loop. - Instruction *Use = dyn_cast<Instruction>(*UI); - if (!Use || !CurLoop->contains(Use)) + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI || !CurLoop->contains(UI)) continue; // If there is an non-load/store instruction in the loop, we can't promote // it. - if (LoadInst *load = dyn_cast<LoadInst>(Use)) { + if (LoadInst *load = dyn_cast<LoadInst>(UI)) { assert(!load->isVolatile() && "AST broken"); if (!load->isSimple()) return; - } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) { + } else if (StoreInst *store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. - if (Use->getOperand(1) != ASIV) + if (UI->getOperand(1) != ASIV) continue; assert(!store->isVolatile() && "AST broken"); if (!store->isSimple()) @@ -801,13 +844,13 @@ void LICM::PromoteAliasSet(AliasSet &AS, // Larger is better, with the exception of 0 being the best alignment. unsigned InstAlignment = store->getAlignment(); if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) - if (isGuaranteedToExecute(*Use)) { + if (isGuaranteedToExecute(*UI)) { GuaranteedToExecute = true; Alignment = InstAlignment; } if (!GuaranteedToExecute) - GuaranteedToExecute = isGuaranteedToExecute(*Use); + GuaranteedToExecute = isGuaranteedToExecute(*UI); } else return; // Not a load or store. 
@@ -815,13 +858,13 @@ void LICM::PromoteAliasSet(AliasSet &AS, // Merge the TBAA tags. if (LoopUses.empty()) { // On the first load/store, just take its TBAA tag. - TBAATag = Use->getMetadata(LLVMContext::MD_tbaa); + TBAATag = UI->getMetadata(LLVMContext::MD_tbaa); } else if (TBAATag) { TBAATag = MDNode::getMostGenericTBAA(TBAATag, - Use->getMetadata(LLVMContext::MD_tbaa)); + UI->getMetadata(LLVMContext::MD_tbaa)); } - - LoopUses.push_back(Use); + + LoopUses.push_back(UI); } } @@ -853,7 +896,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - InsertPts, *CurAST, DL, Alignment, TBAATag); + InsertPts, PIC, *CurAST, *LI, DL, Alignment, TBAATag); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp new file mode 100644 index 000000000000..846aa703c9c3 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -0,0 +1,268 @@ +//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation combines adjacent loads. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Pass.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "load-combine" + +STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); +STATISTIC(NumLoadsCombined, "Number of loads combined"); + +namespace { +struct PointerOffsetPair { + Value *Pointer; + uint64_t Offset; +}; + +struct LoadPOPPair { + LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O) + : Load(L), POP(P), InsertOrder(O) {} + LoadPOPPair() {} + LoadInst *Load; + PointerOffsetPair POP; + /// \brief The new load needs to be created before the first load in IR order. 
+ unsigned InsertOrder; +}; + +class LoadCombine : public BasicBlockPass { + LLVMContext *C; + const DataLayout *DL; + +public: + LoadCombine() + : BasicBlockPass(ID), + C(nullptr), DL(nullptr) { + initializeSROAPass(*PassRegistry::getPassRegistry()); + } + bool doInitialization(Function &) override; + bool runOnBasicBlock(BasicBlock &BB) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + const char *getPassName() const override { return "LoadCombine"; } + static char ID; + + typedef IRBuilder<true, TargetFolder> BuilderTy; + +private: + BuilderTy *Builder; + + PointerOffsetPair getPointerOffsetPair(LoadInst &); + bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &); + bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &); + bool combineLoads(SmallVectorImpl<LoadPOPPair> &); +}; +} + +bool LoadCombine::doInitialization(Function &F) { + DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); + C = &F.getContext(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) { + DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n"); + return false; + } + DL = &DLP->getDataLayout(); + return true; +} + +PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { + PointerOffsetPair POP; + POP.Pointer = LI.getPointerOperand(); + POP.Offset = 0; + while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { + if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { + unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType()); + APInt Offset(BitWidth, 0); + if (GEP->accumulateConstantOffset(*DL, Offset)) + POP.Offset += Offset.getZExtValue(); + else + // Can't handle GEPs with variable indices. + return POP; + POP.Pointer = GEP->getPointerOperand(); + } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) + POP.Pointer = BC->getOperand(0); + } + return POP; +} + +bool LoadCombine::combineLoads( + DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) { + bool Combined = false; + for (auto &Loads : LoadMap) { + if (Loads.second.size() < 2) + continue; + std::sort(Loads.second.begin(), Loads.second.end(), + [](const LoadPOPPair &A, const LoadPOPPair &B) { + return A.POP.Offset < B.POP.Offset; + }); + if (aggregateLoads(Loads.second)) + Combined = true; + } + return Combined; +} + +/// \brief Try to aggregate loads from a sorted list of loads to be combined. +/// +/// It is guaranteed that no writes occur between any of the loads. All loads +/// have the same base pointer. There are at least two loads. +bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { + assert(Loads.size() >= 2 && "Insufficient loads!"); + LoadInst *BaseLoad = nullptr; + SmallVector<LoadPOPPair, 8> AggregateLoads; + bool Combined = false; + uint64_t PrevOffset = -1ull; + uint64_t PrevSize = 0; + for (auto &L : Loads) { + if (PrevOffset == -1ull) { + BaseLoad = L.Load; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + continue; + } + if (L.Load->getAlignment() > BaseLoad->getAlignment()) + continue; + if (L.POP.Offset > PrevOffset + PrevSize) { + // No other load will be combinable + if (combineLoads(AggregateLoads)) + Combined = true; + AggregateLoads.clear(); + PrevOffset = -1; + continue; + } + if (L.POP.Offset != PrevOffset + PrevSize) + // This load is offset less than the size of the last load. + // FIXME: We may want to handle this case. 
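// What the new LoadCombine pass buys, sketched outside of IR. Illustrative and
// separate from the patch; the byte layout shown assumes a little-endian
// target, and the function names are invented.
#include <cstdint>
#include <cstring>

// Two adjacent 4-byte loads from the same base pointer...
void loadSeparately(const unsigned char *Buf, uint32_t &A, uint32_t &B) {
  std::memcpy(&A, Buf, 4);        // load at offset 0
  std::memcpy(&B, Buf + 4, 4);    // load at offset 4
}
// ...are equivalent to one 8-byte load plus two extracts, which corresponds to
// the ".combined" load and "combine.extract" values created further down.
void loadCombined(const unsigned char *Buf, uint32_t &A, uint32_t &B) {
  uint64_t Wide;
  std::memcpy(&Wide, Buf, 8);             // the single wide load
  A = static_cast<uint32_t>(Wide);        // bits [0, 32)
  B = static_cast<uint32_t>(Wide >> 32);  // bits [32, 64)
}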
+ continue; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + } + if (combineLoads(AggregateLoads)) + Combined = true; + return Combined; +} + +/// \brief Given a list of combinable load. Combine the maximum number of them. +bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { + // Remove loads from the end while the size is not a power of 2. + unsigned TotalSize = 0; + for (const auto &L : Loads) + TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); + while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) + TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); + if (Loads.size() < 2) + return false; + + DEBUG({ + dbgs() << "***** Combining Loads ******\n"; + for (const auto &L : Loads) { + dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; + } + }); + + // Find first load. This is where we put the new load. + LoadPOPPair FirstLP; + FirstLP.InsertOrder = -1u; + for (const auto &L : Loads) + if (L.InsertOrder < FirstLP.InsertOrder) + FirstLP = L; + + unsigned AddressSpace = + FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); + + Builder->SetInsertPoint(FirstLP.Load); + Value *Ptr = Builder->CreateConstGEP1_64( + Builder->CreatePointerCast(Loads[0].POP.Pointer, + Builder->getInt8PtrTy(AddressSpace)), + Loads[0].POP.Offset); + LoadInst *NewLoad = new LoadInst( + Builder->CreatePointerCast( + Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), + Ptr->getType()->getPointerAddressSpace())), + Twine(Loads[0].Load->getName()) + ".combined", false, + Loads[0].Load->getAlignment(), FirstLP.Load); + + for (const auto &L : Loads) { + Builder->SetInsertPoint(L.Load); + Value *V = Builder->CreateExtractInteger( + *DL, NewLoad, cast<IntegerType>(L.Load->getType()), + L.POP.Offset - Loads[0].POP.Offset, "combine.extract"); + L.Load->replaceAllUsesWith(V); + } + + NumLoadsCombined = NumLoadsCombined + Loads.size(); + return true; +} + +bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { + if (skipOptnoneFunction(BB) || !DL) + return false; + + IRBuilder<true, TargetFolder> + TheBuilder(BB.getContext(), TargetFolder(DL)); + Builder = &TheBuilder; + + DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; + + bool Combined = false; + unsigned Index = 0; + for (auto &I : BB) { + if (I.mayWriteToMemory() || I.mayThrow()) { + if (combineLoads(LoadMap)) + Combined = true; + LoadMap.clear(); + continue; + } + LoadInst *LI = dyn_cast<LoadInst>(&I); + if (!LI) + continue; + ++NumLoadsAnalyzed; + if (!LI->isSimple() || !LI->getType()->isIntegerTy()) + continue; + auto POP = getPointerOffsetPair(*LI); + if (!POP.Pointer) + continue; + LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++)); + } + if (combineLoads(LoadMap)) + Combined = true; + return Combined; +} + +void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +char LoadCombine::ID = 0; + +BasicBlockPass *llvm::createLoadCombinePass() { + return new LoadCombine(); +} + +INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false, + false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 9e39d2ee84f0..5ab686aa831a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -14,15 +14,16 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-delete" #include 
"llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/Dominators.h" using namespace llvm; +#define DEBUG_TYPE "loop-delete" + STATISTIC(NumDeleted, "Number of loops deleted"); namespace { @@ -34,17 +35,17 @@ namespace { } // Possibly eliminate loop L if it is dead. - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfo>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); @@ -61,7 +62,7 @@ namespace { char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -130,6 +131,9 @@ bool LoopDeletion::isLoopDead(Loop *L, /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + // We can only remove the loop if there is a preheader that we can // branch from after removing it. BasicBlock *preheader = L->getLoopPreheader(); @@ -202,7 +206,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. 
- DominatorTree &DT = getAnalysis<DominatorTree>(); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SmallVector<DomTreeNode*, 8> ChildNodes; for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 952b76b822cf..a12f5a7a0334 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -41,7 +41,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -51,6 +50,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -60,6 +60,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "loop-idiom" + STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); @@ -78,9 +80,6 @@ namespace { return dyn_cast<BranchInst>(BB->getTerminator()); } - /// Return the condition of the branch terminating the given basic block. - static Value *getBrCondtion(BasicBlock *); - /// Derive the precondition block (i.e the block that guards the loop /// preheader) from the given preheader. static BasicBlock *getPrecondBb(BasicBlock *PreHead); @@ -108,22 +107,22 @@ namespace { bool preliminaryScreen(); /// Check if the given conditional branch is based on the comparison - /// beween a variable and zero, and if the variable is non-zero, the - /// control yeilds to the loop entry. If the branch matches the behavior, + /// between a variable and zero, and if the variable is non-zero, the + /// control yields to the loop entry. If the branch matches the behavior, /// the variable involved in the comparion is returned. This function will /// be called to see if the precondition and postcondition of the loop /// are in desirable form. - Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; + Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; /// Return true iff the idiom is detected in the loop. and 1) \p CntInst - /// is set to the instruction counting the pupulation bit. 2) \p CntPhi + /// is set to the instruction counting the population bit. 2) \p CntPhi /// is set to the corresponding phi node. 3) \p Var is set to the value /// whose population bits are being counted. bool detectIdiom (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var); + void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); /// Create llvm.ctpop.* intrinsic function. 
CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); @@ -131,7 +130,7 @@ namespace { class LoopIdiomRecognize : public LoopPass { Loop *CurLoop; - const DataLayout *TD; + const DataLayout *DL; DominatorTree *DT; ScalarEvolution *SE; TargetLibraryInfo *TLI; @@ -140,10 +139,10 @@ namespace { static char ID; explicit LoopIdiomRecognize() : LoopPass(ID) { initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0; + DL = nullptr; DT = nullptr; SE = nullptr; TLI = nullptr; TTI = nullptr; } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks); @@ -163,7 +162,7 @@ namespace { /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -174,18 +173,23 @@ namespace { AU.addPreserved<AliasAnalysis>(); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTree>(); - AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); AU.addRequired<TargetTransformInfo>(); } const DataLayout *getDataLayout() { - return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>(); + if (DL) + return DL; + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + return DL; } DominatorTree *getDominatorTree() { - return DT ? DT : (DT=&getAnalysis<DominatorTree>()); + return DT ? DT + : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); } ScalarEvolution *getScalarEvolution() { @@ -212,7 +216,7 @@ char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) @@ -244,7 +248,7 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, 0); + DeadInst->setOperand(op, nullptr); // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; @@ -286,17 +290,12 @@ bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { return false; } -Value *LIRUtil::getBrCondtion(BasicBlock *BB) { - BranchInst *Br = getBranch(BB); - return Br ? Br->getCondition() : 0; -} - BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { if (BasicBlock *BB = PreHead->getSinglePredecessor()) { BranchInst *Br = getBranch(BB); - return Br && Br->isConditional() ? BB : 0; + return Br && Br->isConditional() ? 
BB : nullptr; } - return 0; + return nullptr; } //===----------------------------------------------------------------------===// @@ -306,7 +305,7 @@ BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { //===----------------------------------------------------------------------===// NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): - LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) { + LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { } bool NclPopcountRecognize::preliminaryScreen() { @@ -343,25 +342,25 @@ bool NclPopcountRecognize::preliminaryScreen() { return true; } -Value *NclPopcountRecognize::matchCondition (BranchInst *Br, - BasicBlock *LoopEntry) const { +Value *NclPopcountRecognize::matchCondition(BranchInst *Br, + BasicBlock *LoopEntry) const { if (!Br || !Br->isConditional()) - return 0; + return nullptr; ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); if (!Cond) - return 0; + return nullptr; ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); if (!CmpZero || !CmpZero->isZero()) - return 0; + return nullptr; ICmpInst::Predicate Pred = Cond->getPredicate(); if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) return Cond->getOperand(0); - return 0; + return nullptr; } bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, @@ -392,9 +391,9 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, Value *VarX1, *VarX0; PHINode *PhiX, *CountPhi; - DefX2 = CountInst = 0; - VarX1 = VarX0 = 0; - PhiX = CountPhi = 0; + DefX2 = CountInst = nullptr; + VarX1 = VarX0 = nullptr; + PhiX = CountPhi = nullptr; LoopEntry = *(CurLoop->block_begin()); // step 1: Check if the loop-back branch is in desirable form. @@ -441,7 +440,7 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { - CountInst = NULL; + CountInst = nullptr; for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), IterE = LoopEntry->end(); Iter != IterE; Iter++) { Instruction *Inst = Iter; @@ -458,9 +457,8 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, // Check if the result of the instruction is live of the loop. bool LiveOutLoop = false; - for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); - I != E; I++) { - if ((cast<Instruction>(*I))->getParent() != LoopEntry) { + for (User *U : Inst->users()) { + if ((cast<Instruction>(U))->getParent() != LoopEntry) { LiveOutLoop = true; break; } } @@ -519,7 +517,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // TripCnt is exactly the number of iterations the loop has TripCnt = NewCount; - // If the popoulation counter's initial value is not zero, insert Add Inst. + // If the population counter's initial value is not zero, insert Add Inst. Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); if (!InitConst || !InitConst->isZero()) { @@ -596,11 +594,9 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // __builtin_ctpop(). 
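// For context, the loop shape NclPopcountRecognize matches is the classic
// "clear the lowest set bit" population count. A source-level sketch of the
// idiom, illustrative only (the function name is invented):
int popcountIdiom(unsigned long long X) {
  int Cnt = 0;
  while (X) {
    X &= X - 1;   // clears the lowest set bit, so the loop runs once per set bit
    ++Cnt;
  }
  return Cnt;
}
// After the transform, the count is computed up front via the llvm.ctpop.*
// intrinsic (the IR-level equivalent of __builtin_popcountll), and the
// per-iteration counter updates are left behind as dead code for later cleanup.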
{ SmallVector<Value *, 4> CntUses; - for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end(); - I != E; I++) { - if (cast<Instruction>(*I)->getParent() != Body) - CntUses.push_back(*I); - } + for (User *U : CntInst->users()) + if (cast<Instruction>(U)->getParent() != Body) + CntUses.push_back(U); for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); } @@ -705,6 +701,9 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { } bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + CurLoop = L; // If the loop could not be converted to canonical form, it must have an @@ -746,7 +745,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, // If processing the store invalidated our iterator, start over from the // top of the block. - if (InstPtr == 0) + if (!InstPtr) I = BB->begin(); continue; } @@ -759,7 +758,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, // If processing the memset invalidated our iterator, start over from the // top of the block. - if (InstPtr == 0) + if (!InstPtr) I = BB->begin(); continue; } @@ -777,7 +776,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { Value *StorePtr = SI->getPointerOperand(); // Reject stores that are so large that they overflow an unsigned. - uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType()); + uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) return false; @@ -786,7 +785,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { // random store we can't handle. const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) return false; // Check to see if the stride matches the size of the store. If so, then we @@ -794,7 +793,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { unsigned StoreSize = (unsigned)SizeInBits >> 3; const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); - if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) { + if (!Stride || StoreSize != Stride->getValue()->getValue()) { // TODO: Could also handle negative stride here someday, that will require // the validity check in mayLoopAccessLocation to be updated though. // Enable this to print exact negative strides. @@ -843,7 +842,7 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { // loop, which indicates a strided store. If we have something else, it's a // random store we can't handle. const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); - if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine()) + if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) return false; // Reject memsets that are so large that they overflow an unsigned. @@ -857,7 +856,7 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { // TODO: Could also handle negative stride here someday, that will require the // validity check in mayLoopAccessLocation to be updated though. 
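// The strided-store handling in this hunk (continued below) is the classic
// memset idiom: a loop storing the same byte value at a unit stride collapses
// into a single library call. Source-level sketch, illustrative only; the
// names are invented and the real checks on stride, aliasing and trip count
// are elided.
#include <cstring>

void zeroLoop(unsigned char *P, unsigned long long N) {
  for (unsigned long long I = 0; I != N; ++I)
    P[I] = 0;                    // store of the same byte, stride 1
}
// ...is rewritten to the equivalent of:
void zeroMemset(unsigned char *P, unsigned long long N) {
  if (N)
    std::memset(P, 0, N);        // # bytes = (backedge-taken count + 1) * store size
}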
- if (Stride == 0 || MSI->getLength() != Stride->getValue()) + if (!Stride || MSI->getLength() != Stride->getValue()) return false; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, @@ -905,28 +904,28 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, /// /// Note that we don't ever attempt to use memset_pattern8 or 4, because these /// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) { +static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { // If the value isn't a constant, we can't promote it to being in a constant // array. We could theoretically do a store to an alloca or something, but // that doesn't seem worthwhile. Constant *C = dyn_cast<Constant>(V); - if (C == 0) return 0; + if (!C) return nullptr; // Only handle simple values that are a power of two bytes in size. - uint64_t Size = TD.getTypeSizeInBits(V->getType()); + uint64_t Size = DL.getTypeSizeInBits(V->getType()); if (Size == 0 || (Size & 7) || (Size & (Size-1))) - return 0; + return nullptr; // Don't care enough about darwin/ppc to implement this. - if (TD.isBigEndian()) - return 0; + if (DL.isBigEndian()) + return nullptr; // Convert to size in bytes. Size /= 8; // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see // if the top and bottom are the same (e.g. for vectors and large integers). - if (Size > 16) return 0; + if (Size > 16) return nullptr; // If the constant is exactly 16 bytes, just use it. if (Size == 16) return C; @@ -951,7 +950,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // are stored. A store of i32 0x01020304 can never be turned into a memset, // but it can be turned into memset_pattern if the target supports it. Value *SplatValue = isBytewiseValue(StoredVal); - Constant *PatternValue = 0; + Constant *PatternValue = nullptr; unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); @@ -962,13 +961,13 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // promote the memset. CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. - PatternValue = 0; + PatternValue = nullptr; } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) && - (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + (PatternValue = getMemSetPatternValue(StoredVal, *DL))) { // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! - SplatValue = 0; + SplatValue = nullptr; } else { // Otherwise, this isn't an idiom we can transform. For example, we can't // do anything with a 3-byte store. @@ -1006,7 +1005,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = Builder.getIntPtrTy(TD, DestAS); + Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), @@ -1035,7 +1034,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Int8PtrTy, Int8PtrTy, IntPtr, - (void*)0); + (void*)nullptr); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. @@ -1120,7 +1119,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. 
Expand the trip count out to // pointer size if it isn't already. - Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace()); + Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index a23860aad80e..ab1a9393c526 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -11,21 +11,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-instsimplify" #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/Dominators.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "loop-instsimplify" + STATISTIC(NumSimplified, "Number of redundant instructions simplified"); namespace { @@ -36,9 +37,9 @@ namespace { initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop*, LPPassManager&); + bool runOnLoop(Loop*, LPPassManager&) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -54,7 +55,7 @@ char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", @@ -65,9 +66,15 @@ Pass *llvm::createLoopInstSimplifyPass() { } bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { - DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); + if (skipOptnoneFunction(L)) + return false; + + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; LoopInfo *LI = &getAnalysis<LoopInfo>(); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallVector<BasicBlock*, 8> ExitBlocks; @@ -109,19 +116,26 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, TD, TLI, DT); + Value *V = SimplifyInstruction(I, DL, TLI, DT); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. 
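// The LoopInstSimplify hunk that continues just below fixes an iterator
// invalidation bug: RecursivelyDeleteTriviallyDeadInstructions may erase more
// instructions than the one currently being visited, so merely pre-advancing
// the block iterator is not enough. The pattern it adopts, shown in isolation
// (illustrative sketch; BB and TLI are assumed to be in scope):
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
  Instruction *I = &*BI;
  ++BI;                                               // step past I first
  if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
    // The call may also have erased the instruction BI now refers to, so
    // restart the walk of this block rather than trust the iterator.
    BI = BB->begin();
    BE = BB->end();
  }
}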
- for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) - Next->insert(cast<Instruction>(*UI)); + for (User *U : I->users()) + Next->insert(cast<Instruction>(U)); I->replaceAllUsesWith(V); LocalChanged = true; ++NumSimplified; } } - LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + if (res) { + // RecursivelyDeleteTriviallyDeadInstruction can remove + // more than one instruction, so simply incrementing the + // iterator does not work. When instructions get deleted + // re-iterate instead. + BI = BB->begin(); BE = BB->end(); + LocalChanged |= res; + } if (IsSubloopHeader && !isa<PHINode>(I)) break; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 643bc78f6e58..b6fbb16166dd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -11,11 +11,10 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-reroll" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/LoopPass.h" @@ -24,6 +23,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -35,6 +35,8 @@ using namespace llvm; +#define DEBUG_TYPE "loop-reroll" + STATISTIC(NumRerolledLoops, "Number of rerolled loops"); static cl::opt<unsigned> @@ -124,14 +126,14 @@ namespace { initializeLoopRerollPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AliasAnalysis>(); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); - AU.addRequired<DominatorTree>(); - AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<TargetLibraryInfo>(); } @@ -140,7 +142,7 @@ protected: AliasAnalysis *AA; LoopInfo *LI; ScalarEvolution *SE; - DataLayout *DL; + const DataLayout *DL; TargetLibraryInfo *TLI; DominatorTree *DT; @@ -189,12 +191,12 @@ protected: iterator begin() { assert(Valid && "Using invalid reduction"); - return llvm::next(Instructions.begin()); + return std::next(Instructions.begin()); } const_iterator begin() const { assert(Valid && "Using invalid reduction"); - return llvm::next(Instructions.begin()); + return std::next(Instructions.begin()); } iterator end() { return Instructions.end(); } @@ -340,7 +342,7 @@ char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -353,12 
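// For orientation, LoopReroll (whose use-list walking is being modernized in
// this hunk) turns a manually unrolled loop back into its rolled form so that
// later passes see a smaller body. Source-level sketch, illustrative only;
// the array names are invented.
void unrolledByHand(int *A, const int *B, int C) {
  for (int I = 0; I < 200; I += 2) {   // before: unrolled by two
    A[I]     += B[I]     * C;
    A[I + 1] += B[I + 1] * C;
  }
}
void afterReroll(int *A, const int *B, int C) {
  for (int I = 0; I < 200; ++I)        // after: rerolled, half the body
    A[I] += B[I] * C;
}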
+355,9 @@ Pass *llvm::createLoopRerollPass() { // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in // non-loop blocks to be outside the loop. static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (Value::use_iterator UI = I->use_begin(), - UIE = I->use_end(); UI != UIE; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (!L->contains(User)) + for (User *U : I->users()) + if (!L->contains(cast<Instruction>(U))) return true; - } return false; } @@ -408,7 +407,7 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { Instruction *C = Instructions.front(); do { - C = cast<Instruction>(*C->use_begin()); + C = cast<Instruction>(*C->user_begin()); if (C->hasOneUse()) { if (!C->isBinaryOp()) return; @@ -423,17 +422,15 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { if (Instructions.size() < 2 || !C->isSameOperationAs(Instructions.back()) || - C->use_begin() == C->use_end()) + C->use_empty()) return; // C is now the (potential) last instruction in the reduction chain. - for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end(); - UI != UIE; ++UI) { + for (User *U : C->users()) // The only in-loop user can be the initial PHI. - if (L->contains(cast<Instruction>(*UI))) - if (cast<Instruction>(*UI ) != Instructions.front()) + if (L->contains(cast<Instruction>(U))) + if (cast<Instruction>(U) != Instructions.front()) return; - } Instructions.push_back(C); Valid = true; @@ -483,12 +480,11 @@ void LoopReroll::collectInLoopUserSet(Loop *L, continue; if (!Final.count(I)) - for (Value::use_iterator UI = I->use_begin(), - UIE = I->use_end(); UI != UIE; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *PN = dyn_cast<PHINode>(User)) { // Ignore "wrap-around" uses to PHIs of this loop's header. 
- if (PN->getIncomingBlock(UI) == L->getHeader()) + if (PN->getIncomingBlock(U) == L->getHeader()) continue; } @@ -559,8 +555,8 @@ bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, if (RealIV->getNumUses() != 2) return false; const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); - Instruction *User1 = cast<Instruction>(*RealIV->use_begin()), - *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin())); + Instruction *User1 = cast<Instruction>(*RealIV->user_begin()), + *User2 = cast<Instruction>(*std::next(RealIV->user_begin())); if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) return false; const SCEVAddRecExpr *User1SCEV = @@ -616,26 +612,25 @@ bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, SmallVector<SmallInstructionVector, 32> &Roots, SmallInstructionSet &AllRoots, SmallInstructionVector &LoopIncs) { - for (Value::use_iterator UI = IV->use_begin(), - UIE = IV->use_end(); UI != UIE; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (!SE->isSCEVable(User->getType())) + for (User *U : IV->users()) { + Instruction *UI = cast<Instruction>(U); + if (!SE->isSCEVable(UI->getType())) continue; - if (User->getType() != IV->getType()) + if (UI->getType() != IV->getType()) continue; - if (!L->contains(User)) + if (!L->contains(UI)) continue; - if (hasUsesOutsideLoop(User, L)) + if (hasUsesOutsideLoop(UI, L)) continue; if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( - SE->getSCEV(User), SE->getSCEV(IV)))) { + SE->getSCEV(UI), SE->getSCEV(IV)))) { uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); if (Idx > 0 && Idx < Scale) { - Roots[Idx-1].push_back(User); - AllRoots.insert(User); + Roots[Idx-1].push_back(UI); + AllRoots.insert(UI); } else if (Idx == Scale && Inc > 1) { - LoopIncs.push_back(User); + LoopIncs.push_back(UI); } } } @@ -719,10 +714,8 @@ void LoopReroll::ReductionTracker::replaceSelected() { // Replace users with the new end-of-chain value. SmallInstructionVector Users; - for (Value::use_iterator UI = - PossibleReds[i].getReducedValue()->use_begin(), - UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI) - Users.push_back(cast<Instruction>(*UI)); + for (User *U : PossibleReds[i].getReducedValue()->users()) + Users.push_back(cast<Instruction>(U)); for (SmallInstructionVector::iterator J = Users.begin(), JE = Users.end(); J != JE; ++J) @@ -931,8 +924,10 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // them, and this matching fails. As an exception, we allow the alias // set tracker to handle regular (simple) load/store dependencies. if (FutureSideEffects && - ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || - (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + ((!isSimpleLoadStore(J1) && + !isSafeToSpeculativelyExecute(J1, DL)) || + (!isSimpleLoadStore(J2) && + !isSafeToSpeculativelyExecute(J2, DL)))) { DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << " vs. 
" << *J2 << " (side effects prevent reordering)\n"); @@ -953,7 +948,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, bool InReduction = Reductions.isPairInSame(J1, J2); if (!(InReduction && J1->isAssociative())) { - bool Swapped = false, SomeOpMatched = false;; + bool Swapped = false, SomeOpMatched = false; for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { Value *Op2 = J2->getOperand(j); @@ -1133,12 +1128,16 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, } bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + AA = &getAnalysis<AliasAnalysis>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); TLI = &getAnalysis<TargetLibraryInfo>(); - DL = getAnalysisIfAvailable<DataLayout>(); - DT = &getAnalysis<DominatorTree>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 14c5655f0838..2ce58314f8ef 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-rotate" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" @@ -20,9 +19,11 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -30,7 +31,11 @@ #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; -#define MAX_HEADER_SIZE 16 +#define DEBUG_TYPE "loop-rotate" + +static cl::opt<unsigned> +DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, + cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); namespace { @@ -38,13 +43,17 @@ namespace { class LoopRotate : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopRotate() : LoopPass(ID) { + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); } // LCSSA form makes instruction renaming easier. 
- virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -55,11 +64,12 @@ namespace { AU.addRequired<TargetTransformInfo>(); } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool simplifyLoopLatch(Loop *L); bool rotateLoop(Loop *L, bool SimplifiedLatch); private: + unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; }; @@ -73,11 +83,19 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} /// Rotate Loop L as many times as possible. Return true if /// the loop is rotated at least once. bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); @@ -92,6 +110,12 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { MadeChange = true; SimplifiedLatch = false; } + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + return MadeChange; } @@ -130,7 +154,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, for (Value::use_iterator UI = OrigHeaderVal->use_begin(), UE = OrigHeaderVal->use_end(); UI != UE; ) { // Grab the use before incrementing the iterator. - Use &U = UI.getUse(); + Use &U = *UI; // Increment the iterator before removing the use from the list. ++UI; @@ -251,8 +275,9 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { // Nuke the Latch block. assert(Latch->empty() && "unable to evacuate Latch"); LI->removeBlock(Latch); - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) - DT->eraseNode(Latch); + if (DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>()) + DTWP->getDomTree().eraseNode(Latch); Latch->eraseFromParent(); return true; } @@ -276,7 +301,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { BasicBlock *OrigLatch = L->getLoopLatch(); BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (BI == 0 || BI->isUnconditional()) + if (!BI || BI->isUnconditional()) return false; // If the loop header is not one of the loop exiting blocks then @@ -287,7 +312,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop latch already contains a branch that leaves the loop then the // loop is already rotated. 
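// What rotation itself does, at the source level: the loop test in the header
// is cloned into the preheader as a guard and the loop becomes bottom-tested,
// the shape that LICM, the vectorizer and other downstream passes prefer.
// Illustrative sketch only (function names invented):
void beforeRotate(int *A, int N) {
  int I = 0;
  while (I < N) {      // top-tested: the header holds the exit test
    A[I] = 0;
    ++I;
  }
}
void afterRotate(int *A, int N) {
  int I = 0;
  if (I < N) {         // the duplicated header, now guarding the loop; its
    do {               // size is what MaxHeaderSize above limits
      A[I] = 0;
      ++I;
    } while (I < N);
  }
}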
- if (OrigLatch == 0) + if (!OrigLatch) return false; // Rotate if either the loop latch does *not* exit the loop, or if the loop @@ -301,11 +326,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { CodeMetrics Metrics; Metrics.analyzeBasicBlock(OrigHeader, *TTI); if (Metrics.notDuplicatable) { - DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable" + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" << " instructions: "; L->dump()); return false; } - if (Metrics.NumInsts > MAX_HEADER_SIZE) + if (Metrics.NumInsts > MaxHeaderSize) return false; } @@ -314,7 +339,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. - if (OrigPreheader == 0) + if (!OrigPreheader) return false; // Anything ScalarEvolution may know about this loop or the PHI nodes @@ -433,23 +458,25 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // The conditional branch can't be folded, handle the general case. // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + if (DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DominatorTree &DT = DTWP->getDomTree(); // Everything that was dominated by the old loop header is now dominated // by the original loop preheader. Conceptually the header was merged // into the preheader, even though we reuse the actual block as a new // loop latch. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); + DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader); for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); + DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); + assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode); + assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode); // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(OrigHeader, OrigLatch); + DT.changeImmediateDominator(OrigHeader, OrigLatch); } // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and @@ -459,9 +486,24 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { NewPH->setName(NewHeader->getName() + ".lr.ph"); // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. - BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); - ExitSplit->moveBefore(Exit); + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); + bool SplitLatchEdge = false; + for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(), + PE = ExitPreds.end(); + PI != PE; ++PI) { + // We only need to split loop exit edges. 
+ Loop *PredLoop = LI->getLoopFor(*PI); + if (!PredLoop || PredLoop->contains(Exit)) + continue; + SplitLatchEdge |= L->getLoopLatch() == *PI; + BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); } else { // We can fold the conditional branch in the preheader, this makes things // simpler. The first step is to remove the extra edge to the Exit block. @@ -471,15 +513,17 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + if (DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DominatorTree &DT = DTWP->getDomTree(); // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(NewHeader, OrigPreheader); - DT->changeImmediateDominator(OrigHeader, OrigLatch); + DT.changeImmediateDominator(NewHeader, OrigPreheader); + DT.changeImmediateDominator(OrigHeader, OrigLatch); // Brute force incremental dominator tree update. Call // findNearestCommonDominator on all CFG predecessors of each child of the // original header. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); bool Changed; @@ -492,11 +536,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { pred_iterator PI = pred_begin(BB); BasicBlock *NearestDom = *PI; for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); + NearestDom = DT.findNearestCommonDominator(NearestDom, *PI); // Remember if this changes the DomTree. 
if (Node->getIDom()->getBlock() != NearestDom) { - DT->changeImmediateDominator(BB, NearestDom); + DT.changeImmediateDominator(BB, NearestDom); Changed = true; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 6133962e42d7..914b56aa8167 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -53,31 +53,32 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-reduce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Assembly/Writer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "loop-reduce" + /// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for /// bail out. This threshold is far beyond the number of users that LSR can /// conceivably solve, so it should not affect generated code, but catches the @@ -237,7 +238,15 @@ struct Formula { int64_t Scale; /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty, + /// non-empty. The canonical representation of a formula is + /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and + /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). + /// #1 enforces that the scaled register is always used when at least two + /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2. + /// #2 enforces that 1 * reg is reg. + /// This invariant can be temporarly broken while building a formula. + /// However, every formula inserted into the LSRInstance must be in canonical + /// form. SmallVector<const SCEV *, 4> BaseRegs; /// ScaledReg - The 'scaled' register for this use. This should be non-null @@ -250,12 +259,18 @@ struct Formula { int64_t UnfoldedOffset; Formula() - : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0), - UnfoldedOffset(0) {} + : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), + ScaledReg(nullptr), UnfoldedOffset(0) {} void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); - unsigned getNumRegs() const; + bool isCanonical() const; + + void Canonicalize(); + + bool Unscale(); + + size_t getNumRegs() const; Type *getType() const; void DeleteBaseReg(const SCEV *&S); @@ -345,12 +360,58 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } + Canonicalize(); +} + +/// \brief Check whether or not this formula statisfies the canonical +/// representation. +/// \see Formula::BaseRegs. 
+bool Formula::isCanonical() const { + if (ScaledReg) + return Scale != 1 || !BaseRegs.empty(); + return BaseRegs.size() <= 1; +} + +/// \brief Helper method to morph a formula into its canonical representation. +/// \see Formula::BaseRegs. +/// Every formula having more than one base register, must use the ScaledReg +/// field. Otherwise, we would have to do special cases everywhere in LSR +/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... +/// On the other hand, 1*reg should be canonicalized into reg. +void Formula::Canonicalize() { + if (isCanonical()) + return; + // So far we did not need this case. This is easy to implement but it is + // useless to maintain dead code. Beside it could hurt compile time. + assert(!BaseRegs.empty() && "1*reg => reg, should not be needed."); + // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg. + ScaledReg = BaseRegs.back(); + BaseRegs.pop_back(); + Scale = 1; + size_t BaseRegsSize = BaseRegs.size(); + size_t Try = 0; + // If ScaledReg is an invariant, try to find a variant expression. + while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg)) + std::swap(ScaledReg, BaseRegs[Try++]); +} + +/// \brief Get rid of the scale in the formula. +/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2. +/// \return true if it was possible to get rid of the scale, false otherwise. +/// \note After this operation the formula may not be in the canonical form. +bool Formula::Unscale() { + if (Scale != 1) + return false; + Scale = 0; + BaseRegs.push_back(ScaledReg); + ScaledReg = nullptr; + return true; } /// getNumRegs - Return the total number of register operands used by this /// formula. This does not include register uses implied by non-constant /// addrec strides. -unsigned Formula::getNumRegs() const { +size_t Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } @@ -360,7 +421,7 @@ Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : BaseGV ? BaseGV->getType() : - 0; + nullptr; } /// DeleteBaseReg - Delete the given base reg from the BaseRegs list. @@ -394,7 +455,7 @@ void Formula::print(raw_ostream &OS) const { bool First = true; if (BaseGV) { if (!First) OS << " + "; else First = false; - WriteAsOperand(OS, BaseGV, /*PrintType=*/false); + BaseGV->printAsOperand(OS, /*PrintType=*/false); } if (BaseOffset != 0) { if (!First) OS << " + "; else First = false; @@ -422,7 +483,7 @@ void Formula::print(raw_ostream &OS) const { OS << ')'; } if (UnfoldedOffset != 0) { - if (!First) OS << " + "; else First = false; + if (!First) OS << " + "; OS << "imm(" << UnfoldedOffset << ')'; } } @@ -487,11 +548,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Check for a division of a constant by a constant. 
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { if (!RC) - return 0; + return nullptr; const APInt &LA = C->getValue()->getValue(); const APInt &RA = RC->getValue()->getValue(); if (LA.srem(RA) != 0) - return 0; + return nullptr; return SE.getConstant(LA.sdiv(RA)); } @@ -500,16 +561,16 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) { const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE, IgnoreSignificantBits); - if (!Step) return 0; + if (!Step) return nullptr; const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, IgnoreSignificantBits); - if (!Start) return 0; + if (!Start) return nullptr; // FlagNW is independent of the start value, step direction, and is // preserved with smaller magnitude steps. // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap); } - return 0; + return nullptr; } // Distribute the sdiv over add operands, if the add doesn't overflow. @@ -520,12 +581,12 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, I != E; ++I) { const SCEV *Op = getExactSDiv(*I, RHS, SE, IgnoreSignificantBits); - if (!Op) return 0; + if (!Op) return nullptr; Ops.push_back(Op); } return SE.getAddExpr(Ops); } - return 0; + return nullptr; } // Check for a multiply operand that we can pull RHS out of. @@ -544,13 +605,13 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, } Ops.push_back(S); } - return Found ? SE.getMulExpr(Ops) : 0; + return Found ? SE.getMulExpr(Ops) : nullptr; } - return 0; + return nullptr; } // Otherwise we don't know. - return 0; + return nullptr; } /// ExtractImmediate - If S involves the addition of a constant integer value, @@ -604,7 +665,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { SCEV::FlagAnyWrap); return Result; } - return 0; + return nullptr; } /// isAddressUse - Returns true if the specified instruction is using the @@ -723,13 +784,12 @@ static bool isHighCostExpansion(const SCEV *S, // multiplication already generates this expression. if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) { Value *UVal = U->getValue(); - for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end(); - UI != UE; ++UI) { + for (User *UR : UVal->users()) { // If U is a constant, it may be used by a ConstantExpr. - Instruction *User = dyn_cast<Instruction>(*UI); - if (User && User->getOpcode() == Instruction::Mul - && SE.isSCEVable(User->getType())) { - return SE.getSCEV(User) == Mul; + Instruction *UI = dyn_cast<Instruction>(UR); + if (UI && UI->getOpcode() == Instruction::Mul && + SE.isSCEVable(UI->getType())) { + return SE.getSCEV(UI) == Mul; } } } @@ -756,12 +816,12 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { Value *V = DeadInsts.pop_back_val(); Instruction *I = dyn_cast_or_null<Instruction>(V); - if (I == 0 || !isInstructionTriviallyDead(I)) + if (!I || !isInstructionTriviallyDead(I)) continue; for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) if (Instruction *U = dyn_cast<Instruction>(*OI)) { - *OI = 0; + *OI = nullptr; if (U->use_empty()) DeadInsts.push_back(U); } @@ -776,9 +836,18 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { namespace { class LSRUse; } -// Check if it is legal to fold 2 base registers. 
-static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F); + +/// \brief Check if the addressing mode defined by \p F is completely +/// folded in \p LU at isel time. +/// This includes address-mode folding and special icmp tricks. +/// This function returns true if \p LU can accommodate what \p F +/// defines and up to 1 base + 1 scaled + offset. +/// In other words, if \p F has several base registers, this function may +/// still return true. Therefore, users still need to account for +/// additional base registers and/or unfolded offsets to derive an +/// accurate cost model. +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); // Get the cost of the scaling factor used in F for LU. static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F); @@ -804,7 +873,7 @@ public: bool operator<(const Cost &Other) const; - void Loose(); + void Lose(); #ifndef NDEBUG // Once any of the metrics loses, they must all remain losers. @@ -829,7 +898,7 @@ public: const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, - SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); + SmallPtrSet<const SCEV *, 16> *LoserRegs = nullptr); void print(raw_ostream &OS) const; void dump() const; @@ -864,7 +933,7 @@ void Cost::RateRegister(const SCEV *Reg, return; // Otherwise, do not consider this formula at all. - Loose(); + Lose(); return; } AddRecCost += 1; /// TODO: This should be a function of the stride. @@ -903,7 +972,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, ScalarEvolution &SE, DominatorTree &DT, SmallPtrSet<const SCEV *, 16> *LoserRegs) { if (LoserRegs && LoserRegs->count(Reg)) { - Loose(); + Lose(); return; } if (Regs.insert(Reg)) { @@ -922,10 +991,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { + assert(F.isCanonical() && "Cost is accurate only for canonical formula"); // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { - Loose(); + Lose(); return; } RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); @@ -936,7 +1006,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, E = F.BaseRegs.end(); I != E; ++I) { const SCEV *BaseReg = *I; if (VisitedRegs.count(BaseReg)) { - Loose(); + Lose(); return; } RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); @@ -945,11 +1015,13 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, } // Determine how many (unfolded) adds we'll need inside the loop. - size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); + size_t NumBaseParts = F.getNumRegs(); if (NumBaseParts > 1) // Do not count the base and a possible second register if the target // allows to fold 2 registers. - NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + NumBaseAdds += + NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F))); + NumBaseAdds += (F.UnfoldedOffset != 0); // Accumulate non-free scaling amounts. ScaleCost += getScalingFactorCost(TTI, LU, F); @@ -967,8 +1039,8 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, assert(isValid() && "invalid cost"); } -/// Loose - Set this cost to a losing value. -void Cost::Loose() { +/// Lose - Set this cost to a losing value. 
+void Cost::Lose() { NumRegs = ~0u; AddRecCost = ~0u; NumIVMuls = ~0u; @@ -980,21 +1052,11 @@ void Cost::Loose() { /// operator< - Choose the lower cost. bool Cost::operator<(const Cost &Other) const { - if (NumRegs != Other.NumRegs) - return NumRegs < Other.NumRegs; - if (AddRecCost != Other.AddRecCost) - return AddRecCost < Other.AddRecCost; - if (NumIVMuls != Other.NumIVMuls) - return NumIVMuls < Other.NumIVMuls; - if (NumBaseAdds != Other.NumBaseAdds) - return NumBaseAdds < Other.NumBaseAdds; - if (ScaleCost != Other.ScaleCost) - return ScaleCost < Other.ScaleCost; - if (ImmCost != Other.ImmCost) - return ImmCost < Other.ImmCost; - if (SetupCost != Other.SetupCost) - return SetupCost < Other.SetupCost; - return false; + return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost, + ImmCost, SetupCost) < + std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls, + Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost, + Other.SetupCost); } void Cost::print(raw_ostream &OS) const { @@ -1058,7 +1120,8 @@ struct LSRFixup { } LSRFixup::LSRFixup() - : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} + : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), + Offset(0) {} /// isUseFullyOutsideLoop - Test whether this fixup always uses its /// value outside of the given loop. @@ -1080,19 +1143,19 @@ void LSRFixup::print(raw_ostream &OS) const { // Store is common and interesting enough to be worth special-casing. if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) { OS << "store "; - WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false); + Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false); } else if (UserInst->getType()->isVoidTy()) OS << UserInst->getOpcodeName(); else - WriteAsOperand(OS, UserInst, /*PrintType=*/false); + UserInst->printAsOperand(OS, /*PrintType=*/false); OS << ", OperandValToReplace="; - WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); + OperandValToReplace->printAsOperand(OS, /*PrintType=*/false); for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), E = PostIncLoops.end(); I != E; ++I) { OS << ", PostIncLoop="; - WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false); + (*I)->getHeader()->printAsOperand(OS, /*PrintType=*/false); } if (LUIdx != ~size_t(0)) @@ -1126,11 +1189,7 @@ struct UniquifierDenseMapInfo { } static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) { - unsigned Result = 0; - for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(), - E = V.end(); I != E; ++I) - Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I); - return Result; + return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); } static bool isEqual(const SmallVector<const SCEV *, 4> &LHS, @@ -1158,6 +1217,8 @@ public: // TODO: Add a generic icmp too? }; + typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; + KindType Kind; Type *AccessTy; @@ -1196,7 +1257,7 @@ public: MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), RigidFormula(false), - WidestFixupType(0) {} + WidestFixupType(nullptr) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); @@ -1221,7 +1282,10 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. +/// The formula must be in canonical form. 
bool LSRUse::InsertFormula(const Formula &F) { + assert(F.isCanonical() && "Invalid canonical representation"); + if (!Formulae.empty() && RigidFormula) return false; @@ -1247,6 +1311,8 @@ bool LSRUse::InsertFormula(const Formula &F) { // Record registers now being used by this use. Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); + if (F.ScaledReg) + Regs.insert(F.ScaledReg); return true; } @@ -1295,7 +1361,7 @@ void LSRUse::print(raw_ostream &OS) const { for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), E = Offsets.end(); I != E; ++I) { OS << *I; - if (llvm::next(I) != E) + if (std::next(I) != E) OS << ','; } OS << '}'; @@ -1313,12 +1379,10 @@ void LSRUse::dump() const { } #endif -/// isLegalUse - Test whether the use described by AM is "legal", meaning it can -/// be completely folded into the user instruction at isel time. This includes -/// address-mode folding and special icmp tricks. -static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, - Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, - bool HasBaseReg, int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); @@ -1369,10 +1433,11 @@ static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, llvm_unreachable("Invalid LSRUse Kind!"); } -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { // Check for overflow. if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != (MinOffset > 0)) @@ -1383,9 +1448,41 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, return false; MaxOffset = (uint64_t)BaseOffset + MaxOffset; - return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, - Scale) && - isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, + HasBaseReg, Scale) && + isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset, + HasBaseReg, Scale); +} + +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + const Formula &F) { + // For the purpose of isAMCompletelyFolded either having a canonical formula + // or a scale not equal to zero is correct. + // Problems may arise from non canonical formulae having a scale == 0. + // Strictly speaking it would best to just rely on canonical formulae. + // However, when we generate the scaled formulae, we first check that the + // scaling factor is profitable before computing the actual ScaledReg for + // compile time sake. + assert((F.isCanonical() || F.Scale != 0)); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); +} + +/// isLegalUse - Test whether we know how to expand the current formula. 
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, + int64_t Scale) { + // We know how to expand completely foldable formulae. + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale) || + // Or formulae that use a base register produced by a sum of base + // registers. + (Scale == 1 && + isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + BaseGV, BaseOffset, true, 0)); } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, @@ -1395,36 +1492,23 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } -static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F) { - // If F is used as an Addressing Mode, it may fold one Base plus one - // scaled register. If the scaled register is nil, do as if another - // element of the base regs is a 1-scaled register. - // This is possible if BaseRegs has at least 2 registers. - - // If this is not an address calculation, this is not an addressing mode - // use. - if (LU.Kind != LSRUse::Address) - return false; - - // F is already scaled. - if (F.Scale != 0) - return false; - - // We need to keep one register for the base and one to scale. - if (F.BaseRegs.size() < 2) - return false; - - return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, - F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); - } +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, + F.Scale); +} static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { if (!F.Scale) return 0; - assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, F) && "Illegal formula in use."); + + // If the use is not completely folded in that instruction, we will have to + // pay an extra cost only for scale != 1. + if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F)) + return F.Scale != 1; switch (LU.Kind) { case LSRUse::Address: { @@ -1443,12 +1527,10 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); } case LSRUse::ICmpZero: - // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. - // Therefore, return 0 in case F.Scale == -1. - return F.Scale != -1; - case LSRUse::Basic: case LSRUse::Special: + // The use is completely folded, i.e., everything is folded into the + // instruction. return 0; } @@ -1473,7 +1555,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, HasBaseReg = true; } - return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, + HasBaseReg, Scale); } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, @@ -1498,36 +1581,12 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, // base and a scale. int64_t Scale = Kind == LSRUse::ICmpZero ? 
-1 : 1; - return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, - BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale); } namespace { -/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding -/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind. -struct UseMapDenseMapInfo { - static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() { - return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic); - } - - static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() { - return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic); - } - - static unsigned - getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) { - unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first); - Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second)); - return Result; - } - - static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS, - const std::pair<const SCEV *, LSRUse::KindType> &RHS) { - return LHS == RHS; - } -}; - /// IVInc - An individual increment in a Chain of IV increments. /// Relate an IV user to an expression that computes the IV it uses from the IV /// used by the previous link in the Chain. @@ -1552,7 +1611,7 @@ struct IVChain { SmallVector<IVInc,1> Incs; const SCEV *ExprBase; - IVChain() : ExprBase(0) {} + IVChain() : ExprBase(nullptr) {} IVChain(const IVInc &Head, const SCEV *Base) : Incs(1, Head), ExprBase(Base) {} @@ -1562,7 +1621,7 @@ struct IVChain { // begin - return the first increment in the chain. const_iterator begin() const { assert(!Incs.empty()); - return llvm::next(Incs.begin()); + return std::next(Incs.begin()); } const_iterator end() const { return Incs.end(); @@ -1656,9 +1715,7 @@ class LSRInstance { } // Support for sharing of LSRUses between LSRFixups. - typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>, - size_t, - UseMapDenseMapInfo> UseMapTy; + typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy; UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, @@ -1681,8 +1738,19 @@ class LSRInstance { void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth = 0); + + void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, unsigned Depth, + size_t Idx, bool IsScaledReg = false); void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg = false); void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + const SmallVectorImpl<int64_t> &Worklist, + size_t Idx, bool IsScaledReg = false); void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); @@ -1760,7 +1828,7 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - Type *DestTy = 0; + Type *DestTy = nullptr; bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV @@ -1822,7 +1890,7 @@ void LSRInstance::OptimizeShadowIV() { continue; /* Initialize new IV, double d = 0.0 in above example. 
*/ - ConstantInt *C = 0; + ConstantInt *C = nullptr; if (Incr->getOperand(0) == PH) C = dyn_cast<ConstantInt>(Incr->getOperand(1)); else if (Incr->getOperand(1) == PH) @@ -1944,7 +2012,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // for ICMP_ULE here because the comparison would be with zero, which // isn't interesting. CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; - const SCEVNAryExpr *Max = 0; + const SCEVNAryExpr *Max = nullptr; if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { Pred = ICmpInst::ICMP_SLE; Max = S; @@ -1987,7 +2055,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Check the right operand of the select, and remember it, as it will // be used in the new comparison instruction. - Value *NewRHS = 0; + Value *NewRHS = nullptr; if (ICmpInst::isTrueWhenEqual(Pred)) { // Look for n+1, and grab n. if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) @@ -2057,7 +2125,7 @@ LSRInstance::OptimizeLoopTermCond() { continue; // Search IVUsesByStride to find Cond's IVUse if there is one. - IVStrideUse *CondUse = 0; + IVStrideUse *CondUse = nullptr; ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); if (!FindIVUserForCond(Cond, CondUse)) continue; @@ -2110,12 +2178,12 @@ LSRInstance::OptimizeLoopTermCond() { // Check for possible scaled-address reuse. Type *AccessTy = getAccessType(UI->getUser()); int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, /*BaseOffset=*/ 0, /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, /*BaseOffset=*/ 0, /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; @@ -2185,23 +2253,25 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // the uses will have all its uses outside the loop, for example. if (LU.Kind != Kind) return false; + + // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. + if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) + NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } - // Check for a mismatched access type, and fall back conservatively as needed. - // TODO: Be less conservative when the type is similar and can use the same - // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); // Update the use. LU.MinOffset = NewMinOffset; @@ -2222,14 +2292,14 @@ LSRInstance::getUse(const SCEV *&Expr, int64_t Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. 
- if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, Offset, /*HasBaseReg=*/ true)) { Expr = Copy; Offset = 0; } std::pair<UseMapTy::iterator, bool> P = - UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); + UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0)); if (!P.second) { // A use already existed with this base. size_t LUIdx = P.first->second; @@ -2306,7 +2376,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, } // Nothing looked good. - return 0; + return nullptr; } void LSRInstance::CollectInterestingTypesAndFactors() { @@ -2338,7 +2408,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { for (SmallSetVector<const SCEV *, 4>::const_iterator I = Strides.begin(), E = Strides.end(); I != E; ++I) for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = - llvm::next(I); NewStrideIter != E; ++NewStrideIter) { + std::next(I); NewStrideIter != E; ++NewStrideIter) { const SCEV *OldStride = *I; const SCEV *NewStride = *NewStrideIter; @@ -2424,7 +2494,7 @@ static const SCEV *getExprBase(const SCEV *S) { default: // uncluding scUnknown. return S; case scConstant: - return 0; + return nullptr; case scTruncate: return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand()); case scZeroExtend: @@ -2515,7 +2585,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { --cost; } - const SCEV *LastIncExpr = 0; + const SCEV *LastIncExpr = nullptr; unsigned NumConstIncrements = 0; unsigned NumVarIncrements = 0; unsigned NumReusedIncrements = 0; @@ -2574,7 +2644,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, // Visit all existing chains. Check if its IVOper can be computed as a // profitable loop invariant increment from the last link in the Chain. unsigned ChainIdx = 0, NChains = IVChainVec.size(); - const SCEV *LastIncExpr = 0; + const SCEV *LastIncExpr = nullptr; for (; ChainIdx < NChains; ++ChainIdx) { IVChain &Chain = IVChainVec[ChainIdx]; @@ -2646,9 +2716,8 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, // they will eventually be used be the current chain, or can be computed // from one of the chain increments. To be more precise we could // transitively follow its user and only add leaf IV users to the set. - for (Value::use_iterator UseIter = IVOper->use_begin(), - UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { - Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); + for (User *U : IVOper->users()) { + Instruction *OtherUse = dyn_cast<Instruction>(U); if (!OtherUse) continue; // Uses in the chain will no longer be uses if the chain is formed. @@ -2738,7 +2807,7 @@ void LSRInstance::CollectChains() { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); if (UniqueOperands.insert(IVOpInst)) ChainInstruction(I, IVOpInst, ChainUsersVec); - IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); + IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } } // Continue walking down the instructions. } // Continue walking down the domtree. 
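The canonical-form invariant established by the Formula::isCanonical, Canonicalize and Unscale helpers added earlier in this patch is easier to see on a toy model. The sketch below uses a hypothetical ToyFormula with plain strings standing in for the SCEV base registers (and omits the invariant/variant swap), so it illustrates the invariant rather than the real LSR data structure:

#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-in for LSR's Formula, for illustration only.
struct ToyFormula {
  std::string ScaledReg;               // empty string == "no scaled register"
  int Scale = 0;
  std::vector<std::string> BaseRegs;

  // Canonical form: with a scaled register, a lone 1*reg is not allowed
  // (it should be plain reg); without one, at most one base register.
  bool isCanonical() const {
    if (!ScaledReg.empty())
      return Scale != 1 || !BaseRegs.empty();
    return BaseRegs.size() <= 1;
  }

  // reg1 + reg2 + ... + regN  ==>  reg1 + ... + 1*regN
  void Canonicalize() {
    if (isCanonical())
      return;
    assert(!BaseRegs.empty() && "1*reg => reg, should not be needed");
    ScaledReg = BaseRegs.back();
    BaseRegs.pop_back();
    Scale = 1;
  }

  // reg1 + 1*reg2  ==>  reg1 + reg2 (may leave the formula non-canonical).
  bool Unscale() {
    if (Scale != 1)
      return false;
    Scale = 0;
    BaseRegs.push_back(ScaledReg);
    ScaledReg.clear();
    return true;
  }
};

int main() {
  ToyFormula F;
  F.BaseRegs = {"reg1", "reg2"};
  assert(!F.isCanonical());   // two base registers, no scaled register
  F.Canonicalize();
  assert(F.isCanonical() && F.Scale == 1 && F.ScaledReg == "reg2");
  F.Unscale();
  assert(!F.isCanonical());   // back to two base registers
  return 0;
}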
@@ -2795,7 +2864,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, - getAccessType(UserInst), /*BaseGV=*/ 0, + getAccessType(UserInst), /*BaseGV=*/ nullptr, IncOffset, /*HaseBaseReg=*/ false)) return false; @@ -2813,7 +2882,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), IVOpEnd, L, SE); - Value *IVSrc = 0; + Value *IVSrc = nullptr; while (IVOpIter != IVOpEnd) { IVSrc = getWideOperand(*IVOpIter); @@ -2829,7 +2898,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, || SE.getSCEV(IVSrc) == Head.IncExpr) { break; } - IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); + IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } if (IVOpIter == IVOpEnd) { // Gracefully give up on this chain. @@ -2840,7 +2909,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); - const SCEV *LeftOverExpr = 0; + const SCEV *LeftOverExpr = nullptr; for (IVChain::const_iterator IncI = Chain.begin(), IncE = Chain.end(); IncI != IncE; ++IncI) { @@ -2871,7 +2940,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, TTI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); IVSrc = IVOper; - LeftOverExpr = 0; + LeftOverExpr = nullptr; } } Type *OperTy = IncI->IVOperand->getType(); @@ -2926,7 +2995,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - Type *AccessTy = 0; + Type *AccessTy = nullptr; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -2957,7 +3026,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. - N = TransformForPostIncUse(Normalize, N, CI, 0, + N = TransformForPostIncUse(Normalize, N, CI, nullptr, LF.PostIncLoops, SE, DT); Kind = LSRUse::ICmpZero; S = SE.getMinusSCEV(N, S); @@ -3032,6 +3101,9 @@ void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { + // Do not insert formula that we will not be able to expand. 
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && + "Formula is illegal"); if (!LU.InsertFormula(F)) return false; @@ -3059,18 +3131,17 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { Worklist.push_back(D->getLHS()); Worklist.push_back(D->getRHS()); - } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { - if (!Inserted.insert(U)) continue; - const Value *V = U->getValue(); + } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) { + if (!Inserted.insert(US)) continue; + const Value *V = US->getValue(); if (const Instruction *Inst = dyn_cast<Instruction>(V)) { // Look for instructions defined outside the loop. if (L->contains(Inst)) continue; } else if (isa<UndefValue>(V)) // Undef doesn't have a live range, so it doesn't matter. continue; - for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - const Instruction *UserInst = dyn_cast<Instruction>(*UI); + for (const Use &U : V->uses()) { + const Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); // Ignore non-instructions. if (!UserInst) continue; @@ -3082,7 +3153,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? UserInst->getParent() : cast<PHINode>(UserInst)->getIncomingBlock( - PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); + PHINode::getIncomingValueNumForOperand(U.getOperandNo())); if (!DT.dominates(L->getHeader(), UseBB)) continue; // Ignore uses which are part of other SCEV expressions, to avoid @@ -3092,7 +3163,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { // If the user is a no-op, look through to its uses. if (!isa<SCEVUnknown>(UserS)) continue; - if (UserS == U) { + if (UserS == US) { Worklist.push_back( SE.getUnknown(const_cast<Instruction *>(UserInst))); continue; @@ -3100,7 +3171,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } // Ignore icmp instructions which are already being analyzed. if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { - unsigned OtherIdx = !UI.getOperandNo(); + unsigned OtherIdx = !U.getOperandNo(); Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) continue; @@ -3108,8 +3179,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = const_cast<Instruction *>(UserInst); - LF.OperandValToReplace = UI.getUse(); - std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0); + LF.OperandValToReplace = U; + std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; @@ -3118,7 +3189,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { SE.getTypeSizeInBits(LU.WidestFixupType) < SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) LU.WidestFixupType = LF.OperandValToReplace->getType(); - InsertSupplementalFormula(U, LU, LF.LUIdx); + InsertSupplementalFormula(US, LU, LF.LUIdx); CountRegisters(LU.Formulae.back(), Uses.size() - 1); break; } @@ -3148,7 +3219,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, if (Remainder) Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); } - return 0; + return nullptr; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. 
if (AR->getStart()->isZero()) @@ -3160,7 +3231,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, // does not pertain to this loop. if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); - Remainder = 0; + Remainder = nullptr; } if (Remainder != AR->getStart()) { if (!Remainder) @@ -3182,90 +3253,110 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); if (Remainder) Ops.push_back(SE.getMulExpr(C, Remainder)); - return 0; + return nullptr; } } return S; } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. -void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, - Formula Base, - unsigned Depth) { - // Arbitrarily cap recursion to protect compile time. - if (Depth >= 3) return; +/// \brief Helper function for LSRInstance::GenerateReassociations. +void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + unsigned Depth, size_t Idx, + bool IsScaledReg) { + const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + SmallVector<const SCEV *, 8> AddOps; + const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); + + if (AddOps.size() == 1) + return; - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *BaseReg = Base.BaseRegs[i]; + for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), + JE = AddOps.end(); + J != JE; ++J) { - SmallVector<const SCEV *, 8> AddOps; - const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); - if (Remainder) - AddOps.push_back(Remainder); + // Loop-variant "unknown" values are uninteresting; we won't be able to + // do anything meaningful with them. + if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) + continue; - if (AddOps.size() == 1) continue; + // Don't pull a constant into a register if the constant could be folded + // into an immediate field. + if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, *J, Base.getNumRegs() > 1)) + continue; - for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), - JE = AddOps.end(); J != JE; ++J) { + // Collect all operands except *J. + SmallVector<const SCEV *, 8> InnerAddOps( + ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); + InnerAddOps.append(std::next(J), + ((const SmallVector<const SCEV *, 8> &)AddOps).end()); + + // Don't leave just a constant behind in a register if the constant could + // be folded into an immediate field. + if (InnerAddOps.size() == 1 && + isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) + continue; - // Loop-variant "unknown" values are uninteresting; we won't be able to - // do anything meaningful with them. - if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) - continue; + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; + Formula F = Base; - // Don't pull a constant into a register if the constant could be folded - // into an immediate field. - if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, *J, Base.getNumRegs() > 1)) - continue; + // Add the remaining pieces of the add back into the new formula. 
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); + if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); + if (IsScaledReg) + F.ScaledReg = nullptr; + else + F.BaseRegs.erase(F.BaseRegs.begin() + Idx); + } else if (IsScaledReg) + F.ScaledReg = InnerSum; + else + F.BaseRegs[Idx] = InnerSum; + + // Add J as its own register, or an unfolded immediate. + const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); + if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); + else + F.BaseRegs.push_back(*J); + // We may have changed the number of register in base regs, adjust the + // formula accordingly. + F.Canonicalize(); - // Collect all operands except *J. - SmallVector<const SCEV *, 8> InnerAddOps - (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); - InnerAddOps.append - (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end()); - - // Don't leave just a constant behind in a register if the constant could - // be folded into an immediate field. - if (InnerAddOps.size() == 1 && - isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) - continue; + if (InsertFormula(LU, LUIdx, F)) + // If that formula hadn't been seen before, recurse to find more like + // it. + GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1); + } +} - const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); - if (InnerSum->isZero()) - continue; - Formula F = Base; +/// GenerateReassociations - Split out subexpressions from adds and the bases of +/// addrecs. +void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, + Formula Base, unsigned Depth) { + assert(Base.isCanonical() && "Input must be in the canonical form"); + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return; - // Add the remaining pieces of the add back into the new formula. - const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); - if (InnerSumSC && - SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue())) { - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue(); - F.BaseRegs.erase(F.BaseRegs.begin() + i); - } else - F.BaseRegs[i] = InnerSum; - - // Add J as its own register, or an unfolded immediate. - const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); - if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue())) - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue(); - else - F.BaseRegs.push_back(*J); + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i); - if (InsertFormula(LU, LUIdx, F)) - // If that formula hadn't been seen before, recurse to find more like - // it. 
- GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1); - } - } + if (Base.Scale == 1) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, + /* Idx */ -1, /* IsScaledReg */ true); } /// GenerateCombinations - Generate a formula consisting of all of the @@ -3273,8 +3364,12 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. - if (Base.BaseRegs.size() <= 1) return; + if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1) + return; + // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before + // processing the formula. + Base.Unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector<const SCEV *, 4> Ops; @@ -3294,29 +3389,87 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); + F.Canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } } +/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets. +void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg) { + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + GlobalValue *GV = ExtractSymbol(G, SE); + if (G->isZero() || !GV) + return; + Formula F = Base; + F.BaseGV = GV; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); +} + /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. if (Base.BaseGV) return; - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - GlobalValue *GV = ExtractSymbol(G, SE); - if (G->isZero() || !GV) - continue; + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i); + if (Base.Scale == 1) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1, + /* IsScaledReg */ true); +} + +/// \brief Helper function for LSRInstance::GenerateConstantOffsets. +void LSRInstance::GenerateConstantOffsetsImpl( + LSRUse &LU, unsigned LUIdx, const Formula &Base, + const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) { + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), + E = Worklist.end(); + I != E; ++I) { Formula F = Base; - F.BaseGV = GV; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); + F.BaseOffset = (uint64_t)Base.BaseOffset - *I; + if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, + LU.AccessTy, F)) { + // Add the offset to the base register. + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + // If it cancelled out, drop the base register, otherwise update it. 
+ if (NewG->isZero()) { + if (IsScaledReg) { + F.Scale = 0; + F.ScaledReg = nullptr; + } else + F.DeleteBaseReg(F.BaseRegs[Idx]); + F.Canonicalize(); + } else if (IsScaledReg) + F.ScaledReg = NewG; + else + F.BaseRegs[Idx] = NewG; + + (void)InsertFormula(LU, LUIdx, F); + } } + + int64_t Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm == 0) + return; + Formula F = Base; + F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); } /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets. @@ -3329,38 +3482,11 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - - for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), - E = Worklist.end(); I != E; ++I) { - Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - *I; - if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, - LU.AccessTy, F)) { - // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); - // If it cancelled out, drop the base register, otherwise update it. - if (NewG->isZero()) { - std::swap(F.BaseRegs[i], F.BaseRegs.back()); - F.BaseRegs.pop_back(); - } else - F.BaseRegs[i] = NewG; - - (void)InsertFormula(LU, LUIdx, F); - } - } - - int64_t Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm == 0) - continue; - Formula F = Base; - F.BaseOffset = (uint64_t)F.BaseOffset + Imm; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); - } + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i); + if (Base.Scale == 1) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1, + /* IsScaledReg */ true); } /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up @@ -3460,7 +3586,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. - if (Base.Scale != 0) return; + // Try to unscale the formula to generate a better scale. + if (Base.Scale != 0 && !Base.Unscale()) + return; + + assert(Base.Scale == 0 && "Unscale did not did its job!"); // Check each interesting stride. for (SmallSetVector<int64_t, 8>::const_iterator @@ -3501,6 +3631,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { Formula F = Base; F.ScaledReg = Quotient; F.DeleteBaseReg(F.BaseRegs[i]); + // The canonical representation of 1*reg is reg, which is already in + // Base. In that case, do not try to insert the formula, it will be + // rejected anyway. + if (F.Scale == 1 && F.BaseRegs.empty()) + continue; (void)InsertFormula(LU, LUIdx, F); } } @@ -3626,8 +3761,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Conservatively examine offsets between this orig reg a few selected // other orig regs. 
ImmMapTy::const_iterator OtherImms[] = { - Imms.begin(), prior(Imms.end()), - Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) + Imms.begin(), std::prev(Imms.end()), + Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) / + 2) }; for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { ImmMapTy::const_iterator M = OtherImms[i]; @@ -3664,7 +3800,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // TODO: Use a more targeted data structure. for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { - const Formula &F = LU.Formulae[L]; + Formula F = LU.Formulae[L]; + // FIXME: The code for the scaled and unscaled registers looks + // very similar but slightly different. Investigate if they + // could be merged. That way, we would not have to unscale the + // Formula. + F.Unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3690,6 +3831,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { continue; // OK, looks good. + NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3723,6 +3865,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { goto skip_formula; // Ok, looks good. + NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3976,7 +4119,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - if (F.BaseOffset == 0 || F.Scale != 0) + if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) continue; LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); @@ -4073,7 +4216,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // Pick the register which is used by the most LSRUses, which is likely // to be a good reuse register candidate. - const SCEV *Best = 0; + const SCEV *Best = nullptr; unsigned BestNum = 0; for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); I != E; ++I) { @@ -4170,19 +4313,22 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - // Ignore formulae which do not use any of the required registers. - bool SatisfiedReqReg = true; + // Ignore formulae which may not be ideal in terms of register reuse of + // ReqRegs. The formula should use all required registers before + // introducing new ones. + int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), JE = ReqRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; - if ((!F.ScaledReg || F.ScaledReg != Reg) && - std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == + if ((F.ScaledReg && F.ScaledReg == Reg) || + std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) != F.BaseRegs.end()) { - SatisfiedReqReg = false; - break; + --NumReqRegsToFind; + if (NumReqRegsToFind == 0) + break; } } - if (!SatisfiedReqReg) { + if (NumReqRegsToFind != 0) { // If none of the formulae satisfied the required registers, then we could // clear ReqRegs and try again. Currently, we simply give up in this case. 
continue; @@ -4222,7 +4368,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { SmallVector<const Formula *, 8> Workspace; Cost SolutionCost; - SolutionCost.Loose(); + SolutionCost.Lose(); Cost CurCost; SmallPtrSet<const SCEV *, 16> CurRegs; DenseSet<const SCEV *> VisitedRegs; @@ -4280,7 +4426,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, } bool AllDominate = true; - Instruction *BetterPos = 0; + Instruction *BetterPos = nullptr; Instruction *Tentative = IDom->getTerminator(); for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), E = Inputs.end(); I != E; ++I) { @@ -4293,7 +4439,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || !DT.dominates(Inst, BetterPos))) - BetterPos = llvm::next(BasicBlock::iterator(Inst)); + BetterPos = std::next(BasicBlock::iterator(Inst)); } if (!AllDominate) break; @@ -4419,11 +4565,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); } // Expand the ScaledReg portion. - Value *ICmpScaledV = 0; + Value *ICmpScaledV = nullptr; if (F.Scale != 0) { const SCEV *ScaledS = F.ScaledReg; @@ -4434,25 +4580,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Loops, SE, DT); if (LU.Kind == LSRUse::ICmpZero) { - // An interesting way of "folding" with an icmp is to use a negated - // scale, which we'll implement by inserting it into the other operand - // of the icmp. - assert(F.Scale == -1 && - "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); + // Expand ScaleReg as if it was part of the base regs. + if (F.Scale == 1) + Ops.push_back( + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + else { + // An interesting way of "folding" with an icmp is to use a negated + // scale, which we'll implement by inserting it into the other operand + // of the icmp. + assert(F.Scale == -1 && + "The only scale supported by ICmpZero uses is -1!"); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + } } else { // Otherwise just expand the scaled register and an explicit scale, // which is expected to be matched as part of the address. // Flush the operand list to suppress SCEVExpander hoisting address modes. - if (!Ops.empty() && LU.Kind == LSRUse::Address) { + // Unless the addressing mode will not be folded. + if (!Ops.empty() && LU.Kind == LSRUse::Address && + isAMCompletelyFolded(TTI, LU, F)) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); - ScaledS = SE.getMulExpr(ScaledS, - SE.getConstant(ScaledS->getType(), F.Scale)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); + if (F.Scale != 1) + ScaledS = + SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); Ops.push_back(ScaledS); } } @@ -4530,7 +4685,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } CI->setOperand(1, ICmpScaledV); } else { - assert(F.Scale == 0 && + // A scale of 1 means that the scale has been expanded as part of the + // base regs. 
+ assert((F.Scale == 0 || F.Scale == 1) && "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), @@ -4571,7 +4728,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, Loop *PNLoop = LI.getLoopFor(Parent); if (!PNLoop || Parent != PNLoop->getHeader()) { // Split the critical edge. - BasicBlock *NewBB = 0; + BasicBlock *NewBB = nullptr; if (!Parent->isLandingPad()) { NewBB = SplitCriticalEdge(BB, Parent, P, /*MergeIdenticalEdges=*/true, @@ -4600,7 +4757,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, } std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = - Inserted.insert(std::make_pair(BB, static_cast<Value *>(0))); + Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr))); if (!Pair.second) PN->setIncomingValue(i, Pair.first->second); else { @@ -4707,9 +4864,10 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, LSRInstance::LSRInstance(Loop *L, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()), + DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), + LI(P->getAnalysis<LoopInfo>()), TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), - IVIncInsertPos(0) { + IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4746,7 +4904,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) #endif // DEBUG DEBUG(dbgs() << "\nLSR on loop "; - WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); + L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false); dbgs() << ":\n"); // First, perform some low-level loop optimizations. @@ -4876,8 +5034,8 @@ public: LoopStrengthReduce(); private: - bool runOnLoop(Loop *L, LPPassManager &LPM); - void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; }; } @@ -4886,7 +5044,7 @@ char LoopStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -4911,8 +5069,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); - AU.addRequired<DominatorTree>(); - AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); // Requiring LoopSimplify a second time here prevents IVUsers from running @@ -4924,6 +5082,9 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { + if (skipOptnoneFunction(L)) + return false; + bool Changed = false; // Run the main LSR transformation. 
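The Cost::operator< change earlier in this diff collapses a chain of field-by-field comparisons into a single std::tie expression; both forms order costs lexicographically, with the first differing field deciding. A self-contained sketch, using a hypothetical ToyCost in place of the real Cost class:

#include <cstdio>
#include <tuple>

// Hypothetical reduced version of LSR's Cost, for illustration only.
struct ToyCost {
  unsigned NumRegs, AddRecCost, NumIVMuls, NumBaseAdds;
  unsigned ScaleCost, ImmCost, SetupCost;

  // std::tie builds tuples of references; tuple's operator< compares them
  // lexicographically, i.e., the first differing field decides.
  bool operator<(const ToyCost &Other) const {
    return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                    ImmCost, SetupCost) <
           std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
                    Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
                    Other.SetupCost);
  }
};

int main() {
  ToyCost A{2, 1, 0, 0, 0, 0, 0};
  ToyCost B{2, 1, 0, 1, 0, 0, 0}; // identical up to NumBaseAdds
  std::printf("A < B: %d\n", A < B ? 1 : 0); // prints 1
  return 0;
}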
@@ -4937,10 +5098,9 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif - unsigned numFolded = - Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), - DeadInsts, - &getAnalysis<TargetTransformInfo>()); + unsigned numFolded = Rewriter.replaceCongruentIVs( + L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts, + &getAnalysis<TargetTransformInfo>()); if (numFolded) { Changed = true; DeleteTriviallyDeadInstructions(DeadInsts); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 08ac38dec5dd..935f289f040f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -12,14 +12,16 @@ // counts of loops easily. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -28,13 +30,16 @@ using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + static cl::opt<unsigned> UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, cl::desc("The cut-off point for automatic loop unrolling")); static cl::opt<unsigned> UnrollCount("unroll-count", cl::init(0), cl::Hidden, - cl::desc("Use this unroll count for all loops, for testing purposes")); + cl::desc("Use this unroll count for all loops including those with " + "unroll_count pragma values, for testing purposes")); static cl::opt<bool> UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, @@ -45,6 +50,11 @@ static cl::opt<bool> UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); +static cl::opt<unsigned> +PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, + cl::desc("Unrolled size limit for loops with an unroll(enable) or " + "unroll_count pragma.")); + namespace { class LoopUnroll : public LoopPass { public: @@ -86,12 +96,12 @@ namespace { bool UserAllowPartial; // CurrentAllowPartial is user-specified. bool UserRuntime; // CurrentRuntime is user-specified. - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -105,7 +115,67 @@ namespace { // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. // For now, recreate dom info, if loop is unrolled. - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + // Fill in the UnrollingPreferences parameter with values from the + // TargetTransformationInfo. 
+    void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,
+                                 TargetTransformInfo::UnrollingPreferences &UP) {
+      UP.Threshold = CurrentThreshold;
+      UP.OptSizeThreshold = OptSizeUnrollThreshold;
+      UP.PartialThreshold = CurrentThreshold;
+      UP.PartialOptSizeThreshold = OptSizeUnrollThreshold;
+      UP.Count = CurrentCount;
+      UP.MaxCount = UINT_MAX;
+      UP.Partial = CurrentAllowPartial;
+      UP.Runtime = CurrentRuntime;
+      TTI.getUnrollingPreferences(L, UP);
+    }
+
+    // Select and return an unroll count based on parameters from
+    // user, unroll preferences, unroll pragmas, or a heuristic.
+    // SetExplicitly is set to true if the unroll count is set by
+    // the user or a pragma rather than selected heuristically.
+    unsigned
+    selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma,
+                      unsigned PragmaCount,
+                      const TargetTransformInfo::UnrollingPreferences &UP,
+                      bool &SetExplicitly);
+
+
+    // Select threshold values used to limit unrolling based on a
+    // total unrolled size. Parameters Threshold and PartialThreshold
+    // are set to the maximum unrolled size for fully and partially
+    // unrolled loops respectively.
+    void selectThresholds(const Loop *L, bool HasPragma,
+                          const TargetTransformInfo::UnrollingPreferences &UP,
+                          unsigned &Threshold, unsigned &PartialThreshold) {
+      // Determine the current unrolling threshold. While this is
+      // normally set from UnrollThreshold, it is overridden to a
+      // smaller value if the current function is marked as
+      // optimize-for-size, and the unroll threshold was not user
+      // specified.
+      Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
+      PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
+      if (!UserThreshold &&
+          L->getHeader()->getParent()->getAttributes().
+          hasAttribute(AttributeSet::FunctionIndex,
+                       Attribute::OptimizeForSize)) {
+        Threshold = UP.OptSizeThreshold;
+        PartialThreshold = UP.PartialOptSizeThreshold;
+      }
+      if (HasPragma) {
+        // If the loop has an unrolling pragma, we want to be more
+        // aggressive with unrolling limits. Set thresholds to at
+        // least the PragmaUnrollThreshold value, which is larger than the
+        // default limits.
+        if (Threshold != NoThreshold)
+          Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold);
+        if (PartialThreshold != NoThreshold)
+          PartialThreshold =
+            std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold);
+      }
     }
   };
 }
@@ -124,6 +194,10 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
   return new LoopUnroll(Threshold, Count, AllowPartial, Runtime);
 }
 
+Pass *llvm::createSimpleLoopUnrollPass() {
+  return llvm::createLoopUnrollPass(-1, -1, 0, 0);
+}
+
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
                                     bool &NotDuplicatable,
@@ -145,7 +219,144 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
   return LoopSize;
 }
 
+// Returns the value associated with the given metadata node name (for
+// example, "llvm.loop.unroll.count"). If no such named metadata node
+// exists, then nullptr is returned.
+static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
+                                                 StringRef Name) {
+  MDNode *LoopID = L->getLoopID();
+  if (!LoopID) return nullptr;
+
+  // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) continue; + + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) continue; + + if (Name.equals(S->getString())) { + assert(MD->getNumOperands() == 2 && + "Unroll hint metadata should have two operands."); + return cast<ConstantInt>(MD->getOperand(1)); + } + } + return nullptr; +} + +// Returns true if the loop has an unroll(enable) pragma. +static bool HasUnrollEnablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); + return (EnableValue && EnableValue->getZExtValue()); +} + +// Returns true if the loop has an unroll(disable) pragma. +static bool HasUnrollDisablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); + return (EnableValue && !EnableValue->getZExtValue()); +} + +// If loop has an unroll_count pragma return the (necessarily +// positive) value from the pragma. Otherwise return 0. +static unsigned UnrollCountPragmaValue(const Loop *L) { + const ConstantInt *CountValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.count"); + if (CountValue) { + unsigned Count = CountValue->getZExtValue(); + assert(Count >= 1 && "Unroll count must be positive."); + return Count; + } + return 0; +} + +// Remove existing unroll metadata and add unroll disable metadata to +// indicate the loop has already been unrolled. This prevents a loop +// from being unrolled more than is directed by a pragma if the loop +// unrolling pass is run more than once (which it generally is). +static void SetLoopAlreadyUnrolled(Loop *L) { + MDNode *LoopID = L->getLoopID(); + if (!LoopID) return; + + // First remove any existing loop unrolling metadata. + SmallVector<Value *, 4> Vals; + // Reserve first location for self reference to the LoopID metadata node. + Vals.push_back(nullptr); + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + bool IsUnrollMetadata = false; + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + } + if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + } + + // Add unroll(disable) metadata to disable future unrolling. + LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Value *, 2> DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.enable")); + DisableOperands.push_back(ConstantInt::get(Type::getInt1Ty(Context), 0)); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + Vals.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, Vals); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + LoopID->replaceAllUsesWith(NewLoopID); +} + +unsigned LoopUnroll::selectUnrollCount( + const Loop *L, unsigned TripCount, bool HasEnablePragma, + unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, + bool &SetExplicitly) { + SetExplicitly = true; + + // User-specified count (either as a command-line option or + // constructor parameter) has highest precedence. + unsigned Count = UserCount ? 
CurrentCount : 0; + + // If there is no user-specified count, unroll pragmas have the next + // highest precendence. + if (Count == 0) { + if (PragmaCount) { + Count = PragmaCount; + } else if (HasEnablePragma) { + // unroll(enable) pragma without an unroll_count pragma + // indicates to unroll loop fully. + Count = TripCount; + } + } + + if (Count == 0) + Count = UP.Count; + + if (Count == 0) { + SetExplicitly = false; + if (TripCount == 0) + // Runtime trip count. + Count = UnrollRuntimeCount; + else + // Conservative heuristic: if we know the trip count, see if we can + // completely unroll (subject to the threshold, checked below); otherwise + // try to find greatest modulo of the trip count which is still under + // threshold value. + Count = TripCount; + } + if (TripCount && Count > TripCount) + return TripCount; + return Count; +} + bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + LoopInfo *LI = &getAnalysis<LoopInfo>(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); @@ -153,26 +364,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); - (void)Header; + + if (HasUnrollDisablePragma(L)) { + return false; + } + bool HasEnablePragma = HasUnrollEnablePragma(L); + unsigned PragmaCount = UnrollCountPragmaValue(L); + bool HasPragma = HasEnablePragma || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; - UP.Threshold = CurrentThreshold; - UP.OptSizeThreshold = OptSizeUnrollThreshold; - UP.Count = CurrentCount; - UP.Partial = CurrentAllowPartial; - UP.Runtime = CurrentRuntime; - TTI.getUnrollingPreferences(L, UP); - - // Determine the current unrolling threshold. While this is normally set - // from UnrollThreshold, it is overridden to a smaller value if the current - // function is marked as optimize-for-size, and the unroll threshold was - // not user specified. - unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; - if (!UserThreshold && - Header->getParent()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) - Threshold = UP.OptSizeThreshold; + getUnrollingPreferences(L, TTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -186,74 +387,121 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } - bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime; + // Select an initial unroll count. This may be reduced later based + // on size thresholds. + bool CountSetExplicitly; + unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount, + UP, CountSetExplicitly); + + unsigned NumInlineCandidates; + bool notDuplicatable; + unsigned LoopSize = + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + uint64_t UnrolledSize = (uint64_t)LoopSize * Count; + if (notDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" + << " instructions.\n"); + return false; + } + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } - // Use a default unroll-count if the user doesn't specify a value - // and the trip count is a run-time value. 
The default is different - // for run-time or compile-time trip count loops. - unsigned Count = UserCount ? CurrentCount : UP.Count; - if (Runtime && Count == 0 && TripCount == 0) - Count = UnrollRuntimeCount; + unsigned Threshold, PartialThreshold; + selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold); - if (Count == 0) { - // Conservative heuristic: if we know the trip count, see if we can - // completely unroll (subject to the threshold, checked below); otherwise - // try to find greatest modulo of the trip count which is still under - // threshold value. - if (TripCount == 0) - return false; - Count = TripCount; + // Given Count, TripCount and thresholds determine the type of + // unrolling which is to be performed. + enum { Full = 0, Partial = 1, Runtime = 2 }; + int Unrolling; + if (TripCount && Count == TripCount) { + if (Threshold != NoThreshold && UnrolledSize > Threshold) { + DEBUG(dbgs() << " Too large to fully unroll with count: " << Count + << " because size: " << UnrolledSize << ">" << Threshold + << "\n"); + Unrolling = Partial; + } else { + Unrolling = Full; + } + } else if (TripCount && Count < TripCount) { + Unrolling = Partial; + } else { + Unrolling = Runtime; } - // Enforce the threshold. - if (Threshold != NoThreshold) { - unsigned NumInlineCandidates; - bool notDuplicatable; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, - notDuplicatable, TTI); - DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - if (notDuplicatable) { - DEBUG(dbgs() << " Not unrolling loop which contains non duplicatable" - << " instructions.\n"); + // Reduce count based on the type of unrolling and the threshold values. + unsigned OriginalCount = Count; + bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime; + if (Unrolling == Partial) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); return false; } - if (NumInlineCandidates != 0) { - DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. + Count = PartialThreshold / LoopSize; + while (Count != 0 && TripCount % Count != 0) + Count--; + } + } else if (Unrolling == Runtime) { + if (!AllowRuntime && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " + << "-unroll-runtime not given\n"); return false; } - uint64_t Size = (uint64_t)LoopSize*Count; - if (TripCount != 1 && Size > Threshold) { - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << Size << ">" << Threshold << "\n"); - bool AllowPartial = UserAllowPartial ? 
CurrentAllowPartial : UP.Partial; - if (!AllowPartial && !(Runtime && TripCount == 0)) { - DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - return false; - } - if (TripCount) { - // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = Threshold / LoopSize; - while (Count != 0 && TripCount%Count != 0) - Count--; - } - else if (Runtime) { - // Reduce unroll count to be a lower power-of-two value - while (Count != 0 && Size > Threshold) { - Count >>= 1; - Size = LoopSize*Count; - } - } - if (Count < 2) { - DEBUG(dbgs() << " could not unroll partially\n"); - return false; + // Reduce unroll count to be the largest power-of-two factor of + // the original count which satisfies the threshold limit. + while (Count != 0 && UnrolledSize > PartialThreshold) { + Count >>= 1; + UnrolledSize = LoopSize * Count; + } + if (Count > UP.MaxCount) + Count = UP.MaxCount; + DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } + + if (HasPragma) { + // Mark loop as unrolled to prevent unrolling beyond that + // requested by the pragma. + SetLoopAlreadyUnrolled(L); + + // Emit optimization remarks if we are unable to unroll the loop + // as directed by a pragma. + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = Header->getParent(); + LLVMContext &Ctx = F->getContext(); + if (HasEnablePragma && PragmaCount == 0) { + if (TripCount && Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because unrolled size is too large."); + } else if (!TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because loop has a runtime trip count."); } - DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } else if (PragmaCount > 0 && Count != OriginalCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop the number of times directed by " + "unroll_count pragma because unrolled size is too large."); } } + if (Unrolling != Full && Count < 2) { + // Partial unrolling by 1 is a nop. For full unrolling, a factor + // of 1 makes sense because loop control can be eliminated. + return false; + } + // Unroll the loop. 
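For reference, a standalone recap (toy arithmetic, illustrative names) of the two count reductions applied just above: partial unrolling shrinks Count to a divisor of the trip count that fits PartialThreshold, while runtime unrolling repeatedly halves Count until the unrolled body fits; the clamp to UP.MaxCount is left out here. The UnrollLoop call below then performs the actual transformation with the final Count.

    #include <cassert>
    #include <cstdint>

    // Partial unrolling: shrink Count until LoopSize * Count fits the threshold
    // and Count evenly divides the trip count, so no remainder loop is needed.
    unsigned reducePartialCount(unsigned Count, unsigned TripCount,
                                unsigned LoopSize, unsigned PartialThreshold) {
      if ((uint64_t)LoopSize * Count > PartialThreshold) {
        Count = PartialThreshold / LoopSize;
        while (Count != 0 && TripCount % Count != 0)
          --Count;
      }
      return Count;
    }

    // Runtime unrolling: halve Count until the unrolled body fits the threshold.
    unsigned reduceRuntimeCount(unsigned Count, unsigned LoopSize,
                                unsigned PartialThreshold) {
      uint64_t UnrolledSize = (uint64_t)LoopSize * Count;
      while (Count != 0 && UnrolledSize > PartialThreshold) {
        Count >>= 1;
        UnrolledSize = (uint64_t)LoopSize * Count;
      }
      return Count;
    }

    int main() {
      assert(reducePartialCount(8, 12, 10, 50) == 4);  // 4 divides 12, 4 * 10 <= 50
      assert(reduceRuntimeCount(8, 10, 50) == 4);      // halved once: 4 * 10 <= 50
    }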
- if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM)) return false; return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index c4ebfd5f413d..977c53a3bc63 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -26,13 +26,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unswitch" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -40,6 +38,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" @@ -53,6 +52,8 @@ #include <set> using namespace llvm; +#define DEBUG_TYPE "loop-unswitch" + STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); STATISTIC(NumSelects , "Number of selects unswitched"); @@ -96,7 +97,7 @@ namespace { public: LUAnalysisCache() : - CurLoopInstructions(0), CurrentLoopProperties(0), + CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr), MaxSize(Threshold) {} @@ -151,44 +152,35 @@ namespace { static char ID; // Pass ID, replacement for typeid explicit LoopUnswitch(bool Os = false) : LoopPass(ID), OptimizeForSize(Os), redoLoop(false), - currentLoop(0), DT(0), loopHeader(0), - loopPreheader(0) { + currentLoop(nullptr), DT(nullptr), loopHeader(nullptr), + loopPreheader(nullptr) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool processCurrentLoop(); /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); AU.addRequired<TargetTransformInfo>(); } private: - virtual void releaseMemory() { + void releaseMemory() override { BranchesInfo.forgetLoop(currentLoop); } - /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist, - /// remove it. 
- void RemoveLoopFromWorklist(Loop *L) { - std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(), - LoopProcessWorklist.end(), L); - if (I != LoopProcessWorklist.end()) - LoopProcessWorklist.erase(I); - } - void initLoopData() { loopHeader = currentLoop->getHeader(); loopPreheader = currentLoop->getLoopPreheader(); @@ -212,9 +204,8 @@ namespace { Instruction *InsertPt); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); - void RemoveLoopFromHierarchy(Loop *L); - bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, - BasicBlock **LoopExit = 0); + bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr, + BasicBlock **LoopExit = nullptr); }; } @@ -225,7 +216,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { LoopPropsMapIt PropsIt; bool Inserted; - llvm::tie(PropsIt, Inserted) = + std::tie(PropsIt, Inserted) = LoopsProperties.insert(std::make_pair(L, LoopProperties())); LoopProperties &Props = PropsIt->second; @@ -283,8 +274,8 @@ void LUAnalysisCache::forgetLoop(const Loop *L) { LoopsProperties.erase(LIt); } - CurrentLoopProperties = 0; - CurLoopInstructions = 0; + CurrentLoopProperties = nullptr; + CurLoopInstructions = nullptr; } // Mark case value as unswitched. @@ -355,10 +346,10 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { // We can never unswitch on vector conditions. if (Cond->getType()->isVectorTy()) - return 0; + return nullptr; // Constants should be folded, not unswitched on! - if (isa<Constant>(Cond)) return 0; + if (isa<Constant>(Cond)) return nullptr; // TODO: Handle: br (VARIANT|INVARIANT). @@ -378,13 +369,18 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { return RHS; } - return 0; + return nullptr; } bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { + if (skipOptnoneFunction(L)) + return false; + LI = &getAnalysis<LoopInfo>(); LPM = &LPM_Ref; - DT = getAnalysisIfAvailable<DominatorTree>(); + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; currentLoop = L; Function *F = currentLoop->getHeader()->getParent(); bool Changed = false; @@ -397,7 +393,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { if (Changed) { // FIXME: Reconstruct dom info, because it is not preserved properly. if (DT) - DT->runOnFunction(*F); + DT->recalculate(*F); } return Changed; } @@ -456,7 +452,7 @@ bool LoopUnswitch::processCurrentLoop() { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? - Constant *UnswitchVal = 0; + Constant *UnswitchVal = nullptr; // Do not process same value again and again. // At this point we have some cases already unswitched and @@ -513,7 +509,7 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, if (!L->contains(BB)) { // Otherwise, this is a loop exit, this is fine so long as this is the // first exit. - if (ExitBB != 0) return false; + if (ExitBB) return false; ExitBB = BB; return true; } @@ -540,10 +536,10 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; Visited.insert(L->getHeader()); // Branches to header make infinite loops. 
- BasicBlock *ExitBB = 0; + BasicBlock *ExitBB = nullptr; if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited)) return ExitBB; - return 0; + return nullptr; } /// IsTrivialUnswitchCondition - Check to see if this unswitch condition is @@ -564,7 +560,7 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, TerminatorInst *HeaderTerm = Header->getTerminator(); LLVMContext &Context = Header->getContext(); - BasicBlock *LoopExitBB = 0; + BasicBlock *LoopExitBB = nullptr; if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { // If the header block doesn't end with a conditional branch on Cond, we // can't handle it. @@ -634,8 +630,8 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { Function *F = loopHeader->getParent(); - Constant *CondVal = 0; - BasicBlock *ExitBlock = 0; + Constant *CondVal = nullptr; + BasicBlock *ExitBlock = nullptr; if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { // If the condition is trivial, always unswitch. There is no code growth @@ -934,9 +930,8 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, Worklist.push_back(Use); // Add users to the worklist which may be simplified now. - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - Worklist.push_back(cast<Instruction>(*UI)); + for (User *U : I->users()) + Worklist.push_back(cast<Instruction>(U)); LPM->deleteSimpleAnalysisValue(I, L); RemoveFromWorklist(I, Worklist); I->replaceAllUsesWith(V); @@ -944,17 +939,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -/// RemoveLoopFromHierarchy - We have discovered that the specified loop has -/// become unwrapped, either because the backedge was deleted, or because the -/// edge into the header was removed. If the edge into the header from the -/// latch block was removed, the loop is unwrapped but subloops are still alive, -/// so they just reparent loops. If the loops are actually dead, they will be -/// removed later. -void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) { - LPM->deleteLoopFromQueue(L); - RemoveLoopFromWorklist(L); -} - // RewriteLoopBodyWithConditionConstant - We know either that the value LIC has // the value specified by Val in the specified loop, or we know it does NOT have // that value. Rewrite any uses of LIC or of properties correlated to it. @@ -986,12 +970,11 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), !cast<ConstantInt>(Val)->getZExtValue()); - for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); - UI != E; ++UI) { - Instruction *U = dyn_cast<Instruction>(*UI); - if (!U || !L->contains(U)) + for (User *U : LIC->users()) { + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI || !L->contains(UI)) continue; - Worklist.push_back(U); + Worklist.push_back(UI); } for (std::vector<Instruction*>::iterator UI = Worklist.begin(), @@ -1005,20 +988,19 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // Otherwise, we don't know the precise value of LIC, but we do know that it // is certainly NOT "Val". As such, simplify any uses in the loop that we // can. This case occurs when we unswitch switch statements. 
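As a plain-C++ illustration of the situation the comment above describes (toy containers, not LLVM's SwitchInst API): once a copy of the loop only runs when the unswitched selector is known not to equal Val, the switch case for Val can never fire there, so it is dead. The pass expresses this through findCaseValue on the real switch; this sketch only shows why the case is removable.

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    using ToyCases = std::vector<std::pair<int, std::string>>;  // case value -> successor

    // Drop the case matching the value the loop copy is known not to take.
    void removeDeadCase(ToyCases &Cases, int Val) {
      Cases.erase(std::remove_if(Cases.begin(), Cases.end(),
                                 [Val](const auto &C) { return C.first == Val; }),
                  Cases.end());
    }

    int main() {
      ToyCases Cases = {{0, "bb0"}, {3, "bb3"}, {7, "bb7"}};
      removeDeadCase(Cases, 3);   // this copy is only reached when selector != 3
      assert(Cases.size() == 2);
    }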
- for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); - UI != E; ++UI) { - Instruction *U = dyn_cast<Instruction>(*UI); - if (!U || !L->contains(U)) + for (User *U : LIC->users()) { + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI || !L->contains(UI)) continue; - Worklist.push_back(U); + Worklist.push_back(UI); // TODO: We could do other simplifications, for example, turning // 'icmp eq LIC, Val' -> false. // If we know that LIC is not Val, use this info to simplify code. - SwitchInst *SI = dyn_cast<SwitchInst>(U); - if (SI == 0 || !isa<ConstantInt>(Val)) continue; + SwitchInst *SI = dyn_cast<SwitchInst>(UI); + if (!SI || !isa<ConstantInt>(Val)) continue; SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); // Default case is live for multiple values. diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 8ced4946c832..3314e1ed41ab 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +19,8 @@ #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "loweratomic" + static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { IRBuilder<> Builder(CXI->getParent(), CXI); Value *Ptr = CXI->getPointerOperand(); @@ -31,7 +32,10 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - CXI->replaceAllUsesWith(Orig); + Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0); + Res = Builder.CreateInsertValue(Res, Equal, 1); + + CXI->replaceAllUsesWith(Res); CXI->eraseFromParent(); return true; } @@ -42,7 +46,7 @@ static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { Value *Val = RMWI->getValOperand(); LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Res = NULL; + Value *Res = nullptr; switch (RMWI->getOperation()) { default: llvm_unreachable("Unexpected RMW operation"); @@ -111,7 +115,9 @@ namespace { LowerAtomic() : BasicBlockPass(ID) { initializeLowerAtomicPass(*PassRegistry::getPassRegistry()); } - bool runOnBasicBlock(BasicBlock &BB) { + bool runOnBasicBlock(BasicBlock &BB) override { + if (skipOptnoneFunction(BB)) + return false; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { Instruction *Inst = DI++; diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9912d3dafed3..7c184a4ad2c3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -12,27 +12,28 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include 
"llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; +#define DEBUG_TYPE "memcpyopt" + STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); @@ -49,7 +50,7 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, int64_t Offset = 0; for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (OpC == 0) + if (!OpC) return VariableIdxFound = true; if (OpC->isZero()) continue; // No offset. @@ -75,6 +76,13 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, const DataLayout &TD) { Ptr1 = Ptr1->stripPointerCasts(); Ptr2 = Ptr2->stripPointerCasts(); + + // Handle the trivial case first. + if (Ptr1 == Ptr2) { + Offset = 0; + return true; + } + GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); @@ -82,12 +90,12 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // If one pointer is a GEP and the other isn't, then see if the GEP is a // constant offset from the base, as in "P" and "gep P, 1". - if (GEP1 && GEP2 == 0 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { + if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); return !VariableIdxFound; } - if (GEP2 && GEP1 == 0 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { + if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); return !VariableIdxFound; } @@ -195,9 +203,9 @@ class MemsetRanges { /// because each element is relatively large and expensive to copy. 
std::list<MemsetRange> Ranges; typedef std::list<MemsetRange>::iterator range_iterator; - const DataLayout &TD; + const DataLayout &DL; public: - MemsetRanges(const DataLayout &td) : TD(td) {} + MemsetRanges(const DataLayout &DL) : DL(DL) {} typedef std::list<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } @@ -212,7 +220,7 @@ public: } void addStore(int64_t OffsetFromFirst, StoreInst *SI) { - int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType()); + int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType()); addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(), SI->getAlignment(), SI); @@ -305,23 +313,23 @@ namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; TargetLibraryInfo *TLI; - const DataLayout *TD; + const DataLayout *DL; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); - MD = 0; - TLI = 0; - TD = 0; + MD = nullptr; + TLI = nullptr; + DL = nullptr; } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; private: // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); AU.addRequired<TargetLibraryInfo>(); @@ -353,7 +361,7 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) @@ -366,13 +374,13 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", /// attempts to merge them together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { - if (TD == 0) return 0; + if (!DL) return nullptr; // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. - MemsetRanges Ranges(*TD); + MemsetRanges Ranges(*DL); BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { @@ -396,7 +404,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), - Offset, *TD)) + Offset, *DL)) break; Ranges.addStore(Offset, NextStore); @@ -409,7 +417,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; - if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) + if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *DL)) break; Ranges.addMemSet(Offset, MSI); @@ -419,7 +427,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // If we have no ranges, then we just had a single store with nothing that // could be merged in. 
This is a very common case of course. if (Ranges.empty()) - return 0; + return nullptr; // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something @@ -433,7 +441,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. - Instruction *AMemSet = 0; + Instruction *AMemSet = nullptr; for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; @@ -441,7 +449,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. - if (!Range.isProfitableToUseMemset(*TD)) + if (!Range.isProfitableToUseMemset(*DL)) continue; // Otherwise, we do want to transform this! Create a new memset. @@ -453,7 +461,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Alignment == 0) { Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = TD->getABITypeAlignment(EltType); + Alignment = DL->getABITypeAlignment(EltType); } AMemSet = @@ -484,7 +492,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - if (TD == 0) return false; + if (!DL) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than @@ -493,7 +501,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); - CallInst *C = 0; + CallInst *C = nullptr; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); @@ -505,7 +513,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { for (BasicBlock::iterator I = --BasicBlock::iterator(SI), E = C; I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { - C = 0; + C = nullptr; break; } } @@ -514,15 +522,15 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { unsigned storeAlign = SI->getAlignment(); if (!storeAlign) - storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType()); + storeAlign = DL->getABITypeAlignment(SI->getOperand(0)->getType()); unsigned loadAlign = LI->getAlignment(); if (!loadAlign) - loadAlign = TD->getABITypeAlignment(LI->getType()); + loadAlign = DL->getABITypeAlignment(LI->getType()); bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), - TD->getTypeStoreSize(SI->getOperand(0)->getType()), + DL->getTypeStoreSize(SI->getOperand(0)->getType()), std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); @@ -596,13 +604,13 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; // Check that all of src is copied to dest. 
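The checks that follow implement that requirement as plain size arithmetic; a toy version with illustrative parameter names, where the source size is the alloca's element size times its array size:

    #include <cstdint>

    // The call-slot rewrite is only safe if the memcpy copies all of src and
    // the destination is at least as large as src.
    bool sizesAllowCallSlotOpt(uint64_t SrcElemSize, uint64_t SrcArraySize,
                               uint64_t CopyLen, uint64_t DestSize) {
      uint64_t SrcSize = SrcElemSize * SrcArraySize;
      if (CopyLen < SrcSize)
        return false;            // the call may have written bytes the copy skips
      if (DestSize < SrcSize)
        return false;            // the call could write past the end of dest
      return true;
    }

    int main() {
      // A 16-byte alloca copied in full into a 16-byte destination is acceptable.
      return sizesAllowCallSlotOpt(16, 1, 16, 16) ? 0 : 1;
    }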
- if (TD == 0) return false; + if (!DL) return false; ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize()); if (!srcArraySize) return false; - uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) * + uint64_t srcSize = DL->getTypeAllocSize(srcAlloca->getAllocatedType()) * srcArraySize->getZExtValue(); if (cpyLen < srcSize) @@ -617,7 +625,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (!destArraySize) return false; - uint64_t destSize = TD->getTypeAllocSize(A->getAllocatedType()) * + uint64_t destSize = DL->getTypeAllocSize(A->getAllocatedType()) * destArraySize->getZExtValue(); if (destSize < srcSize) @@ -636,7 +644,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; } - uint64_t destSize = TD->getTypeAllocSize(StructTy); + uint64_t destSize = DL->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { @@ -646,7 +654,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Check that dest points to memory that is at least as aligned as src. unsigned srcAlign = srcAlloca->getAlignment(); if (!srcAlign) - srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType()); + srcAlign = DL->getABITypeAlignment(srcAlloca->getAllocatedType()); bool isDestSufficientlyAligned = srcAlign <= cpyAlign; // If dest is not aligned enough and we can't increase its alignment then // bail out. @@ -657,30 +665,34 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // guarantees that it holds only undefined values when passed in (so the final // memcpy can be dropped), that it is not read or written between the call and // the memcpy, and that writing beyond the end of it is undefined. - SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(), - srcAlloca->use_end()); + SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(), + srcAlloca->user_end()); while (!srcUseList.empty()) { - User *UI = srcUseList.pop_back_val(); + User *U = srcUseList.pop_back_val(); - if (isa<BitCastInst>(UI)) { - for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); - I != E; ++I) - srcUseList.push_back(*I); - } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(UI)) { + if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) { + for (User *UU : U->users()) + srcUseList.push_back(UU); + } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) { if (G->hasAllZeroIndices()) - for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); - I != E; ++I) - srcUseList.push_back(*I); + for (User *UU : U->users()) + srcUseList.push_back(UU); else return false; - } else if (UI != C && UI != cpy) { + } else if (U != C && U != cpy) { return false; } } + // Check that src isn't captured by the called function since the + // transformation can cause aliasing issues in that case. + for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) + if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i)) + return false; + // Since we're changing the parameter to the callsite, we need to make sure // that what would be the new parameter dominates the callsite. - DominatorTree &DT = getAnalysis<DominatorTree>(); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest)) if (!DT.dominates(cpyDestInst, C)) return false; @@ -816,9 +828,8 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, /// circumstances). This allows later passes to remove the first memcpy /// altogether. 
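A plain C++ picture of the memcpy-memcpy forwarding this function performs on IR (ordinary buffers here, not LLVM values): when b was just filled from a, a later copy out of b can read from a instead, which leaves the first copy dead for DSE. The function below also handles the call-slot case and, further down, the newly added case where the source is a fresh alloca or a just-started lifetime, so the copy can be dropped entirely.

    #include <cassert>
    #include <cstring>

    int main() {
      char a[8] = "pattern";
      char b[8];
      char c[8];

      std::memcpy(b, a, sizeof(a));    // first copy (dead once the rewrite happens)
      std::memcpy(c, b, sizeof(b));    // original second copy reads from b

      char c2[8];
      std::memcpy(c2, a, sizeof(a));   // rewritten form reads straight from a

      assert(std::memcmp(c, c2, sizeof(c)) == 0);
    }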
bool MemCpyOpt::processMemCpy(MemCpyInst *M) { - // We can only optimize statically-sized memcpy's that are non-volatile. - ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); - if (CopySize == 0 || M->isVolatile()) return false; + // We can only optimize non-volatile memcpy's. + if (M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { @@ -832,7 +843,7 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { IRBuilder<> Builder(M); - Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize, + Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), M->getAlignment(), false); MD->removeInstruction(M); M->eraseFromParent(); @@ -840,9 +851,16 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return true; } - // The are two possible optimizations we can do for memcpy: + // The optimizations after this point require the memcpy size. + ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); + if (!CopySize) return false; + + // The are three possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. + // c) memcpy from freshly alloca'd space or space that has just started its + // lifetime copies undefined data, and we can therefore eliminate the + // memcpy in favor of the data that was already at the destination. MemDepResult DepInfo = MD->getDependency(M); if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { @@ -862,6 +880,25 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + } else if (SrcDepInfo.isDef()) { + Instruction *I = SrcDepInfo.getInst(); + bool hasUndefContents = false; + + if (isa<AllocaInst>(I)) { + hasUndefContents = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0))) + if (LTSize->getZExtValue() >= CopySize->getZExtValue()) + hasUndefContents = true; + } + + if (hasUndefContents) { + MD->removeInstruction(M); + M->eraseFromParent(); + ++NumMemCpyInstr; + return true; + } } return false; @@ -899,12 +936,12 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { - if (TD == 0) return false; + if (!DL) return false; // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); - uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); + uint64_t ByValSize = DL->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), true, CS.getInstruction(), @@ -916,13 +953,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // a memcpy, see if we can byval from the source of the memcpy instead of the // result. 
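A toy form of the legality tests spelled out in the lines that follow for forwarding a byval argument to the source of the memcpy that filled it (parameter names are illustrative): the copy must cover the whole argument, and the source must be sufficiently aligned, though the pass can sometimes raise the source alignment itself.

    #include <cstdint>

    bool canForwardByVal(uint64_t ByValSize, uint64_t MemcpyLen,
                         uint64_t ByValAlign, uint64_t SrcAlign) {
      if (MemcpyLen < ByValSize)
        return false;              // the copy must cover the whole byval argument
      if (SrcAlign < ByValAlign)
        return false;              // source not aligned enough (and not raised here)
      return true;
    }

    int main() {
      return canForwardByVal(16, 16, 8, 8) ? 0 : 1;
    }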
MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); - if (MDep == 0 || MDep->isVolatile() || + if (!MDep || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); - if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) + if (!C1 || C1->getValue().getZExtValue() < ByValSize) return false; // Get the alignment of the byval. If the call doesn't specify the alignment, @@ -933,7 +970,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. if (MDep->getAlignment() < ByValAlign && - getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) + getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, DL) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and @@ -1007,9 +1044,13 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { // function. // bool MemCpyOpt::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); // If we don't have at least memset and memcpy, there is little point of doing @@ -1024,6 +1065,6 @@ bool MemCpyOpt::runOnFunction(Function &F) { MadeChange = true; } - MD = 0; + MD = nullptr; return MadeChange; } diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp new file mode 100644 index 000000000000..a7e80240d9e7 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -0,0 +1,632 @@ +//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//! \file +//! \brief This pass performs merges of loads and stores on both sides of a +// diamond (hammock). It hoists the loads and sinks the stores. +// +// The algorithm iteratively hoists two loads to the same address out of a +// diamond (hammock) and merges them into a single load in the header. Similar +// it sinks and merges two stores to the tail block (footer). The algorithm +// iterates over the instructions of one side of the diamond and attempts to +// find a matching load/store on the other side. It hoists / sinks when it +// thinks it safe to do so. This optimization helps with eg. hiding load +// latencies, triggering if-conversion, and reducing static code size. 
+// +//===----------------------------------------------------------------------===// +// +// +// Example: +// Diamond shaped code before merge: +// +// header: +// br %cond, label %if.then, label %if.else +// / \ +// / \ +// / \ +// if.then: if.else: +// %lt = load %addr_l %le = load %addr_l +// <use %lt> <use %le> +// <...> <...> +// store %st, %addr_s store %se, %addr_s +// br label %if.end br label %if.end +// \ / +// \ / +// \ / +// if.end ("footer"): +// <...> +// +// Diamond shaped code after merge: +// +// header: +// %l = load %addr_l +// br %cond, label %if.then, label %if.else +// / \ +// / \ +// / \ +// if.then: if.else: +// <use %l> <use %l> +// <...> <...> +// br label %if.end br label %if.end +// \ / +// \ / +// \ / +// if.end ("footer"): +// %s.sink = phi [%st, if.then], [%se, if.else] +// <...> +// store %s.sink, %addr_s +// <...> +// +// +//===----------------------- TODO -----------------------------------------===// +// +// 1) Generalize to regions other than diamonds +// 2) Be more aggressive merging memory operations +// Note that both changes require register pressure control +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> +using namespace llvm; + +#define DEBUG_TYPE "mldst-motion" + +//===----------------------------------------------------------------------===// +// MergedLoadStoreMotion Pass +//===----------------------------------------------------------------------===// +static cl::opt<bool> +EnableMLSM("mlsm", cl::desc("Enable motion of merged load and store"), + cl::init(true)); + +namespace { +class MergedLoadStoreMotion : public FunctionPass { + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; + +public: + static char ID; // Pass identification, replacement for typeid + explicit MergedLoadStoreMotion(void) + : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { + initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + +private: + // This transformation requires dominator postdominator info + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<MemoryDependenceAnalysis>(); + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + } + + // Helper routines + + /// + /// \brief Remove instruction from parent and update memory dependence + /// analysis. 
+ /// + void removeInstruction(Instruction *Inst); + BasicBlock *getDiamondTail(BasicBlock *BB); + bool isDiamondHead(BasicBlock *BB); + // Routines for hoisting loads + bool isLoadHoistBarrier(Instruction *Inst); + LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI); + void hoistInstruction(BasicBlock *BB, Instruction *HoistCand, + Instruction *ElseInst); + bool isSafeToHoist(Instruction *I) const; + bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst); + bool mergeLoads(BasicBlock *BB); + // Routines for sinking stores + StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); + PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); + bool isStoreSinkBarrier(Instruction *Inst); + bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); + bool mergeStores(BasicBlock *BB); + // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, + // where Size0 and Size1 are the #instructions on the two sides of + // the diamond. The constant chosen here is arbitrary. Compiler Time + // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. + const int MagicCompileTimeControl; +}; + +char MergedLoadStoreMotion::ID = 0; +} + +/// +/// \brief createMergedLoadStoreMotionPass - The public interface to this file. +/// +FunctionPass *llvm::createMergedLoadStoreMotionPass() { + return new MergedLoadStoreMotion(); +} + +INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", + "MergedLoadStoreMotion", false, false) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", + "MergedLoadStoreMotion", false, false) + +/// +/// \brief Remove instruction from parent and update memory dependence analysis. +/// +void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) { + // Notify the memory dependence analysis. + if (MD) { + MD->removeInstruction(Inst); + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + MD->invalidateCachedPointerInfo(LI->getPointerOperand()); + if (Inst->getType()->getScalarType()->isPointerTy()) { + MD->invalidateCachedPointerInfo(Inst); + } + } + Inst->eraseFromParent(); +} + +/// +/// \brief Return tail block of a diamond. +/// +BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { + assert(isDiamondHead(BB) && "Basic block is not head of a diamond"); + BranchInst *BI = (BranchInst *)(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); + return Tail; +} + +/// +/// \brief True when BB is the head of a diamond (hammock) +/// +bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { + if (!BB) + return false; + if (!isa<BranchInst>(BB->getTerminator())) + return false; + if (BB->getTerminator()->getNumSuccessors() != 2) + return false; + + BranchInst *BI = (BranchInst *)(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + + if (!Succ0->getSinglePredecessor() || + Succ0->getTerminator()->getNumSuccessors() != 1) + return false; + if (!Succ1->getSinglePredecessor() || + Succ1->getTerminator()->getNumSuccessors() != 1) + return false; + + BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); + // Ignore triangles. 
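A self-contained sketch of the same shape test on a toy CFG (plain structs, not LLVM's BasicBlock): a diamond head ends in a two-way branch whose successors each have exactly one predecessor and one successor and meet again at a common tail, which is what the remaining check below verifies.

    #include <cassert>
    #include <vector>

    struct ToyBlock {
      std::vector<ToyBlock *> Succs;
      std::vector<ToyBlock *> Preds;
    };

    bool isDiamondHead(const ToyBlock *BB) {
      if (!BB || BB->Succs.size() != 2)
        return false;
      const ToyBlock *S0 = BB->Succs[0], *S1 = BB->Succs[1];
      if (S0->Preds.size() != 1 || S0->Succs.size() != 1)
        return false;
      if (S1->Preds.size() != 1 || S1->Succs.size() != 1)
        return false;
      return S0->Succs[0] == S1->Succs[0];   // both arms must rejoin at one tail
    }

    int main() {
      ToyBlock Head, Then, Else, Tail;
      Head.Succs = {&Then, &Else};
      Then.Preds = {&Head};  Then.Succs = {&Tail};
      Else.Preds = {&Head};  Else.Succs = {&Tail};
      Tail.Preds = {&Then, &Else};
      assert(isDiamondHead(&Head));
      Else.Succs = {&Else};                  // arms no longer rejoin: not a diamond
      assert(!isDiamondHead(&Head));
    }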
+ if (Succ1->getTerminator()->getSuccessor(0) != Tail) + return false; + return true; +} + +/// +/// \brief True when instruction is a hoist barrier for a load +/// +/// Whenever an instruction could possibly modify the value +/// being loaded or protect against the load from happening +/// it is considered a hoist barrier. +/// +bool MergedLoadStoreMotion::isLoadHoistBarrier(Instruction *Inst) { + // FIXME: A call with no side effects should not be a barrier. + // Aren't all such calls covered by mayHaveSideEffects() below? + // Then this check can be removed. + if (isa<CallInst>(Inst)) + return true; + if (isa<TerminatorInst>(Inst)) + return true; + // FIXME: Conservatively let a store instruction block the load. + // Use alias analysis instead. + if (isa<StoreInst>(Inst)) + return true; + // Note: mayHaveSideEffects covers all instructions that could + // trigger a change to state. Eg. in-flight stores have to be executed + // before ordered loads or fences, calls could invoke functions that store + // data to memory etc. + if (Inst->mayHaveSideEffects()) { + return true; + } + DEBUG(dbgs() << "No Hoist Barrier\n"); + return false; +} + +/// +/// \brief Decide if a load can be hoisted +/// +/// When there is a load in \p BB to the same address as \p LI +/// and it can be hoisted from \p BB, return that load. +/// Otherwise return Null. +/// +LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB, + LoadInst *LI) { + LoadInst *I = nullptr; + assert(isa<LoadInst>(LI)); + if (LI->isUsedOutsideOfBlock(LI->getParent())) + return nullptr; + + for (BasicBlock::iterator BBI = BB->begin(), BBE = BB->end(); BBI != BBE; + ++BBI) { + Instruction *Inst = BBI; + + // Only merge and hoist loads when their result in used only in BB + if (isLoadHoistBarrier(Inst)) + break; + if (!isa<LoadInst>(Inst)) + continue; + if (Inst->isUsedOutsideOfBlock(Inst->getParent())) + continue; + + AliasAnalysis::Location LocLI = AA->getLocation(LI); + AliasAnalysis::Location LocInst = AA->getLocation((LoadInst *)Inst); + if (AA->isMustAlias(LocLI, LocInst) && LI->getType() == Inst->getType()) { + I = (LoadInst *)Inst; + break; + } + } + return I; +} + +/// +/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into +/// \p BB +/// +/// BB is the head of a diamond +/// +void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, + Instruction *HoistCand, + Instruction *ElseInst) { + DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n"); + // Hoist the instruction. + assert(HoistCand->getParent() != BB); + + // Intersect optional metadata. + HoistCand->intersectOptionalDataWith(ElseInst); + HoistCand->dropUnknownMetadata(); + + // Prepend point for instruction insert + Instruction *HoistPt = BB->getTerminator(); + + // Merged instruction + Instruction *HoistedInst = HoistCand->clone(); + + // Notify AA of the new value. + if (isa<LoadInst>(HoistCand)) + AA->copyValue(HoistCand, HoistedInst); + + // Hoist instruction. + HoistedInst->insertBefore(HoistPt); + + HoistCand->replaceAllUsesWith(HoistedInst); + removeInstruction(HoistCand); + // Replace the else block instruction. 
+ ElseInst->replaceAllUsesWith(HoistedInst); + removeInstruction(ElseInst); +} + +/// +/// \brief Return true if no operand of \p I is defined in I's parent block +/// +bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const { + BasicBlock *Parent = I->getParent(); + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Instruction *Instr = dyn_cast<Instruction>(I->getOperand(i)); + if (Instr && Instr->getParent() == Parent) + return false; + } + return true; +} + +/// +/// \brief Merge two equivalent loads and GEPs and hoist into diamond head +/// +bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0, + LoadInst *L1) { + // Only one definition? + Instruction *A0 = dyn_cast<Instruction>(L0->getPointerOperand()); + Instruction *A1 = dyn_cast<Instruction>(L1->getPointerOperand()); + if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) && + A0->hasOneUse() && (A0->getParent() == L0->getParent()) && + A1->hasOneUse() && (A1->getParent() == L1->getParent()) && + isa<GetElementPtrInst>(A0)) { + DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n"); + hoistInstruction(BB, A0, A1); + hoistInstruction(BB, L0, L1); + return true; + } else + return false; +} + +/// +/// \brief Try to hoist two loads to same address into diamond header +/// +/// Starting from a diamond head block, iterate over the instructions in one +/// successor block and try to match a load in the second successor. +/// +bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { + bool MergedLoads = false; + assert(isDiamondHead(BB)); + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + // #Instructions in Succ1 for Compile Time Control + int Size1 = Succ1->size(); + int NLoads = 0; + for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); + BBI != BBE;) { + + Instruction *I = BBI; + ++BBI; + if (isLoadHoistBarrier(I)) + break; + + // Only move non-simple (atomic, volatile) loads. + if (!isa<LoadInst>(I)) + continue; + + LoadInst *L0 = (LoadInst *)I; + if (!L0->isSimple()) + continue; + + ++NLoads; + if (NLoads * Size1 >= MagicCompileTimeControl) + break; + if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) { + bool Res = hoistLoad(BB, L0, L1); + MergedLoads |= Res; + // Don't attempt to hoist above loads that had not been hoisted. + if (!Res) + break; + } + } + return MergedLoads; +} + +/// +/// \brief True when instruction is sink barrier for a store +/// +bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { + // FIXME: Conservatively let a load instruction block the store. + // Use alias analysis instead. + if (isa<LoadInst>(Inst)) + return true; + if (isa<CallInst>(Inst)) + return true; + if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) + return true; + // Note: mayHaveSideEffects covers all instructions that could + // trigger a change to state. Eg. in-flight stores have to be executed + // before ordered loads or fences, calls could invoke functions that store + // data to memory etc. + if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) { + return true; + } + DEBUG(dbgs() << "No Sink Barrier\n"); + return false; +} + +/// +/// \brief Check if \p BB contains a store to the same address as \p SI +/// +/// \return The store in \p when it is safe to sink. Otherwise return Null. 
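To make the backwards search described above concrete, a toy version with stand-in instruction and alias checks (none of these types are LLVM's): walk the block from the bottom, give up at the first conservative barrier, and report a store to the same address as the candidate.

    #include <string>
    #include <vector>

    struct ToyInst {
      enum Kind { Store, Load, Call, Other } K;
      std::string Addr;                      // address for stores, empty otherwise
    };

    const ToyInst *findSinkableStore(const std::vector<ToyInst> &Block,
                                     const std::string &Addr) {
      for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
        if (I->K == ToyInst::Load || I->K == ToyInst::Call)
          return nullptr;                    // conservative sink barrier
        if (I->K == ToyInst::Store && I->Addr == Addr)
          return &*I;                        // same address: candidate to merge
      }
      return nullptr;
    }

    int main() {
      std::vector<ToyInst> Block = {{ToyInst::Other, ""}, {ToyInst::Store, "p"}};
      return findSinkableStore(Block, "p") ? 0 : 1;
    }

The real routine uses isStoreSinkBarrier and AliasAnalysis must-alias queries in place of the string comparison above.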
+/// +StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB, + StoreInst *SI) { + StoreInst *I = 0; + DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n"); + for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend(); + RBI != RBE; ++RBI) { + Instruction *Inst = &*RBI; + + // Only move loads if they are used in the block. + if (isStoreSinkBarrier(Inst)) + break; + if (isa<StoreInst>(Inst)) { + AliasAnalysis::Location LocSI = AA->getLocation(SI); + AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst); + if (AA->isMustAlias(LocSI, LocInst)) { + I = (StoreInst *)Inst; + break; + } + } + } + return I; +} + +/// +/// \brief Create a PHI node in BB for the operands of S0 and S1 +/// +PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { + // Create a phi if the values mismatch. + PHINode *NewPN = 0; + Value *Opd1 = S0->getValueOperand(); + Value *Opd2 = S1->getValueOperand(); + if (Opd1 != Opd2) { + NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", + BB->begin()); + NewPN->addIncoming(Opd1, S0->getParent()); + NewPN->addIncoming(Opd2, S1->getParent()); + if (NewPN->getType()->getScalarType()->isPointerTy()) { + // Notify AA of the new value. + AA->copyValue(Opd1, NewPN); + AA->copyValue(Opd2, NewPN); + // AA needs to be informed when a PHI-use of the pointer value is added + for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { + unsigned J = PHINode::getOperandNumForIncomingValue(I); + AA->addEscapingUse(NewPN->getOperandUse(J)); + } + if (MD) + MD->invalidateCachedPointerInfo(NewPN); + } + } + return NewPN; +} + +/// +/// \brief Merge two stores to same address and sink into \p BB +/// +/// Also sinks GEP instruction computing the store address +/// +bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { + // Only one definition? + Instruction *A0 = dyn_cast<Instruction>(S0->getPointerOperand()); + Instruction *A1 = dyn_cast<Instruction>(S1->getPointerOperand()); + if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && + (A0->getParent() == S0->getParent()) && A1->hasOneUse() && + (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) { + DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + // Hoist the instruction. + BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); + // Intersect optional metadata. + S0->intersectOptionalDataWith(S1); + S0->dropUnknownMetadata(); + + // Create the new store to be inserted at the join point. + StoreInst *SNew = (StoreInst *)(S0->clone()); + Instruction *ANew = A0->clone(); + AA->copyValue(S0, SNew); + SNew->insertBefore(InsertPt); + ANew->insertBefore(SNew); + + assert(S0->getParent() == A0->getParent()); + assert(S1->getParent() == A1->getParent()); + + PHINode *NewPN = getPHIOperand(BB, S0, S1); + // New PHI operand? Use it. + if (NewPN) + SNew->setOperand(0, NewPN); + removeInstruction(S0); + removeInstruction(S1); + A0->replaceAllUsesWith(ANew); + removeInstruction(A0); + A1->replaceAllUsesWith(ANew); + removeInstruction(A1); + return true; + } + return false; +} + +/// +/// \brief True when two stores are equivalent and can sink into the footer +/// +/// Starting from a diamond tail block, iterate over the instructions in one +/// predecessor block and try to match a store in the second predecessor. 
+/// +bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { + + bool MergedStores = false; + assert(T && "Footer of a diamond cannot be empty"); + + pred_iterator PI = pred_begin(T), E = pred_end(T); + assert(PI != E); + BasicBlock *Pred0 = *PI; + ++PI; + BasicBlock *Pred1 = *PI; + ++PI; + // tail block of a diamond/hammock? + if (Pred0 == Pred1) + return false; // No. + if (PI != E) + return false; // No. More than 2 predecessors. + + // #Instructions in Succ1 for Compile Time Control + int Size1 = Pred1->size(); + int NStores = 0; + + for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend(); + RBI != RBE;) { + + Instruction *I = &*RBI; + ++RBI; + if (isStoreSinkBarrier(I)) + break; + // Sink move non-simple (atomic, volatile) stores + if (!isa<StoreInst>(I)) + continue; + StoreInst *S0 = (StoreInst *)I; + if (!S0->isSimple()) + continue; + + ++NStores; + if (NStores * Size1 >= MagicCompileTimeControl) + break; + if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) { + bool Res = sinkStore(T, S0, S1); + MergedStores |= Res; + // Don't attempt to sink below stores that had to stick around + // But after removal of a store and some of its feeding + // instruction search again from the beginning since the iterator + // is likely stale at this point. + if (!Res) + break; + else { + RBI = Pred0->rbegin(); + RBE = Pred0->rend(); + DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); + } + } + } + return MergedStores; +} +/// +/// \brief Run the transformation for each function +/// +bool MergedLoadStoreMotion::runOnFunction(Function &F) { + MD = &getAnalysis<MemoryDependenceAnalysis>(); + AA = &getAnalysis<AliasAnalysis>(); + + bool Changed = false; + if (!EnableMLSM) + return false; + DEBUG(dbgs() << "Instruction Merger\n"); + + // Merge unconditional branches, allowing PRE to catch more + // optimization opportunities. + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { + BasicBlock *BB = FI++; + + // Hoist equivalent loads and sink stores + // outside diamonds when possible + // Run outside core GVN + if (isDiamondHead(BB)) { + Changed |= mergeLoads(BB); + Changed |= mergeStores(getDiamondTail(BB)); + } + } + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 15cee44e13dd..7cce89e0627e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "partially-inline-libcalls" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Intrinsics.h" @@ -25,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "partially-inline-libcalls" + namespace { class PartiallyInlineLibCalls : public FunctionPass { public: @@ -35,8 +36,8 @@ namespace { initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry()); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool runOnFunction(Function &F); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; private: /// Optimize calls to sqrt. 
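[Editorial note] The new MergedLoadStoreMotion pass added above operates on "diamond" CFGs: a conditional branch whose two arms rejoin at a common tail block. As a rough illustration only (this sketch is not taken from the patch; the names are hypothetical), a C++ function like the following compiles to such a diamond, with a load of the same address in both arms that the pass can hoist into the head block, and a store to the same address in both arms that it can sink into the tail behind a PHI of the stored values:

// Hypothetical example: both arms read p->in from the same address and
// write p->out to the same address. MergedLoadStoreMotion can hoist the
// common load above the branch and sink the two stores below the join,
// selecting the stored value with a PHI node.
struct Pair { int in; int out; };

int diamond(Pair *p, bool flag) {
  if (flag)
    p->out = p->in + 1;   // load p->in, store p->out (then-arm)
  else
    p->out = p->in - 1;   // same load address, same store address (else-arm)
  return p->out;
}

After the transformation only the add/sub remains under the branch; the single load and the single store execute unconditionally, which is what exposes further PRE/GVN opportunities as described in runOnFunction above.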
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index 328a9c5755b3..ea2cf7cf9b5f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -20,29 +20,29 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reassociate" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Assembly/Writer.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "reassociate" + STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); @@ -67,7 +67,7 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { dbgs() << "[ "; - WriteAsOperand(dbgs(), Ops[i].Op, false, M); + Ops[i].Op->printAsOperand(dbgs(), false, M); dbgs() << ", #" << Ops[i].Rank << "] "; } } @@ -123,14 +123,14 @@ namespace { public: XorOpnd(Value *V); - bool isInvalid() const { return SymbolicPart == 0; } + bool isInvalid() const { return SymbolicPart == nullptr; } bool isOrExpr() const { return isOr; } Value *getValue() const { return OrigVal; } Value *getSymbolicPart() const { return SymbolicPart; } unsigned getSymbolicRank() const { return SymbolicRank; } const APInt &getConstPart() const { return ConstPart; } - void Invalidate() { SymbolicPart = OrigVal = 0; } + void Invalidate() { SymbolicPart = OrigVal = nullptr; } void setSymbolicRank(unsigned R) { SymbolicRank = R; } // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank. @@ -168,9 +168,9 @@ namespace { initializeReassociatePass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } private: @@ -237,7 +237,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { if (V->hasOneUse() && isa<Instruction>(V) && cast<Instruction>(V)->getOpcode() == Opcode) return cast<BinaryOperator>(V); - return 0; + return nullptr; } static bool isUnmovableInstruction(Instruction *I) { @@ -285,7 +285,7 @@ void Reassociate::BuildRankMap(Function &F) { unsigned Reassociate::getRank(Value *V) { Instruction *I = dyn_cast<Instruction>(V); - if (I == 0) { + if (!I) { if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument. return 0; // Otherwise it's a global or constant, rank 0. } @@ -706,7 +706,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // ExpressionChanged - Non-null if the rewritten expression differs from the // original in some non-trivial way, requiring the clearing of optional flags. 
// Flags are cleared from the operator in ExpressionChanged up to I inclusive. - BinaryOperator *ExpressionChanged = 0; + BinaryOperator *ExpressionChanged = nullptr; for (unsigned i = 0; ; ++i) { // The last operation (which comes earliest in the IR) is special as both // operands will come from Ops, rather than just one with the other being @@ -821,7 +821,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, if (ExpressionChanged == I) break; ExpressionChanged->moveBefore(I); - ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin()); + ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin()); } while (1); // Throw away any left over nodes from the original expression. @@ -863,8 +863,7 @@ static Value *NegateValue(Value *V, Instruction *BI) { // Okay, we need to materialize a negated version of V with an instruction. // Scan the use lists of V to see if we have one already. - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ - User *U = *UI; + for (User *U : V->users()) { if (!BinaryOperator::isNeg(U)) continue; // We found one! Now we have to make sure that the definition dominates @@ -914,8 +913,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { isReassociableOp(Sub->getOperand(1), Instruction::Sub)) return true; if (Sub->hasOneUse() && - (isReassociableOp(Sub->use_back(), Instruction::Add) || - isReassociableOp(Sub->use_back(), Instruction::Sub))) + (isReassociableOp(Sub->user_back(), Instruction::Add) || + isReassociableOp(Sub->user_back(), Instruction::Sub))) return true; return false; @@ -997,7 +996,7 @@ static Value *EmitAddTreeOfValues(Instruction *I, /// remove Factor from the tree and return the new tree. Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); - if (!BO) return 0; + if (!BO) return nullptr; SmallVector<RepeatedValue, 8> Tree; MadeChange |= LinearizeExprTree(BO, Tree); @@ -1031,7 +1030,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { if (!FoundFactor) { // Make sure to restore the operands to the expression tree. RewriteExprTree(BO, Factors); - return 0; + return nullptr; } BasicBlock::iterator InsertPt = BO; ++InsertPt; @@ -1116,7 +1115,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode, ++NumAnnihil; } } - return 0; + return nullptr; } /// Helper funciton of CombineXorOpnd(). It creates a bitwise-and @@ -1137,7 +1136,7 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, } return Opnd; } - return 0; + return nullptr; } // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" @@ -1263,7 +1262,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, return V; if (Ops.size() == 1) - return 0; + return nullptr; SmallVector<XorOpnd, 8> Opnds; SmallVector<XorOpnd*, 8> OpndPtrs; @@ -1293,10 +1292,10 @@ Value *Reassociate::OptimizeXor(Instruction *I, // the same symbolic value cluster together. For instance, the input operand // sequence ("x | 123", "y & 456", "x & 789") will be sorted into: // ("x | 123", "x & 789", "y & 456"). 
- std::sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor()); + std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor()); // Step 3: Combine adjacent operands - XorOpnd *PrevOpnd = 0; + XorOpnd *PrevOpnd = nullptr; bool Changed = false; for (unsigned i = 0, e = Opnds.size(); i < e; i++) { XorOpnd *CurrOpnd = OpndPtrs[i]; @@ -1330,7 +1329,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, PrevOpnd = CurrOpnd; } else { CurrOpnd->Invalidate(); - PrevOpnd = 0; + PrevOpnd = nullptr; } Changed = true; } @@ -1360,7 +1359,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, } } - return 0; + return nullptr; } /// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This @@ -1369,11 +1368,10 @@ Value *Reassociate::OptimizeXor(Instruction *I, Value *Reassociate::OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops) { // Scan the operand lists looking for X and -X pairs. If we find any, we - // can simplify the expression. X+-X == 0. While we're at it, scan for any + // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it, + // scan for any // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z. - // - // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1". - // + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *TheOp = Ops[i].Op; // Check to see if we've seen this operand before. If so, we factor all @@ -1413,19 +1411,28 @@ Value *Reassociate::OptimizeAdd(Instruction *I, continue; } - // Check for X and -X in the operand list. - if (!BinaryOperator::isNeg(TheOp)) + // Check for X and -X or X and ~X in the operand list. + if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp)) continue; - Value *X = BinaryOperator::getNegArgument(TheOp); + Value *X = nullptr; + if (BinaryOperator::isNeg(TheOp)) + X = BinaryOperator::getNegArgument(TheOp); + else if (BinaryOperator::isNot(TheOp)) + X = BinaryOperator::getNotArgument(TheOp); + unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX == i) continue; // Remove X and -X from the operand list. - if (Ops.size() == 2) + if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp)) return Constant::getNullValue(X->getType()); + // Remove X and ~X from the operand list. + if (Ops.size() == 2 && BinaryOperator::isNot(TheOp)) + return Constant::getAllOnesValue(X->getType()); + Ops.erase(Ops.begin()+i); if (i < FoundX) --FoundX; @@ -1435,6 +1442,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I, ++NumAnnihil; --i; // Revisit element. e -= 2; // Removed two elements. + + // if X and ~X we append -1 to the operand list. + if (BinaryOperator::isNot(TheOp)) { + Value *V = Constant::getAllOnesValue(X->getType()); + Ops.insert(Ops.end(), ValueEntry(getRank(V), V)); + e += 1; + } } // Scan the operand list, checking to see if there are any common factors @@ -1447,7 +1461,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) // where they are actually the same multiply. unsigned MaxOcc = 0; - Value *MaxOccVal = 0; + Value *MaxOccVal = nullptr; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); if (!BOp) @@ -1545,20 +1559,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); } - return 0; -} - -namespace { - /// \brief Predicate tests whether a ValueEntry's op is in a map. 
- struct IsValueInMap {
- const DenseMap<Value *, unsigned> &Map;
-
- IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {}
-
- bool operator()(const ValueEntry &Entry) {
- return Map.find(Entry.Op) != Map.end();
- }
- };
+ return nullptr;
}
/// \brief Build up a vector of value/power pairs factoring a product.
@@ -1619,7 +1620,7 @@ bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
// below our mininum of '4'.
assert(FactorPowerSum >= 4);
- std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+ std::stable_sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
return true;
}
@@ -1703,14 +1704,14 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I,
// We can only optimize the multiplies when there is a chain of more than
// three, such that a balanced tree might require fewer total multiplies.
if (Ops.size() < 4)
- return 0;
+ return nullptr;
// Try to turn linear trees of multiplies without other uses of the
// intermediate stages into minimal multiply DAGs with perfect sub-expression
// re-use.
SmallVector<Factor, 4> Factors;
if (!collectMultiplyFactors(Ops, Factors))
- return 0; // All distinct factors, so nothing left for us to do.
+ return nullptr; // All distinct factors, so nothing left for us to do.
IRBuilder<> Builder(I);
Value *V = buildMinimalMultiplyDAG(Builder, Factors);
@@ -1719,14 +1720,14 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I,
ValueEntry NewEntry = ValueEntry(getRank(V), V);
Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
- return 0;
+ return nullptr;
}
Value *Reassociate::OptimizeExpression(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Now that we have the linearized expression tree, try to optimize it.
// Start by folding any constants that we found.
- Constant *Cst = 0;
+ Constant *Cst = nullptr;
unsigned Opcode = I->getOpcode();
while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
Constant *C = cast<Constant>(Ops.pop_back_val().Op);
@@ -1776,7 +1777,7 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
if (Ops.size() != NumOps)
return OptimizeExpression(I, Ops);
- return 0;
+ return nullptr;
}
/// EraseInst - Zap the given instruction, adding interesting operands to the
@@ -1795,9 +1796,9 @@ void Reassociate::EraseInst(Instruction *I) {
// If this is a node in an expression tree, climb to the expression root
// and add that since that's where optimization actually happens.
unsigned Opcode = Op->getOpcode();
- while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode &&
+ while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
Visited.insert(Op))
- Op = Op->use_back();
+ Op = Op->user_back();
RedoInsts.insert(Op);
}
}
@@ -1815,8 +1816,8 @@ void Reassociate::OptimizeInst(Instruction *I) {
// is used by a reassociable multiply or add, turn into a multiply.
if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
(I->hasOneUse() &&
- (isReassociableOp(I->use_back(), Instruction::Mul) ||
- isReassociableOp(I->use_back(), Instruction::Add)))) {
+ (isReassociableOp(I->user_back(), Instruction::Mul) ||
+ isReassociableOp(I->user_back(), Instruction::Add)))) {
Instruction *NI = ConvertShiftToMul(I);
RedoInsts.insert(I);
MadeChange = true;
@@ -1869,7 +1870,7 @@ void Reassociate::OptimizeInst(Instruction *I) {
// and if this is not an inner node of a multiply tree.
if (isReassociableOp(I->getOperand(1), Instruction::Mul) && (!I->hasOneUse() || - !isReassociableOp(I->use_back(), Instruction::Mul))) { + !isReassociableOp(I->user_back(), Instruction::Mul))) { Instruction *NI = LowerNegateToMultiply(I); RedoInsts.insert(I); MadeChange = true; @@ -1885,13 +1886,13 @@ void Reassociate::OptimizeInst(Instruction *I) { // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. unsigned Opcode = BO->getOpcode(); - if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode) + if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) return; // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && - cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub) + cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub) return; ReassociateExpression(BO); @@ -1943,7 +1944,7 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { // In this case we reassociate to put the negation on the outside so that we // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y if (I->getOpcode() == Instruction::Mul && I->hasOneUse() && - cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add && + cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add && isa<ConstantInt>(Ops.back().Op) && cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) { ValueEntry Tmp = Ops.pop_back_val(); @@ -1972,6 +1973,9 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { } bool Reassociate::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + // Calculate the rank map for F BuildRankMap(F); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 07f540a30127..b6023e2ce789 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,20 +16,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reg2mem" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; +#define DEBUG_TYPE "reg2mem" + STATISTIC(NumRegsDemoted, "Number of registers demoted"); STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted"); @@ -40,23 +41,22 @@ namespace { initializeRegToMemPass(*PassRegistry::getPassRegistry()); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(BreakCriticalEdgesID); AU.addPreservedID(BreakCriticalEdgesID); } - bool valueEscapes(const Instruction *Inst) const { - const BasicBlock *BB = Inst->getParent(); - for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end(); - UI != E; ++UI) { - const Instruction *I = cast<Instruction>(*UI); - if (I->getParent() != BB || isa<PHINode>(I)) + bool valueEscapes(const Instruction *Inst) const { + const BasicBlock *BB = Inst->getParent(); + for (const User *U : Inst->users()) { + const Instruction *UI = cast<Instruction>(U); + if (UI->getParent() != BB || isa<PHINode>(UI)) return true; } return false; } - virtual 
bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; }; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 43647207c2cc..90c3520c8323 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -17,7 +17,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sccp" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -26,13 +25,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/InstVisitor.h" #include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -42,6 +41,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "sccp" + STATISTIC(NumInstRemoved, "Number of instructions removed"); STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); @@ -81,7 +82,7 @@ class LatticeVal { } public: - LatticeVal() : Val(0, undefined) {} + LatticeVal() : Val(nullptr, undefined) {} bool isUndefined() const { return getLatticeValue() == undefined; } bool isConstant() const { @@ -133,7 +134,7 @@ public: ConstantInt *getConstantInt() const { if (isConstant()) return dyn_cast<ConstantInt>(getConstant()); - return 0; + return nullptr; } void markForcedConstant(Constant *V) { @@ -153,7 +154,7 @@ namespace { /// Constant Propagation. /// class SCCPSolver : public InstVisitor<SCCPSolver> { - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. @@ -205,8 +206,8 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { typedef std::pair<BasicBlock*, BasicBlock*> Edge; DenseSet<Edge> KnownFeasibleEdges; public: - SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli) - : TD(td), TLI(tli) {} + SCCPSolver(const DataLayout *DL, const TargetLibraryInfo *tli) + : DL(DL), TLI(tli) {} /// MarkBlockExecutable - This method can be used by clients to mark all of /// the blocks that are known to be intrinsically live in the processed unit. @@ -403,7 +404,7 @@ private: if (Constant *C = dyn_cast<Constant>(V)) { Constant *Elt = C->getAggregateElement(i); - if (Elt == 0) + if (!Elt) LV.markOverdefined(); // Unknown sort of constant. else if (isa<UndefValue>(Elt)) ; // Undef values remain undefined. 
@@ -491,10 +492,11 @@ private: } void visitCallSite (CallSite CS); void visitResumeInst (TerminatorInst &I) { /*returns void*/ } - void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } - void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + markAnythingOverdefined(&I); + } void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } @@ -523,7 +525,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, LatticeVal BCValue = getValueState(BI->getCondition()); ConstantInt *CI = BCValue.getConstantInt(); - if (CI == 0) { + if (!CI) { // Overdefined condition variables, and branches on unfoldable constant // conditions, mean the branch could go either way. if (!BCValue.isUndefined()) @@ -550,7 +552,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - if (CI == 0) { // Overdefined or undefined condition? + if (!CI) { // Overdefined or undefined condition? // All destinations are executable! if (!SCValue.isUndefined()) Succs.assign(TI.getNumSuccessors(), true); @@ -595,7 +597,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // Overdefined condition variables mean the branch could go either way, // undef conditions mean that neither edge is feasible yet. ConstantInt *CI = BCValue.getConstantInt(); - if (CI == 0) + if (!CI) return !BCValue.isUndefined(); // Constant condition variables mean the branch can only go a single way. @@ -613,7 +615,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - if (CI == 0) + if (!CI) return !SCValue.isUndefined(); return SI->findCaseValue(CI).getCaseSuccessor() == To; @@ -627,7 +629,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } // visit Implementations - Something changed in this instruction, either an @@ -668,7 +670,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // constant. If they are constant and don't agree, the PHI is overdefined. // If there are no executable operands, the PHI remains undefined. // - Constant *OperandVal = 0; + Constant *OperandVal = nullptr; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); if (IV.isUndefined()) continue; // Doesn't influence PHI node. @@ -679,7 +681,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { if (IV.isOverdefined()) // PHI node becomes overdefined! return markOverdefined(&PN); - if (OperandVal == 0) { // Grab the first value. + if (!OperandVal) { // Grab the first value. 
OperandVal = IV.getConstant(); continue; } @@ -775,7 +777,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { StructType *STy = dyn_cast<StructType>(IVI.getType()); - if (STy == 0) + if (!STy) return markOverdefined(&IVI); // If this has more than one index, we can't handle it, drive all results to @@ -863,7 +865,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // If this is an AND or OR with 0 or -1, it doesn't matter that the other // operand is overdefined. if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) { - LatticeVal *NonOverdefVal = 0; + LatticeVal *NonOverdefVal = nullptr; if (!V1State.isOverdefined()) NonOverdefVal = &V1State; else if (!V2State.isOverdefined()) @@ -1067,7 +1069,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { } // Transform load from a constant into a constant if possible. - if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, TD)) + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) return markConstant(IV, &I, C); // Otherwise we cannot say for certain what value this load will produce. @@ -1082,7 +1084,7 @@ void SCCPSolver::visitCallSite(CallSite CS) { // The common case is that we aren't tracking the callee, either because we // are not doing interprocedural analysis or the callee is indirect, or is // external. Handle these cases first. - if (F == 0 || F->isDeclaration()) { + if (!F || F->isDeclaration()) { CallOverdefined: // Void return and not tracking callee, just bail. if (I->getType()->isVoidTy()) return; @@ -1181,10 +1183,9 @@ void SCCPSolver::Solve() { // since all of its users will have already been marked as overdefined // Update all of the users of this instruction's value. // - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (Instruction *I = dyn_cast<Instruction>(*UI)) - OperandChangedState(I); + for (User *U : I->users()) + if (Instruction *UI = dyn_cast<Instruction>(U)) + OperandChangedState(UI); } // Process the instruction work list. @@ -1201,10 +1202,9 @@ void SCCPSolver::Solve() { // Update all of the users of this instruction's value. // if (I->getType()->isStructTy() || !getValueState(I).isOverdefined()) - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (Instruction *I = dyn_cast<Instruction>(*UI)) - OperandChangedState(I); + for (User *U : I->users()) + if (Instruction *UI = dyn_cast<Instruction>(U)) + OperandChangedState(UI); } // Process the basic block work list. @@ -1499,7 +1499,7 @@ namespace { /// Sparse Conditional Constant Propagator. /// struct SCCP : public FunctionPass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfo>(); } static char ID; // Pass identification, replacement for typeid @@ -1510,7 +1510,7 @@ namespace { // runOnFunction - Run the Sparse Conditional Constant Propagation // algorithm, and return true if the function was modified. // - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; }; } // end anonymous namespace @@ -1553,10 +1553,14 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { // and return true if the function was modified. 
// bool SCCP::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - SCCPSolver Solver(TD, TLI); + SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. Solver.MarkBlockExecutable(F.begin()); @@ -1628,14 +1632,14 @@ namespace { /// Constant Propagation. /// struct IPSCCP : public ModulePass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfo>(); } static char ID; IPSCCP() : ModulePass(ID) { initializeIPSCCPPass(*PassRegistry::getPassRegistry()); } - bool runOnModule(Module &M); + bool runOnModule(Module &M) override; }; } // end anonymous namespace @@ -1658,21 +1662,20 @@ static bool AddressIsTaken(const GlobalValue *GV) { // Delete any dead constantexpr klingons. GV->removeDeadConstantUsers(); - for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); - UI != E; ++UI) { - const User *U = *UI; - if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { + for (const Use &U : GV->uses()) { + const User *UR = U.getUser(); + if (const StoreInst *SI = dyn_cast<StoreInst>(UR)) { if (SI->getOperand(0) == GV || SI->isVolatile()) return true; // Storing addr of GV. - } else if (isa<InvokeInst>(U) || isa<CallInst>(U)) { + } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) { // Make sure we are calling the function, not passing the address. - ImmutableCallSite CS(cast<Instruction>(U)); - if (!CS.isCallee(UI)) + ImmutableCallSite CS(cast<Instruction>(UR)); + if (!CS.isCallee(&U)) return true; - } else if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + } else if (const LoadInst *LI = dyn_cast<LoadInst>(UR)) { if (LI->isVolatile()) return true; - } else if (isa<BlockAddress>(U)) { + } else if (isa<BlockAddress>(UR)) { // blockaddress doesn't take the address of the function, it takes addr // of label. } else { @@ -1683,9 +1686,10 @@ static bool AddressIsTaken(const GlobalValue *GV) { } bool IPSCCP::runOnModule(Module &M) { - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - SCCPSolver Solver(TD, TLI); + SCCPSolver Solver(DL, TLI); // AddressTakenFunctions - This set keeps track of the address-taken functions // that are in the input. As IPSCCP runs through and simplifies code, @@ -1834,8 +1838,9 @@ bool IPSCCP::runOnModule(Module &M) { for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) { // If there are any PHI nodes in this successor, drop entries for BB now. BasicBlock *DeadBB = BlocksToErase[i]; - for (Value::use_iterator UI = DeadBB->use_begin(), UE = DeadBB->use_end(); - UI != UE; ) { + for (Value::user_iterator UI = DeadBB->user_begin(), + UE = DeadBB->user_end(); + UI != UE;) { // Grab the user and then increment the iterator early, as the user // will be deleted. Step past all adjacent uses from the same user. 
Instruction *I = dyn_cast<Instruction>(*UI); @@ -1925,7 +1930,7 @@ bool IPSCCP::runOnModule(Module &M) { "Overdefined values should have been taken out of the map!"); DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n"); while (!GV->use_empty()) { - StoreInst *SI = cast<StoreInst>(GV->use_back()); + StoreInst *SI = cast<StoreInst>(GV->user_back()); SI->eraseFromParent(); } M.getGlobalList().erase(GV); diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 9f3fc83d129d..8c7f253290ba 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -23,40 +23,48 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sroa" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Operator.h" -#include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TimeValue.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" + +#if __cplusplus >= 201103L && !defined(NDEBUG) +// We only use this for a debug check in C++11 +#include <random> +#endif + using namespace llvm; +#define DEBUG_TYPE "sroa" + STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); @@ -73,6 +81,16 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates"); static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); +/// Hidden option to enable randomly shuffling the slices to help uncover +/// instability in their order. +static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", + cl::init(false), cl::Hidden); + +/// Hidden option to experiment with completely strict handling of inbounds +/// GEPs. +static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", + cl::init(false), cl::Hidden); + namespace { /// \brief A custom IRBuilder inserter which prefixes all names if they are /// preserved. @@ -142,8 +160,8 @@ public: Use *getUse() const { return UseAndIsSplittable.getPointer(); } - bool isDead() const { return getUse() == 0; } - void kill() { UseAndIsSplittable.setPointer(0); } + bool isDead() const { return getUse() == nullptr; } + void kill() { UseAndIsSplittable.setPointer(nullptr); } /// \brief Support for ordering ranges. 
/// @@ -244,8 +262,8 @@ public: void printUse(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; void print(raw_ostream &OS) const; - void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; - void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; + void dump(const_iterator I) const; + void dump() const; #endif private: @@ -303,7 +321,7 @@ static Value *foldSelectInst(SelectInst &SI) { if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); - return 0; + return nullptr; } /// \brief Builder for the alloca slices. @@ -339,7 +357,7 @@ private: bool IsSplittable = false) { // Completely skip uses which have a zero size or start either before or // past the end of the allocation. - if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) { + if (Size == 0 || Offset.uge(AllocSize)) { DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset << " which has zero size or starts outside of the " << AllocSize << " byte alloca:\n" @@ -380,6 +398,43 @@ private: if (GEPI.use_empty()) return markAsDead(GEPI); + if (SROAStrictInbounds && GEPI.isInBounds()) { + // FIXME: This is a manually un-factored variant of the basic code inside + // of GEPs with checking of the inbounds invariant specified in the + // langref in a very strict sense. If we ever want to enable + // SROAStrictInbounds, this code should be factored cleanly into + // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds + // by writing out the code here where we have tho underlying allocation + // size readily available. + APInt GEPOffset = Offset; + for (gep_type_iterator GTI = gep_type_begin(GEPI), + GTE = gep_type_end(GEPI); + GTI != GTE; ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); + if (!OpC) + break; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + unsigned ElementIdx = OpC->getZExtValue(); + const StructLayout *SL = DL.getStructLayout(STy); + GEPOffset += + APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); + } else { + // For array or vector indices, scale the index by the size of the type. + APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); + GEPOffset += Index * APInt(Offset.getBitWidth(), + DL.getTypeAllocSize(GTI.getIndexedType())); + } + + // If this index has computed an intermediate pointer which is not + // inbounds, then the result of the GEP is a poison value and we can + // delete it and all uses. + if (GEPOffset.ugt(AllocSize)) + return markAsDead(GEPI); + } + } + return Base::visitGetElementPtrInst(GEPI); } @@ -426,8 +481,7 @@ private: // risk of overflow. // FIXME: We should instead consider the pointer to have escaped if this // function is being instrumented for addressing bugs or race conditions. - if (Offset.isNegative() || Size > AllocSize || - Offset.ugt(AllocSize - Size)) { + if (Size > AllocSize || Offset.ugt(AllocSize - Size)) { DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset << " which extends past the end of the " << AllocSize << " byte alloca:\n" @@ -446,7 +500,7 @@ private: assert(II.getRawDest() == *U && "Pointer use is not the destination?"); ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + (IsOffsetKnown && Offset.uge(AllocSize))) // Zero-length mem transfer intrinsics can be ignored entirely. 
return markAsDead(II); @@ -461,14 +515,30 @@ private: void visitMemTransferInst(MemTransferInst &II) { ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); - if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + if (Length && Length->getValue() == 0) // Zero-length mem transfer intrinsics can be ignored entirely. return markAsDead(II); + // Because we can visit these intrinsics twice, also check to see if the + // first time marked this instruction as dead. If so, skip it. + if (VisitedDeadInsts.count(&II)) + return; + if (!IsOffsetKnown) return PI.setAborted(&II); + // This side of the transfer is completely out-of-bounds, and so we can + // nuke the entire transfer. However, we also need to nuke the other side + // if already added to our partitions. + // FIXME: Yet another place we really should bypass this when + // instrumenting for ASan. + if (Offset.uge(AllocSize)) { + SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II); + if (MTPI != MemTransferSliceMap.end()) + S.Slices[MTPI->second].kill(); + return markAsDead(II); + } + uint64_t RawOffset = Offset.getLimitedValue(); uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; @@ -487,7 +557,7 @@ private: // they both point to the same alloca. bool Inserted; SmallDenseMap<Instruction *, unsigned>::iterator MTPI; - llvm::tie(MTPI, Inserted) = + std::tie(MTPI, Inserted) = MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size())); unsigned PrevIdx = MTPI->second; if (!Inserted) { @@ -546,7 +616,7 @@ private: Size = 0; do { Instruction *I, *UsedI; - llvm::tie(UsedI, I) = Uses.pop_back_val(); + std::tie(UsedI, I) = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); @@ -568,13 +638,12 @@ private: return I; } - for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; - ++UI) - if (Visited.insert(cast<Instruction>(*UI))) - Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); + for (User *U : I->users()) + if (Visited.insert(cast<Instruction>(U))) + Uses.push_back(std::make_pair(I, cast<Instruction>(U))); } while (!Uses.empty()); - return 0; + return nullptr; } void visitPHINode(PHINode &PN) { @@ -597,8 +666,7 @@ private: // themselves which should be replaced with undef. // FIXME: This should instead be escaped in the event we're instrumenting // for address sanitization. - if ((Offset.isNegative() && (-Offset).uge(PHISize)) || - (!Offset.isNegative() && Offset.uge(AllocSize))) { + if (Offset.uge(AllocSize)) { S.DeadOperands.push_back(U); return; } @@ -638,8 +706,7 @@ private: // themselves which should be replaced with undef. // FIXME: This should instead be escaped in the event we're instrumenting // for address sanitization. 
- if ((Offset.isNegative() && Offset.uge(SelectSize)) || - (!Offset.isNegative() && Offset.uge(AllocSize))) { + if (Offset.uge(AllocSize)) { S.DeadOperands.push_back(U); return; } @@ -658,7 +725,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif - PointerEscapingInstr(0) { + PointerEscapingInstr(nullptr) { SliceBuilder PB(DL, AI, *this); SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { @@ -674,6 +741,13 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) std::mem_fun_ref(&Slice::isDead)), Slices.end()); +#if __cplusplus >= 201103L && !defined(NDEBUG) + if (SROARandomShuffleSlices) { + std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec())); + std::shuffle(Slices.begin(), Slices.end(), MT); + } +#endif + // Sort the uses. This arranges for the offsets to be in ascending order, // and the sizes to be in descending order. std::sort(Slices.begin(), Slices.end()); @@ -712,8 +786,10 @@ void AllocaSlices::print(raw_ostream &OS) const { print(OS, I); } -void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); } -void AllocaSlices::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const { + print(dbgs(), I); +} +LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -741,12 +817,10 @@ public: // Retain the debug information attached to the alloca for use when // rewriting loads and stores. if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { - for (Value::use_iterator UI = DebugNode->use_begin(), - UE = DebugNode->use_end(); - UI != UE; ++UI) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) DVIs.push_back(DVI); } @@ -760,8 +834,8 @@ public: DVIs.pop_back_val()->eraseFromParent(); } - virtual bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &Insts) const { + bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const override { Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) Ptr = LI->getOperand(0); @@ -788,7 +862,7 @@ public: return false; } - virtual void updateDebugInfo(Instruction *Inst) const { + void updateDebugInfo(Instruction *Inst) const override { for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; @@ -800,7 +874,7 @@ public: for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = 0; + Value *Arg = nullptr; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. 
@@ -896,13 +970,13 @@ class SROA : public FunctionPass { public: SROA(bool RequiresDomTree = true) : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(0), DL(0), DT(0) { + C(nullptr), DL(nullptr), DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); - void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; - const char *getPassName() const { return "SROA"; } + const char *getPassName() const override { return "SROA"; } static char ID; private: @@ -915,6 +989,7 @@ private: ArrayRef<AllocaSlices::iterator> SplitUses); bool splitAlloca(AllocaInst &AI, AllocaSlices &S); bool runOnAlloca(AllocaInst &AI); + void clobberUse(Use &U); void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); bool promoteAllocas(Function &F); }; @@ -928,7 +1003,7 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) @@ -937,8 +1012,12 @@ INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", static Type *findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset) { - Type *Ty = 0; - bool IgnoreNonIntegralTypes = false; + Type *Ty = nullptr; + bool TyIsCommon = true; + IntegerType *ITy = nullptr; + + // Note that we need to look at *every* alloca slice's Use to ensure we + // always get consistent results regardless of the order of slices. for (AllocaSlices::const_iterator I = B; I != E; ++I) { Use *U = I->getUse(); if (isa<IntrinsicInst>(*U->getUser())) @@ -946,42 +1025,37 @@ static Type *findCommonType(AllocaSlices::const_iterator B, if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) continue; - Type *UserTy = 0; + Type *UserTy = nullptr; if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { UserTy = LI->getType(); } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { UserTy = SI->getValueOperand()->getType(); - } else { - IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. - continue; } - if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) { // If the type is larger than the partition, skip it. We only encounter // this for split integer operations where we want to use the type of the // entity causing the split. Also skip if the type is not a byte width // multiple. - if (ITy->getBitWidth() % 8 != 0 || - ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) + if (UserITy->getBitWidth() % 8 != 0 || + UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) continue; - // If we have found an integer type use covering the alloca, use that - // regardless of the other types, as integers are often used for - // a "bucket of bits" type. - // - // NB: This *must* be the only return from inside the loop so that the - // order of slices doesn't impact the computed type. - return ITy; - } else if (IgnoreNonIntegralTypes) { - continue; + // Track the largest bitwidth integer type used in this way in case there + // is no common type. + if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) + ITy = UserITy; } - if (Ty && Ty != UserTy) - IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. 
- - Ty = UserTy; + // To avoid depending on the order of slices, Ty and TyIsCommon must not + // depend on types skipped above. + if (!UserTy || (Ty && Ty != UserTy)) + TyIsCommon = false; // Give up on anything but an iN type. + else + Ty = UserTy; } - return Ty; + + return TyIsCommon ? Ty : ITy; } /// PHI instructions that use an alloca and are subsequently loaded can be @@ -1003,7 +1077,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h static bool isSafePHIToSpeculate(PHINode &PN, - const DataLayout *DL = 0) { + const DataLayout *DL = nullptr) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1011,10 +1085,9 @@ static bool isSafePHIToSpeculate(PHINode &PN, BasicBlock *BB = PN.getParent(); unsigned MaxAlign = 0; bool HaveLoad = false; - for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); UI != UE; - ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) + for (User *U : PN.users()) { + LoadInst *LI = dyn_cast<LoadInst>(U); + if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is @@ -1057,7 +1130,7 @@ static bool isSafePHIToSpeculate(PHINode &PN, // If this pointer is always safe to load, or if we can prove that there // is already a load in the block, then we can move the load to the pred // block. - if (InVal->isDereferenceablePointer() || + if (InVal->isDereferenceablePointer(DL) || isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) continue; @@ -1077,13 +1150,13 @@ static void speculatePHINodeLoads(PHINode &PN) { // Get the TBAA tag and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ. - LoadInst *SomeLoad = cast<LoadInst>(*PN.use_begin()); + LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. while (!PN.use_empty()) { - LoadInst *LI = cast<LoadInst>(*PN.use_begin()); + LoadInst *LI = cast<LoadInst>(PN.user_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } @@ -1121,16 +1194,16 @@ static void speculatePHINodeLoads(PHINode &PN) { /// /// We can do this to a select if its only uses are loads and if the operand /// to the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { +static bool isSafeSelectToSpeculate(SelectInst &SI, + const DataLayout *DL = nullptr) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); - bool TDerefable = TValue->isDereferenceablePointer(); - bool FDerefable = FValue->isDereferenceablePointer(); + bool TDerefable = TValue->isDereferenceablePointer(DL); + bool FDerefable = FValue->isDereferenceablePointer(DL); - for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); UI != UE; - ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) + for (User *U : SI.users()) { + LoadInst *LI = dyn_cast<LoadInst>(U); + if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either @@ -1155,7 +1228,7 @@ static void speculateSelectInstLoads(SelectInst &SI) { Value *FV = SI.getFalseValue(); // Replace the loads of the select with a select of two loads. 
while (!SI.use_empty()) { - LoadInst *LI = cast<LoadInst>(*SI.use_begin()); + LoadInst *LI = cast<LoadInst>(SI.user_back()); assert(LI->isSimple() && "We only speculate simple loads"); IRB.SetInsertPoint(LI); @@ -1188,7 +1261,7 @@ static void speculateSelectInstLoads(SelectInst &SI) { /// This will return the BasePtr if that is valid, or build a new GEP /// instruction using the IRBuilder if GEP-ing is needed. static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, Twine NamePrefix) { if (Indices.empty()) return BasePtr; @@ -1197,7 +1270,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) return BasePtr; - return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx"); + return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx"); } /// \brief Get a natural GEP off of the BasePtr walking through Ty toward @@ -1211,9 +1284,13 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, /// indicated by Indices to have the correct offset. static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, Value *BasePtr, Type *Ty, Type *TargetTy, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, + Twine NamePrefix) { if (Ty == TargetTy) - return buildGEP(IRB, BasePtr, Indices); + return buildGEP(IRB, BasePtr, Indices, NamePrefix); + + // Pointer size to use for the indices. + unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType()); // See if we can descend into a struct and locate a field with the correct // type. @@ -1222,11 +1299,13 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, do { if (ElementTy->isPointerTy()) break; - if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { - ElementTy = SeqTy->getElementType(); - // Note that we use the default address space as this index is over an - // array or a vector, not a pointer. - Indices.push_back(IRB.getInt(APInt(DL.getPointerSizeInBits(0), 0))); + + if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) { + ElementTy = ArrayTy->getElementType(); + Indices.push_back(IRB.getIntN(PtrSize, 0)); + } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) { + ElementTy = VectorTy->getElementType(); + Indices.push_back(IRB.getInt32(0)); } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. @@ -1240,7 +1319,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, if (ElementTy != TargetTy) Indices.erase(Indices.end() - NumLayers, Indices.end()); - return buildGEP(IRB, BasePtr, Indices); + return buildGEP(IRB, BasePtr, Indices, NamePrefix); } /// \brief Recursively compute indices for a natural GEP. @@ -1250,29 +1329,32 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Type *Ty, APInt &Offset, Type *TargetTy, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, + Twine NamePrefix) { if (Offset == 0) - return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); // We can't recurse through pointer types. 
if (Ty->isPointerTy()) - return 0; + return nullptr; // We try to analyze GEPs over vectors here, but note that these GEPs are // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); - if (ElementSizeInBits % 8) - return 0; // GEPs over non-multiple of 8 size vector elements are invalid. + if (ElementSizeInBits % 8 != 0) { + // GEPs over non-multiple of 8 size vector elements are invalid. + return nullptr; + } APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(VecTy->getNumElements())) - return 0; + return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), - Offset, TargetTy, Indices); + Offset, TargetTy, Indices, NamePrefix); } if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { @@ -1280,31 +1362,31 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) - return 0; + return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices); + Indices, NamePrefix); } StructType *STy = dyn_cast<StructType>(Ty); if (!STy) - return 0; + return nullptr; const StructLayout *SL = DL.getStructLayout(STy); uint64_t StructOffset = Offset.getZExtValue(); if (StructOffset >= SL->getSizeInBytes()) - return 0; + return nullptr; unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); if (Offset.uge(DL.getTypeAllocSize(ElementTy))) - return 0; // The offset points into alignment padding. + return nullptr; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices); + Indices, NamePrefix); } /// \brief Get a natural GEP from a base pointer to a particular offset and @@ -1319,26 +1401,27 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, /// If no natural GEP can be constructed, this function returns null. static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *TargetTy, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, + Twine NamePrefix) { PointerType *Ty = cast<PointerType>(Ptr->getType()); // Don't consider any GEPs through an i8* as natural unless the TargetTy is // an i8. - if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) - return 0; + if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) + return nullptr; Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) - return 0; // We can't GEP through an unsized element. + return nullptr; // We can't GEP through an unsized element. APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); if (ElementSize == 0) - return 0; // Zero-length arrays can't help us build a natural GEP. 
+ return nullptr; // Zero-length arrays can't help us build a natural GEP. APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices); + Indices, NamePrefix); } /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the @@ -1356,8 +1439,9 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, /// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. -static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, - Value *Ptr, APInt Offset, Type *PointerTy) { +static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, + APInt Offset, Type *PointerTy, + Twine NamePrefix) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet<Value *, 4> Visited; @@ -1367,11 +1451,11 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, // We may end up computing an offset pointer that has the wrong type. If we // never are able to compute one directly that has the correct type, we'll // fall back to it, so keep it around here. - Value *OffsetPtr = 0; + Value *OffsetPtr = nullptr; // Remember any i8 pointer we come across to re-use if we need to do a raw // byte offset. - Value *Int8Ptr = 0; + Value *Int8Ptr = nullptr; APInt Int8PtrOffset(Offset.getBitWidth(), 0); Type *TargetTy = PointerTy->getPointerElementType(); @@ -1391,7 +1475,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, // See if we can perform a natural GEP here. Indices.clear(); if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, - Indices)) { + Indices, NamePrefix)) { if (P->getType() == PointerTy) { // Zap any offset pointer that we ended up computing in previous rounds. if (OffsetPtr && OffsetPtr->use_empty()) @@ -1425,20 +1509,21 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, if (!OffsetPtr) { if (!Int8Ptr) { - Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), - "raw_cast"); + Int8Ptr = IRB.CreateBitCast( + Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()), + NamePrefix + "sroa_raw_cast"); Int8PtrOffset = Offset; } OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), - "raw_idx"); + NamePrefix + "sroa_raw_idx"); } Ptr = OffsetPtr; // On the off chance we were targeting i8*, guard the bitcast here. if (Ptr->getType() != PointerTy) - Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast"); + Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast"); return Ptr; } @@ -1931,16 +2016,22 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // integer type will be stored here for easy access during rewriting. IntegerType *IntTy; - // The offset of the slice currently being rewritten. + // The original offset of the slice currently being rewritten relative to + // the original alloca. uint64_t BeginOffset, EndOffset; + // The new offsets of the slice currently being rewritten relative to the + // original alloca. 
+ uint64_t NewBeginOffset, NewEndOffset; + + uint64_t SliceSize; bool IsSplittable; bool IsSplit; Use *OldUse; Instruction *OldPtr; - // Output members carrying state about the result of visiting and rewriting - // the slice of the alloca. - bool IsUsedByRewrittenSpeculatableInstructions; + // Track post-rewrite users which are PHI nodes and Selects. + SmallPtrSetImpl<PHINode *> &PHIUsers; + SmallPtrSetImpl<SelectInst *> &SelectUsers; // Utility IR builder, whose name prefix is setup for each visited use, and // the insertion point is set to point to the user. @@ -1949,22 +2040,25 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { public: AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, - uint64_t NewBeginOffset, uint64_t NewEndOffset, - bool IsVectorPromotable = false, - bool IsIntegerPromotable = false) + uint64_t NewAllocaBeginOffset, + uint64_t NewAllocaEndOffset, bool IsVectorPromotable, + bool IsIntegerPromotable, + SmallPtrSetImpl<PHINode *> &PHIUsers, + SmallPtrSetImpl<SelectInst *> &SelectUsers) : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), - NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset), + NewAllocaBeginOffset(NewAllocaBeginOffset), + NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAI.getAllocatedType()), - VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0), - ElementTy(VecTy ? VecTy->getElementType() : 0), + VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : nullptr), + ElementTy(VecTy ? VecTy->getElementType() : nullptr), ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), IntTy(IsIntegerPromotable ? Type::getIntNTy( NewAI.getContext(), DL.getTypeSizeInBits(NewAI.getAllocatedType())) - : 0), + : nullptr), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), - OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false), + OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { if (VecTy) { assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && @@ -1983,6 +2077,14 @@ public: IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + SliceSize = NewEndOffset - NewBeginOffset; + OldUse = I->getUse(); OldPtr = cast<Instruction>(OldUse->get()); @@ -1997,20 +2099,6 @@ public: return CanSROA; } - /// \brief Query whether this slice is used by speculatable instructions after - /// rewriting. - /// - /// These instructions (PHIs and Selects currently) require the alloca slice - /// to run back through the rewriter. Thus, they are promotable, but not on - /// this iteration. This is distinct from a slice which is unpromotable for - /// some other reason, in which case we don't even want to perform the - /// speculation. This can be querried at any time and reflects whether (at - /// that point) a visit call has rewritten a speculatable instruction on the - /// current slice. - bool isUsedByRewrittenSpeculatableInstructions() const { - return IsUsedByRewrittenSpeculatableInstructions; - } - private: // Make sure the other visit overloads are visible. 
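A small sketch with hypothetical numbers of the clamping computed in the constructor above: the slice being rewritten is intersected with the new alloca's range, and SliceSize is the width of that intersection.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical slice [4, 24) rewritten against a new alloca covering [8, 16).
  uint64_t BeginOffset = 4, EndOffset = 24;
  uint64_t NewAllocaBeginOffset = 8, NewAllocaEndOffset = 16;
  uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); // 8
  uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);       // 16
  uint64_t SliceSize = NewEndOffset - NewBeginOffset;                    // 8
  std::printf("rewriting bytes [%llu, %llu), %llu bytes of the slice\n",
              (unsigned long long)NewBeginOffset,
              (unsigned long long)NewEndOffset,
              (unsigned long long)SliceSize);
  return 0;
}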
using Base::visit; @@ -2021,30 +2109,53 @@ private: llvm_unreachable("No rewrite rule for this instruction!"); } - Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset, - Type *PointerTy) { - assert(Offset >= NewAllocaBeginOffset); - return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(), - Offset - NewAllocaBeginOffset), - PointerTy); + Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) { + // Note that the offset computation can use BeginOffset or NewBeginOffset + // interchangeably for unsplit slices. + assert(IsSplit || BeginOffset == NewBeginOffset); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + +#ifndef NDEBUG + StringRef OldName = OldPtr->getName(); + // Skip through the last '.sroa.' component of the name. + size_t LastSROAPrefix = OldName.rfind(".sroa."); + if (LastSROAPrefix != StringRef::npos) { + OldName = OldName.substr(LastSROAPrefix + strlen(".sroa.")); + // Look for an SROA slice index. + size_t IndexEnd = OldName.find_first_not_of("0123456789"); + if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') { + // Strip the index and look for the offset. + OldName = OldName.substr(IndexEnd + 1); + size_t OffsetEnd = OldName.find_first_not_of("0123456789"); + if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.') + // Strip the offset. + OldName = OldName.substr(OffsetEnd + 1); + } + } + // Strip any SROA suffixes as well. + OldName = OldName.substr(0, OldName.find(".sroa_")); +#endif + + return getAdjustedPtr(IRB, DL, &NewAI, + APInt(DL.getPointerSizeInBits(), Offset), PointerTy, +#ifndef NDEBUG + Twine(OldName) + "." +#else + Twine() +#endif + ); } - /// \brief Compute suitable alignment to access an offset into the new alloca. - unsigned getOffsetAlign(uint64_t Offset) { + /// \brief Compute suitable alignment to access this slice of the *new* alloca. + /// + /// You can optionally pass a type to this routine and if that type's ABI + /// alignment is itself suitable, this will return zero. + unsigned getSliceAlign(Type *Ty = nullptr) { unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - return MinAlign(NewAIAlign, Offset); - } - - /// \brief Compute suitable alignment to access a type at an offset of the - /// new alloca. - /// - /// \returns zero if the type's ABI alignment is a suitable alignment, - /// otherwise returns the maximal suitable alignment. - unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { - unsigned Align = getOffsetAlign(Offset); - return Align == DL.getABITypeAlignment(Ty) ? 0 : Align; + unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 
0 : Align; } unsigned getIndex(uint64_t Offset) { @@ -2062,8 +2173,7 @@ private: Pass.DeadInsts.insert(I); } - Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset, - uint64_t NewEndOffset) { + Value *rewriteVectorizedLoadInst() { unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); @@ -2073,8 +2183,7 @@ private: return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } - Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset, - uint64_t NewEndOffset) { + Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), @@ -2093,32 +2202,23 @@ private: Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - - uint64_t Size = NewEndOffset - NewBeginOffset; - - Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8) + Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); bool IsPtrAdjusted = false; Value *V; if (VecTy) { - V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset); + V = rewriteVectorizedLoadInst(); } else if (IntTy && LI.getType()->isIntegerTy()) { - V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset); + V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && canConvertValue(DL, NewAllocaTy, LI.getType())) { V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - LI.isVolatile(), "load"); + LI.isVolatile(), LI.getName()); } else { Type *LTy = TargetTy->getPointerTo(); - V = IRB.CreateAlignedLoad( - getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy), - getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset), - LI.isVolatile(), "load"); + V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), + getSliceAlign(TargetTy), LI.isVolatile(), + LI.getName()); IsPtrAdjusted = true; } V = convertValue(DL, IRB, V, TargetTy); @@ -2127,13 +2227,13 @@ private: assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); - assert(Size < DL.getTypeStoreSize(LI.getType()) && + assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); + IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the // basis for the new value. 
This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving @@ -2155,9 +2255,7 @@ private: return !LI.isVolatile() && !IsPtrAdjusted; } - bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, - uint64_t NewBeginOffset, - uint64_t NewEndOffset) { + bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) { if (V->getType() != VecTy) { unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); @@ -2183,8 +2281,7 @@ private: return true; } - bool rewriteIntegerStore(Value *V, StoreInst &SI, - uint64_t NewBeginOffset, uint64_t NewEndOffset) { + bool rewriteIntegerStore(Value *V, StoreInst &SI) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { @@ -2217,30 +2314,22 @@ private: if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - - uint64_t Size = NewEndOffset - NewBeginOffset; - if (Size < DL.getTypeStoreSize(V->getType())) { + if (SliceSize < DL.getTypeStoreSize(V->getType())) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); - IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); + IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract"); } if (VecTy) - return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset, - NewEndOffset); + return rewriteVectorizedStoreInst(V, SI, OldOp); if (IntTy && V->getType()->isIntegerTy()) - return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset); + return rewriteIntegerStore(V, SI); StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && @@ -2250,12 +2339,9 @@ private: NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); } else { - Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset, - V->getType()->getPointerTo()); - NewSI = IRB.CreateAlignedStore( - V, NewPtr, getOffsetTypeAlign( - V->getType(), NewBeginOffset - NewAllocaBeginOffset), - SI.isVolatile()); + Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo()); + NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), + SI.isVolatile()); } (void)NewSI; Pass.DeadInsts.insert(&SI); @@ -2307,11 +2393,10 @@ private: // pointer to the new alloca. 
if (!isa<Constant>(II.getLength())) { assert(!IsSplit); - assert(BeginOffset >= NewAllocaBeginOffset); - II.setDest( - getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); + assert(NewBeginOffset == BeginOffset); + II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset))); + II.setAlignment(ConstantInt::get(CstTy, getSliceAlign())); deleteIfTriviallyDead(OldPtr); return false; @@ -2323,13 +2408,6 @@ private: Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; - // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. if (!VecTy && !IntTy && @@ -2341,8 +2419,8 @@ private: Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( - getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()), - II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile()); + getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, + getSliceAlign(), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; @@ -2419,25 +2497,11 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - - assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); - bool IsDest = II.getRawDest() == OldPtr; + bool IsDest = &II.getRawDestUse() == OldUse; + assert((IsDest && II.getRawDest() == OldPtr) || + (!IsDest && II.getRawSource() == OldPtr)); - // Compute the relative offset within the transfer. - unsigned IntPtrWidth = DL.getPointerSizeInBits(); - APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset); - - unsigned Align = II.getAlignment(); - uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; - if (Align > 1) - Align = - MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), - MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset))); + unsigned SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2447,19 +2511,20 @@ private: // memcpy, and so simply updating the pointers is the necessary for us to // update both source and dest of a single call. if (!IsSplittable) { - Value *OldOp = IsDest ? 
II.getRawDest() : II.getRawSource(); + Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); if (IsDest) - II.setDest( - getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); + II.setDest(AdjustedPtr); else - II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset, - II.getRawSource()->getType())); + II.setSource(AdjustedPtr); - Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, Align)); + if (II.getAlignment() > SliceAlign) { + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment( + ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign))); + } DEBUG(dbgs() << " to: " << II << "\n"); - deleteIfTriviallyDead(OldOp); + deleteIfTriviallyDead(OldPtr); return false; } // For split transfer intrinsics we have an incredibly useful assurance: @@ -2495,37 +2560,39 @@ private: // alloca that should be re-examined after rewriting this instruction. Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); if (AllocaInst *AI - = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) + = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { + assert(AI != &OldAI && AI != &NewAI && + "Splittable transfers cannot reach the same alloca on both ends."); Pass.Worklist.insert(AI); + } - if (EmitMemCpy) { - Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() - : II.getRawDest()->getType(); + Type *OtherPtrTy = OtherPtr->getType(); + unsigned OtherAS = OtherPtrTy->getPointerAddressSpace(); + // Compute the relative offset for the other pointer within the transfer. + unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS); + APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset); + unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1, + OtherOffset.zextOrTrunc(64).getZExtValue()); + + if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce // a single, simple GEP in most cases. - OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); + OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, + OtherPtr->getName() + "."); - Value *OurPtr = getAdjustedAllocaPtr( - IRB, NewBeginOffset, - IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType()); + Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); - CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, - IsDest ? OtherPtr : OurPtr, - Size, Align, II.isVolatile()); + CallInst *New = IRB.CreateMemCpy( + IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size, + MinAlign(SliceAlign, OtherAlign), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; } - // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy - // is equivalent to 1, but that isn't true if we end up rewriting this as - // a load or store. - if (!Align) - Align = 1; - bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset; uint64_t Size = NewEndOffset - NewBeginOffset; @@ -2533,24 +2600,32 @@ private: unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; IntegerType *SubIntTy - = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + = IntTy ? 
Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr; - Type *OtherPtrTy = NewAI.getType(); + // Reset the other pointer type to match the register type we're going to + // use, but using the address space of the original other pointer. if (VecTy && !IsWholeAlloca) { if (NumElements == 1) OtherPtrTy = VecTy->getElementType(); else OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); - OtherPtrTy = OtherPtrTy->getPointerTo(); + OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS); } else if (IntTy && !IsWholeAlloca) { - OtherPtrTy = SubIntTy->getPointerTo(); + OtherPtrTy = SubIntTy->getPointerTo(OtherAS); + } else { + OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS); } - Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); + Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, + OtherPtr->getName() + "."); + unsigned SrcAlign = OtherAlign; Value *DstPtr = &NewAI; - if (!IsDest) + unsigned DstAlign = SliceAlign; + if (!IsDest) { std::swap(SrcPtr, DstPtr); + std::swap(SrcAlign, DstAlign); + } Value *Src; if (VecTy && !IsWholeAlloca && !IsDest) { @@ -2564,7 +2639,7 @@ private: uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { - Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), + Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload"); } @@ -2582,7 +2657,7 @@ private: } StoreInst *Store = cast<StoreInst>( - IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); + IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); return !II.isVolatile(); @@ -2594,20 +2669,13 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getArgOperand(1) == OldPtr); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - // Record this instruction for deletion. Pass.DeadInsts.insert(&II); ConstantInt *Size = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), NewEndOffset - NewBeginOffset); - Value *Ptr = - getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType()); + Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) New = IRB.CreateLifetimeStart(Ptr, Size); @@ -2628,28 +2696,22 @@ private: // as local as possible to the PHI. To do that, we re-use the location of // the old pointer, which necessarily must be in the right position to // dominate the PHI. - IRBuilderTy PtrBuilder(OldPtr); - PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + - "."); + IRBuilderTy PtrBuilder(IRB); + PtrBuilder.SetInsertPoint(OldPtr); + PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); - Value *NewPtr = - getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType()); + Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType()); // Replace the operands which were using the old pointer. std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); - // Check whether we can speculate this PHI node, and if so remember that - // fact and queue it up for another iteration after the speculation - // occurs. 
- if (isSafePHIToSpeculate(PN, &DL)) { - Pass.SpeculatablePHIs.insert(&PN); - IsUsedByRewrittenSpeculatableInstructions = true; - return true; - } - - return false; // PHIs can't be promoted on their own. + // PHIs can't be promoted on their own, but often can be speculated. We + // check the speculation outside of the rewriter so that we see the + // fully-rewritten alloca. + PHIUsers.insert(&PN); + return true; } bool visitSelectInst(SelectInst &SI) { @@ -2659,7 +2721,7 @@ private: assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); - Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType()); + Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); // Replace the operands which were using the old pointer. if (SI.getOperand(1) == OldPtr) SI.setOperand(1, NewPtr); @@ -2669,16 +2731,11 @@ private: DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); - // Check whether we can speculate this select instruction, and if so - // remember that fact and queue it up for another iteration after the - // speculation occurs. - if (isSafeSelectToSpeculate(SI, &DL)) { - Pass.SpeculatableSelects.insert(&SI); - IsUsedByRewrittenSpeculatableInstructions = true; - return true; - } - - return false; // Selects can't be promoted on their own. + // Selects can't be promoted on their own, but often can be speculated. We + // check the speculation outside of the rewriter so that we see the + // fully-rewritten alloca. + SelectUsers.insert(&SI); + return true; } }; @@ -2726,10 +2783,9 @@ private: /// Enqueue all the users of the given instruction for further processing. /// This uses a set to de-duplicate users. void enqueueUsers(Instruction &I) { - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; - ++UI) - if (Visited.insert(*UI)) - Queue.push_back(&UI.getUse()); + for (Use &U : I.uses()) + if (Visited.insert(U.getUser())) + Queue.push_back(&U); } // Conservative default is to not rewrite anything. @@ -2942,22 +2998,22 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, return stripAggregateTypeWrapping(DL, Ty); if (Offset > DL.getTypeAllocSize(Ty) || (DL.getTypeAllocSize(Ty) - Offset) < Size) - return 0; + return nullptr; if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { // We can't partition pointers... if (SeqTy->isPointerTy()) - return 0; + return nullptr; Type *ElementTy = SeqTy->getElementType(); uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); uint64_t NumSkippedElements = Offset / ElementSize; if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { if (NumSkippedElements >= ArrTy->getNumElements()) - return 0; + return nullptr; } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) { if (NumSkippedElements >= VecTy->getNumElements()) - return 0; + return nullptr; } Offset -= NumSkippedElements * ElementSize; @@ -2965,7 +3021,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, if (Offset > 0 || Size < ElementSize) { // Bail if the partition ends in a different array element. if ((Offset + Size) > ElementSize) - return 0; + return nullptr; // Recurse through the element type trying to peel off offset bytes. 
return getTypePartition(DL, ElementTy, Offset, Size); } @@ -2976,20 +3032,20 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, assert(Size > ElementSize); uint64_t NumElements = Size / ElementSize; if (NumElements * ElementSize != Size) - return 0; + return nullptr; return ArrayType::get(ElementTy, NumElements); } StructType *STy = dyn_cast<StructType>(Ty); if (!STy) - return 0; + return nullptr; const StructLayout *SL = DL.getStructLayout(STy); if (Offset >= SL->getSizeInBytes()) - return 0; + return nullptr; uint64_t EndOffset = Offset + Size; if (EndOffset > SL->getSizeInBytes()) - return 0; + return nullptr; unsigned Index = SL->getElementContainingOffset(Offset); Offset -= SL->getElementOffset(Index); @@ -2997,12 +3053,12 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, Type *ElementTy = STy->getElementType(Index); uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); if (Offset >= ElementSize) - return 0; // The offset points into alignment padding. + return nullptr; // The offset points into alignment padding. // See if any partition must be contained by the element. if (Offset > 0 || Size < ElementSize) { if ((Offset + Size) > ElementSize) - return 0; + return nullptr; return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); @@ -3015,14 +3071,14 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, if (EndOffset < SL->getSizeInBytes()) { unsigned EndIndex = SL->getElementContainingOffset(EndOffset); if (Index == EndIndex) - return 0; // Within a single element and its padding. + return nullptr; // Within a single element and its padding. // Don't try to form "natural" types if the elements don't line up with the // expected size. // FIXME: We could potentially recurse down through the last element in the // sub-struct to find a natural end point. if (SL->getElementOffset(EndIndex) != EndOffset) - return 0; + return nullptr; assert(Index < EndIndex); EE = STy->element_begin() + EndIndex; @@ -3033,7 +3089,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, STy->isPacked()); const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) - return 0; // The sub-struct doesn't have quite the size needed. + return nullptr; // The sub-struct doesn't have quite the size needed. return SubTy; } @@ -3058,7 +3114,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. - Type *SliceTy = 0; + Type *SliceTy = nullptr; if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) SliceTy = CommonUseTy; @@ -3105,7 +3161,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // the alloca's alignment unconstrained. if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = new AllocaInst(SliceTy, 0, Alignment, + NewAI = new AllocaInst(SliceTy, nullptr, Alignment, AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); ++NumNewAllocas; } @@ -3114,17 +3170,17 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI << "\n"); - // Track the high watermark on several worklists that are only relevant for + // Track the high watermark on the worklist as it is only relevant for // promoted allocas. 
We will reset it to this point if the alloca is not in // fact scheduled for promotion. unsigned PPWOldSize = PostPromotionWorklist.size(); - unsigned SPOldSize = SpeculatablePHIs.size(); - unsigned SSOldSize = SpeculatableSelects.size(); unsigned NumUses = 0; + SmallPtrSet<PHINode *, 8> PHIUsers; + SmallPtrSet<SelectInst *, 8> SelectUsers; AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, EndOffset, IsVectorPromotable, - IsIntegerPromotable); + IsIntegerPromotable, PHIUsers, SelectUsers); bool Promotable = true; for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), SUE = SplitUses.end(); @@ -3145,50 +3201,60 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, MaxUsesPerAllocaPartition = std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); - if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) { - DEBUG(dbgs() << " and queuing for promotion\n"); - PromotableAllocas.push_back(NewAI); - } else if (NewAI != &AI || - (Promotable && - Rewriter.isUsedByRewrittenSpeculatableInstructions())) { + // Now that we've processed all the slices in the new partition, check if any + // PHIs or Selects would block promotion. + for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), + E = PHIUsers.end(); + I != E; ++I) + if (!isSafePHIToSpeculate(**I, DL)) { + Promotable = false; + PHIUsers.clear(); + SelectUsers.clear(); + break; + } + for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), + E = SelectUsers.end(); + I != E; ++I) + if (!isSafeSelectToSpeculate(**I, DL)) { + Promotable = false; + PHIUsers.clear(); + SelectUsers.clear(); + break; + } + + if (Promotable) { + if (PHIUsers.empty() && SelectUsers.empty()) { + // Promote the alloca. + PromotableAllocas.push_back(NewAI); + } else { + // If we have either PHIs or Selects to speculate, add them to those + // worklists and re-queue the new alloca so that we promote in on the + // next iteration. + for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), + E = PHIUsers.end(); + I != E; ++I) + SpeculatablePHIs.insert(*I); + for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), + E = SelectUsers.end(); + I != E; ++I) + SpeculatableSelects.insert(*I); + Worklist.insert(NewAI); + } + } else { // If we can't promote the alloca, iterate on it to check for new // refinements exposed by splitting the current alloca. Don't iterate on an // alloca which didn't actually change and didn't get promoted. - // - // Alternatively, if we could promote the alloca but have speculatable - // instructions then we will speculate them after finishing our processing - // of the original alloca. Mark the new one for re-visiting in the next - // iteration so the speculated operations can be rewritten. - // - // FIXME: We should actually track whether the rewriter changed anything. - Worklist.insert(NewAI); - } - - // Drop any post-promotion work items if promotion didn't happen. - if (!Promotable) { + if (NewAI != &AI) + Worklist.insert(NewAI); + + // Drop any post-promotion work items if promotion didn't happen. 
while (PostPromotionWorklist.size() > PPWOldSize) PostPromotionWorklist.pop_back(); - while (SpeculatablePHIs.size() > SPOldSize) - SpeculatablePHIs.pop_back(); - while (SpeculatableSelects.size() > SSOldSize) - SpeculatableSelects.pop_back(); } return true; } -namespace { -struct IsSliceEndLessOrEqualTo { - uint64_t UpperBound; - - IsSliceEndLessOrEqualTo(uint64_t UpperBound) : UpperBound(UpperBound) {} - - bool operator()(const AllocaSlices::iterator &I) { - return I->endOffset() <= UpperBound; - } -}; -} - static void removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { @@ -3200,7 +3266,9 @@ removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, size_t SplitUsesOldSize = SplitUses.size(); SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), - IsSliceEndLessOrEqualTo(Offset)), + [Offset](const AllocaSlices::iterator &I) { + return I->endOffset() <= Offset; + }), SplitUses.end()); if (SplitUsesOldSize == SplitUses.size()) return; @@ -3227,7 +3295,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { uint64_t BeginOffset = S.begin()->beginOffset(); - for (AllocaSlices::iterator SI = S.begin(), SJ = llvm::next(SI), SE = S.end(); + for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end(); SI != SE; SI = SJ) { uint64_t MaxEndOffset = SI->endOffset(); @@ -3326,6 +3394,21 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { return Changed; } +/// \brief Clobber a use with undef, deleting the used value if it becomes dead. +void SROA::clobberUse(Use &U) { + Value *OldV = U; + // Replace the use with an undef value. + U = UndefValue::get(OldV->getType()); + + // Check for this making an instruction dead. We have to garbage collect + // all the dead instructions to ensure the uses of any alloca end up being + // minimal. + if (Instruction *OldI = dyn_cast<Instruction>(OldV)) + if (isInstructionTriviallyDead(OldI)) { + DeadInsts.insert(OldI); + } +} + /// \brief Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds @@ -3363,21 +3446,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) { for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), DE = S.dead_user_end(); DI != DE; ++DI) { - Changed = true; + // Free up everything used by this instruction. + for (Use &DeadOp : (*DI)->operands()) + clobberUse(DeadOp); + + // Now replace the uses of this instruction. (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); + + // And mark it for deletion. DeadInsts.insert(*DI); + Changed = true; } for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), DE = S.dead_op_end(); DO != DE; ++DO) { - Value *OldV = **DO; - // Clobber the use with an undef value. - **DO = UndefValue::get(OldV->getType()); - if (Instruction *OldI = dyn_cast<Instruction>(OldV)) - if (isInstructionTriviallyDead(OldI)) { - Changed = true; - DeadInsts.insert(OldI); - } + clobberUse(**DO); + Changed = true; } // No slices to split. Leave the dead alloca for a later pass to clean up. @@ -3413,10 +3497,10 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { I->replaceAllUsesWith(UndefValue::get(I->getType())); - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *U = dyn_cast<Instruction>(*OI)) { + for (Use &Operand : I->operands()) + if (Instruction *U = dyn_cast<Instruction>(Operand)) { // Zero out the operand and see if it becomes trivially dead. 
- *OI = 0; + Operand = nullptr; if (isInstructionTriviallyDead(U)) DeadInsts.insert(U); } @@ -3432,10 +3516,9 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { static void enqueueUsersInWorklist(Instruction &I, SmallVectorImpl<Instruction *> &Worklist, SmallPtrSet<Instruction *, 8> &Visited) { - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; - ++UI) - if (Visited.insert(cast<Instruction>(*UI))) - Worklist.push_back(cast<Instruction>(*UI)); + for (User *U : I.users()) + if (Visited.insert(cast<Instruction>(U))) + Worklist.push_back(cast<Instruction>(U)); } /// \brief Promote the allocas, using the best available technique. @@ -3521,32 +3604,24 @@ bool SROA::promoteAllocas(Function &F) { return true; } -namespace { - /// \brief A predicate to test whether an alloca belongs to a set. - class IsAllocaInSet { - typedef SmallPtrSet<AllocaInst *, 4> SetType; - const SetType &Set; - - public: - typedef AllocaInst *argument_type; - - IsAllocaInSet(const SetType &Set) : Set(Set) {} - bool operator()(AllocaInst *AI) const { return Set.count(AI); } - }; -} - bool SROA::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DL = getAnalysisIfAvailable<DataLayout>(); - if (!DL) { + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) { DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); return false; } - DT = getAnalysisIfAvailable<DominatorTree>(); + DL = &DLP->getDataLayout(); + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; BasicBlock &EntryBB = F.getEntryBlock(); - for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end()); + for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) Worklist.insert(AI); @@ -3564,11 +3639,14 @@ bool SROA::runOnFunction(Function &F) { // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
if (!DeletedAllocas.empty()) { - Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); - PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); + auto IsInSet = [&](AllocaInst *AI) { + return DeletedAllocas.count(AI); + }; + Worklist.remove_if(IsInSet); + PostPromotionWorklist.remove_if(IsInSet); PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), PromotableAllocas.end(), - IsAllocaInSet(DeletedAllocas)), + IsInSet), PromotableAllocas.end()); DeletedAllocas.clear(); } @@ -3585,6 +3663,6 @@ bool SROA::runOnFunction(Function &F) { void SROA::getAnalysisUsage(AnalysisUsage &AU) const { if (RequiresDomTree) - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp index 9bcd702a9137..73c97ffeef4f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp @@ -22,38 +22,198 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sample-profile" - +#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/DIContext.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/InstIterator.h" +#include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" +#include <cctype> using namespace llvm; +#define DEBUG_TYPE "sample-profile" + // Command line option to specify the file to read samples from. This is // mainly used for debugging. static cl::opt<std::string> SampleProfileFile( "sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); +static cl::opt<unsigned> SampleProfileMaxPropagateIterations( + "sample-profile-max-propagate-iterations", cl::init(100), + cl::desc("Maximum number of iterations to go through when propagating " + "sample block/edge weights through the CFG.")); + +namespace { +/// \brief Represents the relative location of an instruction. +/// +/// Instruction locations are specified by the line offset from the +/// beginning of the function (marked by the line where the function +/// header is) and the discriminator value within that line. +/// +/// The discriminator value is useful to distinguish instructions +/// that are on the same line but belong to different basic blocks +/// (e.g., the two post-increment instructions in "if (p) x++; else y++;"). 
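To make the discriminator idea above concrete, here is a standalone sketch using a plain std::map instead of the pass's DenseMap, with made-up counts: the two increments in "if (p) x++; else y++;" share a line offset but carry distinct discriminators, so they accumulate separate sample counts.

#include <cstdio>
#include <map>
#include <utility>

int main() {
  // Key: (line offset from the function header, DWARF discriminator).
  std::map<std::pair<int, unsigned>, unsigned> BodySamples;
  BodySamples[{13, 0}] += 180; // x++ in the then-block
  BodySamples[{13, 1}] += 20;  // y++ in the else-block
  for (const auto &S : BodySamples)
    std::printf("line offset %d, discriminator %u: %u samples\n",
                S.first.first, S.first.second, S.second);
  return 0;
}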
+struct InstructionLocation { + InstructionLocation(int L, unsigned D) : LineOffset(L), Discriminator(D) {} + int LineOffset; + unsigned Discriminator; +}; +} + +namespace llvm { +template <> struct DenseMapInfo<InstructionLocation> { + typedef DenseMapInfo<int> OffsetInfo; + typedef DenseMapInfo<unsigned> DiscriminatorInfo; + static inline InstructionLocation getEmptyKey() { + return InstructionLocation(OffsetInfo::getEmptyKey(), + DiscriminatorInfo::getEmptyKey()); + } + static inline InstructionLocation getTombstoneKey() { + return InstructionLocation(OffsetInfo::getTombstoneKey(), + DiscriminatorInfo::getTombstoneKey()); + } + static inline unsigned getHashValue(InstructionLocation Val) { + return DenseMapInfo<std::pair<int, unsigned>>::getHashValue( + std::pair<int, unsigned>(Val.LineOffset, Val.Discriminator)); + } + static inline bool isEqual(InstructionLocation LHS, InstructionLocation RHS) { + return LHS.LineOffset == RHS.LineOffset && + LHS.Discriminator == RHS.Discriminator; + } +}; +} namespace { +typedef DenseMap<InstructionLocation, unsigned> BodySampleMap; +typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; +typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; +typedef std::pair<BasicBlock *, BasicBlock *> Edge; +typedef DenseMap<Edge, unsigned> EdgeWeightMap; +typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; + +/// \brief Representation of the runtime profile for a function. +/// +/// This data structure contains the runtime profile for a given +/// function. It contains the total number of samples collected +/// in the function and a map of samples collected in every statement. +class SampleFunctionProfile { +public: + SampleFunctionProfile() + : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(nullptr), + PDT(nullptr), LI(nullptr), Ctx(nullptr) {} + + unsigned getFunctionLoc(Function &F); + bool emitAnnotations(Function &F, DominatorTree *DomTree, + PostDominatorTree *PostDomTree, LoopInfo *Loops); + unsigned getInstWeight(Instruction &I); + unsigned getBlockWeight(BasicBlock *B); + void addTotalSamples(unsigned Num) { TotalSamples += Num; } + void addHeadSamples(unsigned Num) { TotalHeadSamples += Num; } + void addBodySamples(int LineOffset, unsigned Discriminator, unsigned Num) { + assert(LineOffset >= 0); + BodySamples[InstructionLocation(LineOffset, Discriminator)] += Num; + } + void print(raw_ostream &OS); + void printEdgeWeight(raw_ostream &OS, Edge E); + void printBlockWeight(raw_ostream &OS, BasicBlock *BB); + void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); + bool computeBlockWeights(Function &F); + void findEquivalenceClasses(Function &F); + void findEquivalencesFor(BasicBlock *BB1, + SmallVector<BasicBlock *, 8> Descendants, + DominatorTreeBase<BasicBlock> *DomTree); + void propagateWeights(Function &F); + unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); + void buildEdges(Function &F); + bool propagateThroughEdges(Function &F); + bool empty() { return BodySamples.empty(); } + +protected: + /// \brief Total number of samples collected inside this function. + /// + /// Samples are cumulative, they include all the samples collected + /// inside this function and all its inlined callees. + unsigned TotalSamples; + + /// \brief Total number of samples collected at the head of the function. + /// FIXME: Use head samples to estimate a cold/hot attribute for the function. + unsigned TotalHeadSamples; + + /// \brief Line number for the function header. 
Used to compute relative + /// line numbers from the absolute line LOCs found in instruction locations. + /// The relative line numbers are needed to address the samples from the + /// profile file. + unsigned HeaderLineno; + + /// \brief Map line offsets to collected samples. + /// + /// Each entry in this map contains the number of samples + /// collected at the corresponding line offset. All line locations + /// are an offset from the start of the function. + BodySampleMap BodySamples; + + /// \brief Map basic blocks to their computed weights. + /// + /// The weight of a basic block is defined to be the maximum + /// of all the instruction weights in that block. + BlockWeightMap BlockWeights; + + /// \brief Map edges to their computed weights. + /// + /// Edge weights are computed by propagating basic block weights in + /// SampleProfile::propagateWeights. + EdgeWeightMap EdgeWeights; + + /// \brief Set of visited blocks during propagation. + SmallPtrSet<BasicBlock *, 128> VisitedBlocks; + + /// \brief Set of visited edges during propagation. + SmallSet<Edge, 128> VisitedEdges; + + /// \brief Equivalence classes for block weights. + /// + /// Two blocks BB1 and BB2 are in the same equivalence class if they + /// dominate and post-dominate each other, and they are in the same loop + /// nest. When this happens, the two blocks are guaranteed to execute + /// the same number of times. + EquivalenceClassMap EquivalenceClass; + + /// \brief Dominance, post-dominance and loop information. + DominatorTree *DT; + PostDominatorTree *PDT; + LoopInfo *LI; + + /// \brief Predecessors for each basic block in the CFG. + BlockEdgeMap Predecessors; + + /// \brief Successors for each basic block in the CFG. + BlockEdgeMap Successors; + + /// \brief LLVM context holding the debug data we need. + LLVMContext *Ctx; +}; + /// \brief Sample-based profile reader. /// /// Each profile contains sample counts for all the functions @@ -77,61 +237,33 @@ namespace { /// 2. The samples collected at each line in F. To provide some /// protection against source code shuffling, line numbers should /// be relative to the start of the function. -class SampleProfile { +class SampleModuleProfile { public: - SampleProfile(StringRef F) : Profiles(0), Filename(F) {} + SampleModuleProfile(const Module &M, StringRef F) + : Profiles(0), Filename(F), M(M) {} void dump(); - void loadText(); + bool loadText(); void loadNative() { llvm_unreachable("not implemented"); } - bool emitAnnotations(Function &F); void printFunctionProfile(raw_ostream &OS, StringRef FName); void dumpFunctionProfile(StringRef FName); + SampleFunctionProfile &getProfile(const Function &F) { + return Profiles[F.getName()]; + } -protected: - typedef DenseMap<uint32_t, uint32_t> BodySampleMap; - typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap; - - /// \brief Representation of the runtime profile for a function. - /// - /// This data structure contains the runtime profile for a given - /// function. It contains the total number of samples collected - /// in the function and a map of samples collected in every statement. - struct FunctionProfile { - /// \brief Total number of samples collected inside this function. - /// - /// Samples are cumulative, they include all the samples collected - /// inside this function and all its inlined callees. - unsigned TotalSamples; - - // \brief Total number of samples collected at the head of the function. - unsigned TotalHeadSamples; - - /// \brief Map line offsets to collected samples. 
- /// - /// Each entry in this map contains the number of samples - /// collected at the corresponding line offset. All line locations - /// are an offset from the start of the function. - BodySampleMap BodySamples; - - /// \brief Map basic blocks to their computed weights. - /// - /// The weight of a basic block is defined to be the maximum - /// of all the instruction weights in that block. - BlockWeightMap BlockWeights; - }; - - uint32_t getInstWeight(Instruction &I, unsigned FirstLineno, - BodySampleMap &BodySamples); - uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno, - BodySampleMap &BodySamples); + /// \brief Report a parse error message. + void reportParseError(int64_t LineNumber, Twine Msg) const { + DiagnosticInfoSampleProfile Diag(Filename.data(), LineNumber, Msg); + M.getContext().diagnose(Diag); + } +protected: /// \brief Map every function to its associated profile. /// /// The profile of every function executed at runtime is collected - /// in the structure FunctionProfile. This maps function objects + /// in the structure SampleFunctionProfile. This maps function objects /// to their corresponding profiles. - StringMap<FunctionProfile> Profiles; + StringMap<SampleFunctionProfile> Profiles; /// \brief Path name to the file holding the profile data. /// @@ -140,63 +272,10 @@ protected: /// version of the profile format to be used in constructing test /// cases and debugging. StringRef Filename; -}; -/// \brief Loader class for text-based profiles. -/// -/// This class defines a simple interface to read text files containing -/// profiles. It keeps track of line number information and location of -/// the file pointer. Users of this class are responsible for actually -/// parsing the lines returned by the readLine function. -/// -/// TODO - This does not really belong here. It is a generic text file -/// reader. It should be moved to the Support library and made more general. -class ExternalProfileTextLoader { -public: - ExternalProfileTextLoader(StringRef F) : Filename(F) { - error_code EC; - EC = MemoryBuffer::getFile(Filename, Buffer); - if (EC) - report_fatal_error("Could not open profile file " + Filename + ": " + - EC.message()); - FP = Buffer->getBufferStart(); - Lineno = 0; - } - - /// \brief Read a line from the mapped file. - StringRef readLine() { - size_t Length = 0; - const char *start = FP; - while (FP != Buffer->getBufferEnd() && *FP != '\n') { - Length++; - FP++; - } - if (FP != Buffer->getBufferEnd()) - FP++; - Lineno++; - return StringRef(start, Length); - } - - /// \brief Return true, if we've reached EOF. - bool atEOF() const { return FP == Buffer->getBufferEnd(); } - - /// \brief Report a parse error message and stop compilation. - void reportParseError(Twine Msg) const { - report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n"); - } - -private: - /// \brief Memory buffer holding the text file. - OwningPtr<MemoryBuffer> Buffer; - - /// \brief Current position into the memory buffer. - const char *FP; - - /// \brief Current line number. - int64_t Lineno; - - /// \brief Path name where to the profile file. - StringRef Filename; + /// \brief Module being compiled. Used mainly to access the current + /// LLVM context for diagnostics. + const Module &M; }; /// \brief Sample profile pass. 
@@ -210,148 +289,242 @@ public: static char ID; SampleProfileLoader(StringRef Name = SampleProfileFile) - : FunctionPass(ID), Profiler(0), Filename(Name) { + : FunctionPass(ID), Profiler(), Filename(Name), ProfileIsValid(false) { initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); } - virtual bool doInitialization(Module &M); + bool doInitialization(Module &M) override; void dump() { Profiler->dump(); } - virtual const char *getPassName() const { return "Sample profile pass"; } + const char *getPassName() const override { return "Sample profile pass"; } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<LoopInfo>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTree>(); } protected: /// \brief Profile reader object. - OwningPtr<SampleProfile> Profiler; + std::unique_ptr<SampleModuleProfile> Profiler; /// \brief Name of the profile file to load. StringRef Filename; + + /// \brief Flag indicating whether the profile input loaded successfully. + bool ProfileIsValid; }; } -/// \brief Print the function profile for \p FName on stream \p OS. +/// \brief Print this function profile on stream \p OS. /// /// \param OS Stream to emit the output to. -/// \param FName Name of the function to print. -void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) { - FunctionProfile FProfile = Profiles[FName]; - OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", " - << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size() +void SampleFunctionProfile::print(raw_ostream &OS) { + OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size() << " sampled lines\n"; - for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(), - SE = FProfile.BodySamples.end(); + for (BodySampleMap::const_iterator SI = BodySamples.begin(), + SE = BodySamples.end(); SI != SE; ++SI) - OS << "\tline offset: " << SI->first + OS << "\tline offset: " << SI->first.LineOffset + << ", discriminator: " << SI->first.Discriminator << ", number of samples: " << SI->second << "\n"; OS << "\n"; } +/// \brief Print the weight of edge \p E on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param E Edge to print. +void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) { + OS << "weight[" << E.first->getName() << "->" << E.second->getName() + << "]: " << EdgeWeights[E] << "\n"; +} + +/// \brief Print the equivalence class of block \p BB on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param BB Block to print. +void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS, + BasicBlock *BB) { + BasicBlock *Equiv = EquivalenceClass[BB]; + OS << "equivalence[" << BB->getName() + << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; +} + +/// \brief Print the weight of block \p BB on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param BB Block to print. +void SampleFunctionProfile::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { + OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; +} + +/// \brief Print the function profile for \p FName on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param FName Name of the function to print. 
+void SampleModuleProfile::printFunctionProfile(raw_ostream &OS, + StringRef FName) { + OS << "Function: " << FName << ":\n"; + Profiles[FName].print(OS); +} + /// \brief Dump the function profile for \p FName. /// /// \param FName Name of the function to print. -void SampleProfile::dumpFunctionProfile(StringRef FName) { +void SampleModuleProfile::dumpFunctionProfile(StringRef FName) { printFunctionProfile(dbgs(), FName); } /// \brief Dump all the function profiles found. -void SampleProfile::dump() { - for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(), - E = Profiles.end(); +void SampleModuleProfile::dump() { + for (StringMap<SampleFunctionProfile>::const_iterator I = Profiles.begin(), + E = Profiles.end(); I != E; ++I) dumpFunctionProfile(I->getKey()); } /// \brief Load samples from a text file. /// -/// The file is divided in two segments: -/// -/// Symbol table (represented with the string "symbol table") -/// Number of symbols in the table -/// symbol 1 -/// symbol 2 -/// ... -/// symbol N +/// The file contains a list of samples for every function executed at +/// runtime. Each function profile has the following format: /// -/// Function body profiles -/// function1:total_samples:total_head_samples:number_of_locations -/// location_offset_1: number_of_samples -/// location_offset_2: number_of_samples +/// function1:total_samples:total_head_samples +/// offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ] +/// offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ] /// ... -/// location_offset_N: number_of_samples +/// offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ] /// /// Function names must be mangled in order for the profile loader to -/// match them in the current translation unit. +/// match them in the current translation unit. The two numbers in the +/// function header specify how many total samples were accumulated in +/// the function (first number), and the total number of samples accumulated +/// at the prologue of the function (second number). This head sample +/// count provides an indicator of how frequent is the function invoked. +/// +/// Each sampled line may contain several items. Some are optional +/// (marked below): +/// +/// a- Source line offset. This number represents the line number +/// in the function where the sample was collected. The line number +/// is always relative to the line where symbol of the function +/// is defined. So, if the function has its header at line 280, +/// the offset 13 is at line 293 in the file. +/// +/// b- [OPTIONAL] Discriminator. This is used if the sampled program +/// was compiled with DWARF discriminator support +/// (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators) +/// +/// c- Number of samples. This is the number of samples collected by +/// the profiler at this source location. +/// +/// d- [OPTIONAL] Potential call targets and samples. If present, this +/// line contains a call instruction. This models both direct and +/// indirect calls. Each called target is listed together with the +/// number of samples. For example, +/// +/// 130: 7 foo:3 bar:2 baz:7 +/// +/// The above means that at relative line offset 130 there is a +/// call instruction that calls one of foo(), bar() and baz(). With +/// baz() being the relatively more frequent call target. +/// +/// FIXME: This is currently unhandled, but it has a lot of +/// potential for aiding the inliner. 
+/// /// /// Since this is a flat profile, a function that shows up more than /// once gets all its samples aggregated across all its instances. -/// TODO - flat profiles are too imprecise to provide good optimization -/// opportunities. Convert them to context-sensitive profile. +/// +/// FIXME: flat profiles are too imprecise to provide good optimization +/// opportunities. Convert them to context-sensitive profile. /// /// This textual representation is useful to generate unit tests and /// for debugging purposes, but it should not be used to generate /// profiles for large programs, as the representation is extremely /// inefficient. -void SampleProfile::loadText() { - ExternalProfileTextLoader Loader(Filename); - - // Read the symbol table. - StringRef Line = Loader.readLine(); - if (Line != "symbol table") - Loader.reportParseError("Expected 'symbol table', found " + Line); - int NumSymbols; - Line = Loader.readLine(); - if (Line.getAsInteger(10, NumSymbols)) - Loader.reportParseError("Expected a number, found " + Line); - for (int I = 0; I < NumSymbols; I++) { - StringRef FName = Loader.readLine(); - FunctionProfile &FProfile = Profiles[FName]; - FProfile.BodySamples.clear(); - FProfile.TotalSamples = 0; - FProfile.TotalHeadSamples = 0; +/// +/// \returns true if the file was loaded successfully, false otherwise. +bool SampleModuleProfile::loadText() { + ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = + MemoryBuffer::getFile(Filename); + if (std::error_code EC = BufferOrErr.getError()) { + std::string Msg(EC.message()); + M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); + return false; } + std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); + line_iterator LineIt(*Buffer, '#'); // Read the profile of each function. Since each function may be // mentioned more than once, and we are collecting flat profiles, // accumulate samples as we parse them. - Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$"); - Regex LineSample("^([0-9]+): ([0-9]+)$"); - while (!Loader.atEOF()) { - SmallVector<StringRef, 4> Matches; - Line = Loader.readLine(); - if (!HeadRE.match(Line, &Matches)) - Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " + - Line); - assert(Matches.size() == 5); + Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$"); + Regex LineSample("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$"); + while (!LineIt.is_at_eof()) { + // Read the header of each function. + // + // Note that for function identifiers we are actually expecting + // mangled names, but we may not always get them. This happens when + // the compiler decides not to emit the function (e.g., it was inlined + // and removed). In this case, the binary will not have the linkage + // name for the function, so the profiler will emit the function's + // unmangled name, which may contain characters like ':' and '>' in its + // name (member functions, templates, etc). + // + // The only requirement we place on the identifier, then, is that it + // should not begin with a number. 
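As a concrete illustration of the format just described, the two regular expressions used by the parser below accept a header line such as "main:184019:0" and a body line such as "5.1: 1075" (line offset 5, discriminator 1, 1075 samples). A self-contained sketch with invented names and counts:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Regex.h"
    using namespace llvm;

    int main() {
      // Same patterns as SampleModuleProfile::loadText().
      Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$");
      Regex LineSample("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$");

      SmallVector<StringRef, 5> Matches;
      // Function header: name, total samples, total head samples.
      bool HeaderOk = HeadRE.match("main:184019:0", &Matches);
      // Body line: offset 5, discriminator 1, 1075 samples, no call targets.
      bool BodyOk = LineSample.match("5.1: 1075", &Matches);
      return (HeaderOk && BodyOk) ? 0 : 1;
    }

Body lines always start with a digit and header lines never do, which is how the parser knows where one function profile ends and the next begins.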
+ SmallVector<StringRef, 3> Matches; + if (!HeadRE.match(*LineIt, &Matches)) { + reportParseError(LineIt.line_number(), + "Expected 'mangled_name:NUM:NUM', found " + *LineIt); + return false; + } + assert(Matches.size() == 4); StringRef FName = Matches[1]; - unsigned NumSamples, NumHeadSamples, NumSampledLines; + unsigned NumSamples, NumHeadSamples; Matches[2].getAsInteger(10, NumSamples); Matches[3].getAsInteger(10, NumHeadSamples); - Matches[4].getAsInteger(10, NumSampledLines); - FunctionProfile &FProfile = Profiles[FName]; - FProfile.TotalSamples += NumSamples; - FProfile.TotalHeadSamples += NumHeadSamples; - BodySampleMap &SampleMap = FProfile.BodySamples; - unsigned I; - for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) { - Line = Loader.readLine(); - if (!LineSample.match(Line, &Matches)) - Loader.reportParseError("Expected 'NUM: NUM', found " + Line); - assert(Matches.size() == 3); - unsigned LineOffset, NumSamples; + Profiles[FName] = SampleFunctionProfile(); + SampleFunctionProfile &FProfile = Profiles[FName]; + FProfile.addTotalSamples(NumSamples); + FProfile.addHeadSamples(NumHeadSamples); + ++LineIt; + + // Now read the body. The body of the function ends when we reach + // EOF or when we see the start of the next function. + while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) { + if (!LineSample.match(*LineIt, &Matches)) { + reportParseError( + LineIt.line_number(), + "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt); + return false; + } + assert(Matches.size() == 5); + unsigned LineOffset, NumSamples, Discriminator = 0; Matches[1].getAsInteger(10, LineOffset); - Matches[2].getAsInteger(10, NumSamples); - SampleMap[LineOffset] += NumSamples; - } + if (Matches[2] != "") + Matches[2].getAsInteger(10, Discriminator); + Matches[3].getAsInteger(10, NumSamples); - if (I < NumSampledLines) - Loader.reportParseError("Unexpected end of file"); + // FIXME: Handle called targets (in Matches[4]). + + // When dealing with instruction weights, we use the value + // zero to indicate the absence of a sample. If we read an + // actual zero from the profile file, return it as 1 to + // avoid the confusion later on. + if (NumSamples == 0) + NumSamples = 1; + FProfile.addBodySamples(LineOffset, Discriminator, NumSamples); + ++LineIt; + } } + + return true; } /// \brief Get the weight for an instruction. @@ -359,46 +532,49 @@ void SampleProfile::loadText() { /// The "weight" of an instruction \p Inst is the number of samples /// collected on that instruction at runtime. To retrieve it, we /// need to compute the line number of \p Inst relative to the start of its -/// function. We use \p FirstLineno to compute the offset. We then -/// look up the samples collected for \p Inst using \p BodySamples. +/// function. We use HeaderLineno to compute the offset. We then +/// look up the samples collected for \p Inst using BodySamples. /// /// \param Inst Instruction to query. -/// \param FirstLineno Line number of the first instruction in the function. -/// \param BodySamples Map of relative source line locations to samples. /// /// \returns The profiled weight of I. 
-uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno, - BodySampleMap &BodySamples) { - unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1; - return BodySamples.lookup(LOffset); +unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) { + DebugLoc DLoc = Inst.getDebugLoc(); + unsigned Lineno = DLoc.getLine(); + if (Lineno < HeaderLineno) + return 0; + + DILocation DIL(DLoc.getAsMDNode(*Ctx)); + int LOffset = Lineno - HeaderLineno; + unsigned Discriminator = DIL.getDiscriminator(); + unsigned Weight = + BodySamples.lookup(InstructionLocation(LOffset, Discriminator)); + DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst + << " (line offset: " << LOffset << "." << Discriminator + << " - weight: " << Weight << ")\n"); + return Weight; } /// \brief Compute the weight of a basic block. /// /// The weight of basic block \p B is the maximum weight of all the -/// instructions in B. +/// instructions in B. The weight of \p B is computed and cached in +/// the BlockWeights map. /// /// \param B The basic block to query. -/// \param FirstLineno The line number for the first line in the -/// function holding B. -/// \param BodySamples The map containing all the samples collected in that -/// function. /// /// \returns The computed weight of B. -uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, - BodySampleMap &BodySamples) { +unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) { // If we've computed B's weight before, return it. - Function *F = B->getParent(); - FunctionProfile &FProfile = Profiles[F->getName()]; std::pair<BlockWeightMap::iterator, bool> Entry = - FProfile.BlockWeights.insert(std::make_pair(B, 0)); + BlockWeights.insert(std::make_pair(B, 0)); if (!Entry.second) return Entry.first->second; // Otherwise, compute and cache B's weight. - uint32_t Weight = 0; + unsigned Weight = 0; for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { - uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples); + unsigned InstWeight = getInstWeight(*I); if (InstWeight > Weight) Weight = InstWeight; } @@ -406,31 +582,344 @@ uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, return Weight; } -/// \brief Generate branch weight metadata for all branches in \p F. +/// \brief Compute and store the weights of every basic block. +/// +/// This populates the BlockWeights map by computing +/// the weights of every basic block in the CFG. +/// +/// \param F The function to query. +bool SampleFunctionProfile::computeBlockWeights(Function &F) { + bool Changed = false; + DEBUG(dbgs() << "Block weights\n"); + for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { + unsigned Weight = getBlockWeight(B); + Changed |= (Weight > 0); + DEBUG(printBlockWeight(dbgs(), B)); + } + + return Changed; +} + +/// \brief Find equivalence classes for the given block. /// -/// For every branch instruction B in \p F, we compute the weight of the -/// target block for each of the edges out of B. This is the weight -/// that we associate with that branch. +/// This finds all the blocks that are guaranteed to execute the same +/// number of times as \p BB1. To do this, it traverses all the the +/// descendants of \p BB1 in the dominator or post-dominator tree. /// -/// TODO - This weight assignment will most likely be wrong if the -/// target branch has more than two predecessors. This needs to be done -/// using some form of flow propagation. 
+/// A block BB2 will be in the same equivalence class as \p BB1 if +/// the following holds: /// -/// Once all the branch weights are computed, we emit the MD_prof -/// metadata on B using the computed values. +/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2 +/// is a descendant of \p BB1 in the dominator tree, then BB2 should +/// dominate BB1 in the post-dominator tree. +/// +/// 2- Both BB2 and \p BB1 must be in the same loop. +/// +/// For every block BB2 that meets those two requirements, we set BB2's +/// equivalence class to \p BB1. +/// +/// \param BB1 Block to check. +/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree. +/// \param DomTree Opposite dominator tree. If \p Descendants is filled +/// with blocks from \p BB1's dominator tree, then +/// this is the post-dominator tree, and vice versa. +void SampleFunctionProfile::findEquivalencesFor( + BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, + DominatorTreeBase<BasicBlock> *DomTree) { + for (SmallVectorImpl<BasicBlock *>::iterator I = Descendants.begin(), + E = Descendants.end(); + I != E; ++I) { + BasicBlock *BB2 = *I; + bool IsDomParent = DomTree->dominates(BB2, BB1); + bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); + if (BB1 != BB2 && VisitedBlocks.insert(BB2) && IsDomParent && + IsInSameLoop) { + EquivalenceClass[BB2] = BB1; + + // If BB2 is heavier than BB1, make BB2 have the same weight + // as BB1. + // + // Note that we don't worry about the opposite situation here + // (when BB2 is lighter than BB1). We will deal with this + // during the propagation phase. Right now, we just want to + // make sure that BB1 has the largest weight of all the + // members of its equivalence set. + unsigned &BB1Weight = BlockWeights[BB1]; + unsigned &BB2Weight = BlockWeights[BB2]; + BB1Weight = std::max(BB1Weight, BB2Weight); + } + } +} + +/// \brief Find equivalence classes. +/// +/// Since samples may be missing from blocks, we can fill in the gaps by setting +/// the weights of all the blocks in the same equivalence class to the same +/// weight. To compute the concept of equivalence, we use dominance and loop +/// information. Two blocks B1 and B2 are in the same equivalence class if B1 +/// dominates B2, B2 post-dominates B1 and both are in the same loop. /// /// \param F The function to query. -bool SampleProfile::emitAnnotations(Function &F) { +void SampleFunctionProfile::findEquivalenceClasses(Function &F) { + SmallVector<BasicBlock *, 8> DominatedBBs; + DEBUG(dbgs() << "\nBlock equivalence classes\n"); + // Find equivalence sets based on dominance and post-dominance information. + for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { + BasicBlock *BB1 = B; + + // Compute BB1's equivalence class once. + if (EquivalenceClass.count(BB1)) { + DEBUG(printBlockEquivalence(dbgs(), BB1)); + continue; + } + + // By default, blocks are in their own equivalence class. + EquivalenceClass[BB1] = BB1; + + // Traverse all the blocks dominated by BB1. We are looking for + // every basic block BB2 such that: + // + // 1- BB1 dominates BB2. + // 2- BB2 post-dominates BB1. + // 3- BB1 and BB2 are in the same loop nest. + // + // If all those conditions hold, it means that BB2 is executed + // as many times as BB1, so they are placed in the same equivalence + // class by making BB2's equivalence class be BB1. 
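For intuition, take a hypothetical if-then-else diamond with blocks Entry, Then, Else and Join, none of them inside a loop (the block names are invented; DT, PDT and LI are the analyses held by this class). The two requirements above amount to:

    // Entry and Join are guaranteed to execute the same number of times:
    bool SameClass =
        DT->dominates(Entry, Join) &&                   // Entry dominates Join.
        PDT->DT->dominates(Join, Entry) &&              // Join post-dominates Entry.
        LI->getLoopFor(Entry) == LI->getLoopFor(Join);  // Same (here: no) loop nest.

Then and Else fail the post-dominance half of the test against Entry, so each stays the head of its own class, while EquivalenceClass[Join] is set to Entry and both end up with the larger of their two weights.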
+ DominatedBBs.clear(); + DT->getDescendants(BB1, DominatedBBs); + findEquivalencesFor(BB1, DominatedBBs, PDT->DT); + + // Repeat the same logic for all the blocks post-dominated by BB1. + // We are looking for every basic block BB2 such that: + // + // 1- BB1 post-dominates BB2. + // 2- BB2 dominates BB1. + // 3- BB1 and BB2 are in the same loop nest. + // + // If all those conditions hold, BB2's equivalence class is BB1. + DominatedBBs.clear(); + PDT->getDescendants(BB1, DominatedBBs); + findEquivalencesFor(BB1, DominatedBBs, DT); + + DEBUG(printBlockEquivalence(dbgs(), BB1)); + } + + // Assign weights to equivalence classes. + // + // All the basic blocks in the same equivalence class will execute + // the same number of times. Since we know that the head block in + // each equivalence class has the largest weight, assign that weight + // to all the blocks in that equivalence class. + DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); + for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { + BasicBlock *BB = B; + BasicBlock *EquivBB = EquivalenceClass[BB]; + if (BB != EquivBB) + BlockWeights[BB] = BlockWeights[EquivBB]; + DEBUG(printBlockWeight(dbgs(), BB)); + } +} + +/// \brief Visit the given edge to decide if it has a valid weight. +/// +/// If \p E has not been visited before, we copy to \p UnknownEdge +/// and increment the count of unknown edges. +/// +/// \param E Edge to visit. +/// \param NumUnknownEdges Current number of unknown edges. +/// \param UnknownEdge Set if E has not been visited before. +/// +/// \returns E's weight, if known. Otherwise, return 0. +unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges, + Edge *UnknownEdge) { + if (!VisitedEdges.count(E)) { + (*NumUnknownEdges)++; + *UnknownEdge = E; + return 0; + } + + return EdgeWeights[E]; +} + +/// \brief Propagate weights through incoming/outgoing edges. +/// +/// If the weight of a basic block is known, and there is only one edge +/// with an unknown weight, we can calculate the weight of that edge. +/// +/// Similarly, if all the edges have a known count, we can calculate the +/// count of the basic block, if needed. +/// +/// \param F Function to process. +/// +/// \returns True if new weights were assigned to edges or blocks. +bool SampleFunctionProfile::propagateThroughEdges(Function &F) { bool Changed = false; - FunctionProfile &FProfile = Profiles[F.getName()]; - unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine(); - MDBuilder MDB(F.getContext()); + DEBUG(dbgs() << "\nPropagation through edges\n"); + for (Function::iterator BI = F.begin(), EI = F.end(); BI != EI; ++BI) { + BasicBlock *BB = BI; + + // Visit all the predecessor and successor edges to determine + // which ones have a weight assigned already. Note that it doesn't + // matter that we only keep track of a single unknown edge. The + // only case we are interested in handling is when only a single + // edge is unknown (see setEdgeOrBlockWeight). + for (unsigned i = 0; i < 2; i++) { + unsigned TotalWeight = 0; + unsigned NumUnknownEdges = 0; + Edge UnknownEdge, SelfReferentialEdge; + + if (i == 0) { + // First, visit all predecessor edges. + for (size_t I = 0; I < Predecessors[BB].size(); I++) { + Edge E = std::make_pair(Predecessors[BB][I], BB); + TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); + if (E.first == E.second) + SelfReferentialEdge = E; + } + } else { + // On the second round, visit all successor edges. 
+ for (size_t I = 0; I < Successors[BB].size(); I++) { + Edge E = std::make_pair(BB, Successors[BB][I]); + TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); + } + } + + // After visiting all the edges, there are three cases that we + // can handle immediately: + // + // - All the edge weights are known (i.e., NumUnknownEdges == 0). + // In this case, we simply check that the sum of all the edges + // is the same as BB's weight. If not, we change BB's weight + // to match. Additionally, if BB had not been visited before, + // we mark it visited. + // + // - Only one edge is unknown and BB has already been visited. + // In this case, we can compute the weight of the edge by + // subtracting the total block weight from all the known + // edge weights. If the edges weight more than BB, then the + // edge of the last remaining edge is set to zero. + // + // - There exists a self-referential edge and the weight of BB is + // known. In this case, this edge can be based on BB's weight. + // We add up all the other known edges and set the weight on + // the self-referential edge as we did in the previous case. + // + // In any other case, we must continue iterating. Eventually, + // all edges will get a weight, or iteration will stop when + // it reaches SampleProfileMaxPropagateIterations. + if (NumUnknownEdges <= 1) { + unsigned &BBWeight = BlockWeights[BB]; + if (NumUnknownEdges == 0) { + // If we already know the weight of all edges, the weight of the + // basic block can be computed. It should be no larger than the sum + // of all edge weights. + if (TotalWeight > BBWeight) { + BBWeight = TotalWeight; + Changed = true; + DEBUG(dbgs() << "All edge weights for " << BB->getName() + << " known. Set weight for block: "; + printBlockWeight(dbgs(), BB);); + } + if (VisitedBlocks.insert(BB)) + Changed = true; + } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { + // If there is a single unknown edge and the block has been + // visited, then we can compute E's weight. + if (BBWeight >= TotalWeight) + EdgeWeights[UnknownEdge] = BBWeight - TotalWeight; + else + EdgeWeights[UnknownEdge] = 0; + VisitedEdges.insert(UnknownEdge); + Changed = true; + DEBUG(dbgs() << "Set weight for edge: "; + printEdgeWeight(dbgs(), UnknownEdge)); + } + } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) { + unsigned &BBWeight = BlockWeights[BB]; + // We have a self-referential edge and the weight of BB is known. + if (BBWeight >= TotalWeight) + EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; + else + EdgeWeights[SelfReferentialEdge] = 0; + VisitedEdges.insert(SelfReferentialEdge); + Changed = true; + DEBUG(dbgs() << "Set self-referential edge weight to: "; + printEdgeWeight(dbgs(), SelfReferentialEdge)); + } + } + } + + return Changed; +} + +/// \brief Build in/out edge lists for each basic block in the CFG. +/// +/// We are interested in unique edges. If a block B1 has multiple +/// edges to another block B2, we only add a single B1->B2 edge. +void SampleFunctionProfile::buildEdges(Function &F) { + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *B1 = I; + + // Add predecessors for B1. + SmallPtrSet<BasicBlock *, 16> Visited; + if (!Predecessors[B1].empty()) + llvm_unreachable("Found a stale predecessors list in a basic block."); + for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) { + BasicBlock *B2 = *PI; + if (Visited.insert(B2)) + Predecessors[B1].push_back(B2); + } + + // Add successors for B1. 
+ Visited.clear(); + if (!Successors[B1].empty()) + llvm_unreachable("Found a stale successors list in a basic block."); + for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) { + BasicBlock *B2 = *SI; + if (Visited.insert(B2)) + Successors[B1].push_back(B2); + } + } +} - // Clear the block weights cache. - FProfile.BlockWeights.clear(); +/// \brief Propagate weights into edges +/// +/// The following rules are applied to every block B in the CFG: +/// +/// - If B has a single predecessor/successor, then the weight +/// of that edge is the weight of the block. +/// +/// - If all incoming or outgoing edges are known except one, and the +/// weight of the block is already known, the weight of the unknown +/// edge will be the weight of the block minus the sum of all the known +/// edges. If the sum of all the known edges is larger than B's weight, +/// we set the unknown edge weight to zero. +/// +/// - If there is a self-referential edge, and the weight of the block is +/// known, the weight for that edge is set to the weight of the block +/// minus the weight of the other incoming edges to that block (if +/// known). +void SampleFunctionProfile::propagateWeights(Function &F) { + bool Changed = true; + unsigned i = 0; + + // Before propagation starts, build, for each block, a list of + // unique predecessors and successors. This is necessary to handle + // identical edges in multiway branches. Since we visit all blocks and all + // edges of the CFG, it is cleaner to build these lists once at the start + // of the pass. + buildEdges(F); + + // Propagate until we converge or we go past the iteration limit. + while (Changed && i++ < SampleProfileMaxPropagateIterations) { + Changed = propagateThroughEdges(F); + } - // When we find a branch instruction: For each edge E out of the branch, - // the weight of E is the weight of the target block. + // Generate MD_prof metadata for every branch instruction using the + // edge weights computed during propagation. + DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); + MDBuilder MDB(F.getContext()); for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { BasicBlock *B = I; TerminatorInst *TI = B->getTerminator(); @@ -439,34 +928,155 @@ bool SampleProfile::emitAnnotations(Function &F) { if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) continue; - SmallVector<uint32_t, 4> Weights; - unsigned NSuccs = TI->getNumSuccessors(); - for (unsigned I = 0; I < NSuccs; ++I) { + DEBUG(dbgs() << "\nGetting weights for branch at line " + << TI->getDebugLoc().getLine() << ".\n"); + SmallVector<unsigned, 4> Weights; + bool AllWeightsZero = true; + for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); - uint32_t Weight = - computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples); + Edge E = std::make_pair(B, Succ); + unsigned Weight = EdgeWeights[E]; + DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); Weights.push_back(Weight); + if (Weight != 0) + AllWeightsZero = false; } - TI->setMetadata(llvm::LLVMContext::MD_prof, - MDB.createBranchWeights(Weights)); - Changed = true; + // Only set weights if there is at least one non-zero weight. + // In any other case, let the analyzer set weights. + if (!AllWeightsZero) { + DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + } else { + DEBUG(dbgs() << "SKIPPED. 
All branch weights are zero.\n"); + } } +} - return Changed; +/// \brief Get the line number for the function header. +/// +/// This looks up function \p F in the current compilation unit and +/// retrieves the line number where the function is defined. This is +/// line 0 for all the samples read from the profile file. Every line +/// number is relative to this line. +/// +/// \param F Function object to query. +/// +/// \returns the line number where \p F is defined. If it returns 0, +/// it means that there is no debug information available for \p F. +unsigned SampleFunctionProfile::getFunctionLoc(Function &F) { + NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu"); + if (CUNodes) { + for (unsigned I = 0, E1 = CUNodes->getNumOperands(); I != E1; ++I) { + DICompileUnit CU(CUNodes->getOperand(I)); + DIArray Subprograms = CU.getSubprograms(); + for (unsigned J = 0, E2 = Subprograms.getNumElements(); J != E2; ++J) { + DISubprogram Subprogram(Subprograms.getElement(J)); + if (Subprogram.describes(&F)) + return Subprogram.getLineNumber(); + } + } + } + + F.getContext().diagnose(DiagnosticInfoSampleProfile( + "No debug information found in function " + F.getName())); + return 0; } -char SampleProfileLoader::ID = 0; -INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader", - false, false) +/// \brief Generate branch weight metadata for all branches in \p F. +/// +/// Branch weights are computed out of instruction samples using a +/// propagation heuristic. Propagation proceeds in 3 phases: +/// +/// 1- Assignment of block weights. All the basic blocks in the function +/// are initial assigned the same weight as their most frequently +/// executed instruction. +/// +/// 2- Creation of equivalence classes. Since samples may be missing from +/// blocks, we can fill in the gaps by setting the weights of all the +/// blocks in the same equivalence class to the same weight. To compute +/// the concept of equivalence, we use dominance and loop information. +/// Two blocks B1 and B2 are in the same equivalence class if B1 +/// dominates B2, B2 post-dominates B1 and both are in the same loop. +/// +/// 3- Propagation of block weights into edges. This uses a simple +/// propagation heuristic. The following rules are applied to every +/// block B in the CFG: +/// +/// - If B has a single predecessor/successor, then the weight +/// of that edge is the weight of the block. +/// +/// - If all the edges are known except one, and the weight of the +/// block is already known, the weight of the unknown edge will +/// be the weight of the block minus the sum of all the known +/// edges. If the sum of all the known edges is larger than B's weight, +/// we set the unknown edge weight to zero. +/// +/// - If there is a self-referential edge, and the weight of the block is +/// known, the weight for that edge is set to the weight of the block +/// minus the weight of the other incoming edges to that block (if +/// known). +/// +/// Since this propagation is not guaranteed to finalize for every CFG, we +/// only allow it to proceed for a limited number of iterations (controlled +/// by -sample-profile-max-propagate-iterations). +/// +/// FIXME: Try to replace this propagation heuristic with a scheme +/// that is guaranteed to finalize. A work-list approach similar to +/// the standard value propagation algorithm used by SSA-CCP might +/// work here. 
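To make these rules concrete, consider a hypothetical diamond CFG where entry branches to then and else and both fall through to exit, with sampled block weights entry = 100 and then = 60 and no samples for else (all numbers invented; the order in which propagateThroughEdges actually discovers them may differ):

    weight[entry->then] = 60             (then has a single predecessor)
    weight[entry->else] = 100 - 60 = 40  (entry's weight and its other outgoing edge are known)
    weight[else]        = 40             (every incoming edge of else is now known)
    weight[else->exit]  = 40             (else has a single successor)
    weight[then->exit]  = 60             (then has a single successor)
    weight[exit]        = 60 + 40 = 100  (every incoming edge of exit is known)

At that point the conditional branch in entry gets MD_prof weights of 60 and 40.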
+/// +/// Once all the branch weights are computed, we emit the MD_prof +/// metadata on B using the computed values for each of its branches. +/// +/// \param F The function to query. +/// +/// \returns true if \p F was modified. Returns false, otherwise. +bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree, + PostDominatorTree *PostDomTree, + LoopInfo *Loops) { + bool Changed = false; -bool SampleProfileLoader::runOnFunction(Function &F) { - return Profiler->emitAnnotations(F); + // Initialize invariants used during computation and propagation. + HeaderLineno = getFunctionLoc(F); + if (HeaderLineno == 0) + return false; + + DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() + << ": " << HeaderLineno << "\n"); + DT = DomTree; + PDT = PostDomTree; + LI = Loops; + Ctx = &F.getParent()->getContext(); + + // Compute basic block weights. + Changed |= computeBlockWeights(F); + + if (Changed) { + // Find equivalence classes. + findEquivalenceClasses(F); + + // Propagate weights to all edges. + propagateWeights(F); + } + + return Changed; } +char SampleProfileLoader::ID = 0; +INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", + "Sample Profile loader", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) +INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", + "Sample Profile loader", false, false) + bool SampleProfileLoader::doInitialization(Module &M) { - Profiler.reset(new SampleProfile(Filename)); - Profiler->loadText(); + Profiler.reset(new SampleModuleProfile(M, Filename)); + ProfileIsValid = Profiler->loadText(); return true; } @@ -477,3 +1087,15 @@ FunctionPass *llvm::createSampleProfileLoaderPass() { FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { return new SampleProfileLoader(Name); } + +bool SampleProfileLoader::runOnFunction(Function &F) { + if (!ProfileIsValid) + return false; + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>(); + LoopInfo *LI = &getAnalysis<LoopInfo>(); + SampleFunctionProfile &FunctionProfile = Profiler->getProfile(F); + if (!FunctionProfile.empty()) + return FunctionProfile.emitAnnotations(F, DT, PDT, LI); + return false; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 857597e47462..de724d419a48 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -17,8 +17,8 @@ #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/Verifier.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/PassManager.h" @@ -29,11 +29,12 @@ using namespace llvm; void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); initializeSampleProfileLoaderPass(Registry); - initializeCodeGenPreparePass(Registry); + initializeConstantHoistingPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); initializeDCEPass(Registry); initializeDeadInstEliminationPass(Registry); + initializeScalarizerPass(Registry); initializeDSEPass(Registry); initializeGVNPass(Registry); initializeEarlyCSEPass(Registry); @@ -51,6 +52,7 @@ void 
llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); + initializeMergedLoadStoreMotionPass(Registry); initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); @@ -63,6 +65,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); + initializeSeparateConstOffsetFromGEPPass(Registry); + initializeLoadCombinePass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -81,10 +85,18 @@ void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createDeadStoreEliminationPass()); } +void LLVMAddScalarizerPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createScalarizerPass()); +} + void LLVMAddGVNPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createGVNPass()); } +void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createMergedLoadStoreMotionPass()); +} + void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createIndVarSimplifyPass()); } @@ -176,6 +188,7 @@ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) { void LLVMAddVerifierPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createVerifierPass()); + // FIXME: should this also add createDebugInfoVerifierPass()? } void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 57b290e14b13..e2a24a7fd4a7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -19,20 +19,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -41,10 +42,8 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" @@ -52,6 +51,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "scalarrepl" + STATISTIC(NumReplaced, "Number of allocas broken up"); STATISTIC(NumPromoted, "Number of allocas promoted"); STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); @@ -80,14 +81,14 @@ namespace { ScalarLoadThreshold = SLT; } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; bool performScalarRepl(Function &F); bool 
performPromotion(Function &F); private: bool HasDomTree; - DataLayout *TD; + const DataLayout *DL; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. @@ -195,8 +196,8 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } }; @@ -212,7 +213,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } }; @@ -224,7 +225,7 @@ char SROA_SSAUp::ID = 0; INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) @@ -258,7 +259,7 @@ namespace { class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; - const DataLayout &TD; + const DataLayout &DL; unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object @@ -301,10 +302,10 @@ class ConvertToScalarInfo { bool HadDynamicAccess; public: - explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td, + explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL, unsigned SLT) - : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), - ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), + : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false), + ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false), HadDynamicAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); @@ -332,8 +333,8 @@ private: AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // If we can't convert this scalar, or if mem2reg can trivially do it, bail // out. - if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial) - return 0; + if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial) + return nullptr; // If an alloca has only memset / memcpy uses, it may still have an Unknown // ScalarKind. Treat it as an Integer below. @@ -361,23 +362,24 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // Do not convert to scalar integer if the alloca size exceeds the // scalar load threshold. if (BitWidth > ScalarLoadThreshold) - return 0; + return nullptr; if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && - !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth)) - return 0; + !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth)) + return nullptr; // Dynamic accesses on integers aren't yet supported. They need us to shift // by a dynamic amount which could be difficult to work out as we might not // know whether to use a left or right shift. if (ScalarKind == Integer && HadDynamicAccess) - return 0; + return nullptr; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); // Create and insert the integer alloca. 
NewTy = IntegerType::get(AI->getContext(), BitWidth); } - AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0, 0); + AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", + AI->getParent()->begin()); + ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } @@ -466,10 +468,10 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, /// SawVec flag. bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (User *U : V->users()) { + Instruction *UI = cast<Instruction>(U); - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + if (LoadInst *LI = dyn_cast<LoadInst>(UI)) { // Don't break volatile loads. if (!LI->isSimple()) return false; @@ -481,7 +483,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, continue; } - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { // Storing the pointer, not into the value? if (SI->getOperand(0) == V || !SI->isSimple()) return false; // Don't touch MMX operations. @@ -492,7 +494,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, continue; } - if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(UI)) { if (!onlyUsedByLifetimeMarkers(BCI)) IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) @@ -500,7 +502,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, continue; } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UI)) { // If this is a GEP with a variable indices, we can't handle it. PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType()); if (!PtrTy) @@ -508,7 +510,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - Value *GEPNonConstantIdx = 0; + Value *GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { if (!isa<VectorType>(PtrTy->getElementType())) return false; @@ -520,7 +522,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, HadDynamicAccess = true; } else GEPNonConstantIdx = NonConstantIdx; - uint64_t GEPOffset = TD.getIndexedOffset(PtrTy, + uint64_t GEPOffset = DL.getIndexedOffset(PtrTy, Indices); // See if all uses can be converted. if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) @@ -532,7 +534,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, // If this is a constant sized memset of a constant value (e.g. 0) we can // handle it. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + if (MemSetInst *MSI = dyn_cast<MemSetInst>(UI)) { // Store to dynamic index. if (NonConstantIdx) return false; @@ -559,12 +561,12 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, // If this is a memcpy or memmove into or out of the whole allocation, we // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(UI)) { // Store to dynamic index. 
if (NonConstantIdx) return false; ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); - if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) + if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0) return false; IsNotTrivial = true; // Can't be mem2reg'd. @@ -572,7 +574,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, } // If this is a lifetime intrinsic, we can handle it. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UI)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { continue; @@ -597,7 +599,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, Value* NonConstantIdx) { while (!Ptr->use_empty()) { - Instruction *User = cast<Instruction>(Ptr->use_back()); + Instruction *User = cast<Instruction>(Ptr->user_back()); if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); @@ -608,14 +610,14 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - Value* GEPNonConstantIdx = 0; + Value* GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { assert(!NonConstantIdx && "Dynamic GEP reading from dynamic GEP unsupported"); GEPNonConstantIdx = Indices.pop_back_val(); } else GEPNonConstantIdx = NonConstantIdx; - uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(), Indices); ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx); GEP->eraseFromParent(); @@ -671,7 +673,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), - Old, Offset, 0, Builder); + Old, Offset, nullptr, Builder); Builder.CreateStore(New, NewAI); // If the load we just inserted is now dead, then the memset overwrote @@ -692,9 +694,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0)); + AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &DL, 0)); - if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) { + if (GetUnderlyingObject(MTI->getSource(), &DL, 0) != OrigAI) { // Dest must be OrigAI, change this to be a load from the original // pointer (bitcasted), then a store to our new alloca. assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); @@ -710,7 +712,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); SrcVal->setAlignment(MTI->getAlignment()); Builder.CreateStore(SrcVal, NewAI); - } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) { + } else if (GetUnderlyingObject(MTI->getDest(), &DL, 0) != OrigAI) { // Src must be OrigAI, change this to be a load from NewAI then a store // through the original dest pointer (bitcasted). 
assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); @@ -770,15 +772,15 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // If the result alloca is a vector type, this is either an element // access or a bitcast to another vector type of the same size. if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { - unsigned FromTypeSize = TD.getTypeAllocSize(FromType); - unsigned ToTypeSize = TD.getTypeAllocSize(ToType); + unsigned FromTypeSize = DL.getTypeAllocSize(FromType); + unsigned ToTypeSize = DL.getTypeAllocSize(ToType); if (FromTypeSize == ToTypeSize) return Builder.CreateBitCast(FromVal, ToType); // Otherwise it must be an element access. unsigned Elt = 0; if (Offset) { - unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType()); Elt = Offset/EltSize; assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } @@ -804,12 +806,12 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, if (StructType *ST = dyn_cast<StructType>(ToType)) { assert(!NonConstantIdx && "Dynamic indexing into struct types not supported"); - const StructLayout &Layout = *TD.getStructLayout(ST); + const StructLayout &Layout = *DL.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), - 0, Builder); + nullptr, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -818,11 +820,12 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { assert(!NonConstantIdx && "Dynamic indexing into array types not supported"); - uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, 0, Builder); + Offset+i*EltSize, nullptr, + Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -834,12 +837,12 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // If this is a big-endian system and the load is narrower than the // full alloca type, we need to do a shift to get the right bits. int ShAmt = 0; - if (TD.isBigEndian()) { + if (DL.isBigEndian()) { // On big-endian machines, the lowest bit is stored at the bit offset // from the pointer given by getTypeStoreSizeInBits. This matters for // integers with a bitwidth that is not a multiple of 8. - ShAmt = TD.getTypeStoreSizeInBits(NTy) - - TD.getTypeStoreSizeInBits(ToType) - Offset; + ShAmt = DL.getTypeStoreSizeInBits(NTy) - + DL.getTypeStoreSizeInBits(ToType) - Offset; } else { ShAmt = Offset; } @@ -855,7 +858,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, ConstantInt::get(FromVal->getType(), -ShAmt)); // Finally, unconditionally truncate the integer to the right width. 
- unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); + unsigned LIBitWidth = DL.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), @@ -902,8 +905,8 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, LLVMContext &Context = Old->getContext(); if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { - uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); - uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); + uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy); + uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType()); // Changing the whole vector with memset or with an access of a different // vector type? @@ -914,7 +917,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, Type *EltTy = VTy->getElementType(); if (SV->getType() != EltTy) SV = Builder.CreateBitCast(SV, EltTy); - uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy); + uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; Value *Idx; if (NonConstantIdx) { @@ -933,12 +936,12 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (StructType *ST = dyn_cast<StructType>(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into struct types not supported"); - const StructLayout &Layout = *TD.getStructLayout(ST); + const StructLayout &Layout = *DL.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), - 0, Builder); + nullptr, Builder); } return Old; } @@ -946,24 +949,25 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into array types not supported"); - uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr, + Builder); } return Old; } // If SV is a float, convert it to the appropriate integer type. // If it is a pointer, do the same. - unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType()); - unsigned DestWidth = TD.getTypeSizeInBits(AllocaType); - unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); - unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); + unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType()); + unsigned DestWidth = DL.getTypeSizeInBits(AllocaType); + unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType()); + unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType())); + SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { @@ -982,7 +986,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // If this is a big-endian system and the store is narrower than the // full alloca type, we need to do a shift to get the right bits. 
int ShAmt = 0; - if (TD.isBigEndian()) { + if (DL.isBigEndian()) { // On big-endian machines, the lowest bit is stored at the bit offset // from the pointer given by getTypeStoreSizeInBits. This matters for // integers with a bitwidth that is not a multiple of 8. @@ -1020,7 +1024,11 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, bool SROA::runOnFunction(Function &F) { - TD = getAnalysisIfAvailable<DataLayout>(); + if (skipOptnoneFunction(F)) + return false; + + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; bool Changed = performPromotion(F); @@ -1028,7 +1036,7 @@ bool SROA::runOnFunction(Function &F) { // theoretically needs to. It should be refactored in order to support // target-independent IR. Until this is done, just skip the actual // scalar-replacement portion of this pass. - if (!TD) return Changed; + if (!DL) return Changed; while (1) { bool LocalChange = performScalarRepl(F); @@ -1050,17 +1058,16 @@ class AllocaPromoter : public LoadAndStorePromoter { public: AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, DIBuilder *DB) - : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {} + : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {} void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). this->AI = AI; if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) { - for (Value::use_iterator UI = DebugNode->use_begin(), - E = DebugNode->use_end(); UI != E; ++UI) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) DVIs.push_back(DVI); } @@ -1078,14 +1085,14 @@ public: } } - virtual bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &Insts) const { + bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const override { if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getOperand(0) == AI; return cast<StoreInst>(I)->getPointerOperand() == AI; } - virtual void updateDebugInfo(Instruction *Inst) const { + void updateDebugInfo(Instruction *Inst) const override { for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; @@ -1097,7 +1104,7 @@ public: for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = NULL; + Value *Arg = nullptr; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -1134,22 +1141,21 @@ public: /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. 
-static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) { - bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); - bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); +static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { + bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL); + bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL); - for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; + for (User *U : SI->users()) { + LoadInst *LI = dyn_cast<LoadInst>(U); + if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, - LI->getAlignment(), TD)) + LI->getAlignment(), DL)) return false; if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI, - LI->getAlignment(), TD)) + LI->getAlignment(), DL)) return false; } @@ -1172,17 +1178,16 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) { /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) { +static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. // TODO: Allow stores. BasicBlock *BB = PN->getParent(); unsigned MaxAlign = 0; - for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; + for (User *U : PN->users()) { + LoadInst *LI = dyn_cast<LoadInst>(U); + if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. @@ -1221,8 +1226,8 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) { // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. - if (InVal->isDereferenceablePointer() || - isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD)) + if (InVal->isDereferenceablePointer(DL) || + isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL)) continue; return false; @@ -1236,13 +1241,10 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) { /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. 
-static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; - - for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE; ++UI) { - User *U = *UI; + for (User *U : AI->users()) { if (LoadInst *LI = dyn_cast<LoadInst>(U)) { if (!LI->isSimple()) return false; @@ -1265,12 +1267,12 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // This is very rare and we just scrambled the use list of AI, start // over completely. - return tryToMakeAllocaBePromotable(AI, TD); + return tryToMakeAllocaBePromotable(AI, DL); } // If it is safe to turn "load (select c, AI, ptr)" into a select of two // loads, then we can transform this by rewriting the select. - if (!isSafeSelectToSpeculate(SI, TD)) + if (!isSafeSelectToSpeculate(SI, DL)) return false; InstsToRewrite.insert(SI); @@ -1285,7 +1287,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. - if (!isSafePHIToSpeculate(PN, TD)) + if (!isSafePHIToSpeculate(PN, DL)) return false; InstsToRewrite.insert(PN); @@ -1312,12 +1314,9 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { // This could only be a bitcast used by nothing but lifetime intrinsics. - for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end(); - I != E;) { - Use &U = I.getUse(); - ++I; - cast<Instruction>(U.getUser())->eraseFromParent(); - } + for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end(); + I != E;) + cast<Instruction>(*I++)->eraseFromParent(); BCI->eraseFromParent(); continue; } @@ -1326,7 +1325,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. while (!SI->use_empty()) { - LoadInst *LI = cast<LoadInst>(SI->use_back()); + LoadInst *LI = cast<LoadInst>(SI->user_back()); IRBuilder<> Builder(LI); LoadInst *TrueLoad = @@ -1367,13 +1366,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // Get the TBAA tag and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ, it doesn't matter. - LoadInst *SomeLoad = cast<LoadInst>(PN->use_back()); + LoadInst *SomeLoad = cast<LoadInst>(PN->user_back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. while (!PN->use_empty()) { - LoadInst *LI = cast<LoadInst>(PN->use_back()); + LoadInst *LI = cast<LoadInst>(PN->user_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } @@ -1385,7 +1384,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; - if (Load == 0) { + if (!Load) { Load = new LoadInst(PN->getIncomingValue(i), PN->getName() + "." 
+ Pred->getName(), Pred->getTerminator()); @@ -1405,9 +1404,9 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { bool SROA::performPromotion(Function &F) { std::vector<AllocaInst*> Allocas; - DominatorTree *DT = 0; + DominatorTree *DT = nullptr; if (HasDomTree) - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function DIBuilder DIB(*F.getParent()); @@ -1420,7 +1419,7 @@ bool SROA::performPromotion(Function &F) { // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? - if (tryToMakeAllocaBePromotable(AI, TD)) + if (tryToMakeAllocaBePromotable(AI, DL)) Allocas.push_back(AI); if (Allocas.empty()) break; @@ -1433,9 +1432,8 @@ bool SROA::performPromotion(Function &F) { AllocaInst *AI = Allocas[i]; // Build list of instructions to promote. - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); - UI != E; ++UI) - Insts.push_back(cast<Instruction>(*UI)); + for (User *U : AI->users()) + Insts.push_back(cast<Instruction>(U)); AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts); Insts.clear(); } @@ -1496,7 +1494,7 @@ bool SROA::performScalarRepl(Function &F) { // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar // value cannot be decomposed at all. - uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType()); + uint64_t AllocaSize = DL->getTypeAllocSize(AI->getAllocatedType()); // Do not promote [0 x %struct]. if (AllocaSize == 0) continue; @@ -1520,7 +1518,7 @@ bool SROA::performScalarRepl(Function &F) { // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. if (AllocaInst *NewAI = ConvertToScalarInfo( - (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) { + (unsigned)AllocaSize, *DL, ScalarLoadThreshold).TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; @@ -1543,7 +1541,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, + AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); @@ -1554,7 +1552,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, ElementAllocas.reserve(AT->getNumElements()); Type *ElTy = AT->getElementType(); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(), + AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); WorkList.push_back(NA); // Add to worklist for recursive processing @@ -1583,7 +1581,7 @@ void SROA::DeleteDeadInstructions() { // Zero out the operand and see if it becomes trivially dead. // (But, don't add allocas to the dead instruction list -- they are // already on the worklist and will be deleted separately.) 
- *OI = 0; + *OI = nullptr; if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U)) DeadInsts.push_back(U); } @@ -1598,8 +1596,8 @@ void SROA::DeleteDeadInstructions() { /// referenced by this instruction. void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { isSafeForScalarRepl(BC, Offset, Info); @@ -1610,19 +1608,17 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, isSafeForScalarRepl(GEPI, GEPOffset, Info); } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); - if (Length == 0) - return MarkUnsafe(Info, User); - if (Length->isNegative()) + if (!Length || Length->isNegative()) return MarkUnsafe(Info, User); - isSafeMemAccess(Offset, Length->getZExtValue(), 0, - UI.getOperandNo() == 0, Info, MI, + isSafeMemAccess(Offset, Length->getZExtValue(), nullptr, + U.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; @@ -1632,7 +1628,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { @@ -1665,39 +1661,39 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (!Info.CheckedPHIs.insert(PN)) return; - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (User *U : I->users()) { + Instruction *UI = cast<Instruction>(U); - if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { + if (BitCastInst *BC = dyn_cast<BitCastInst>(UI)) { isSafePHISelectUseForScalarRepl(BC, Offset, Info); - } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) { // Only allow "bitcast" GEPs for simplicity. We could generalize this, // but would have to prove that we're staying inside of an element being // promoted. 
if (!GEPI->hasAllZeroIndices()) - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); - } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) { if (!LI->isSimple()) - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { - isSafePHISelectUseForScalarRepl(User, Offset, Info); + } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) { + isSafePHISelectUseForScalarRepl(UI, Offset, Info); } else { - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); } if (Info.isUnsafe) return; } @@ -1731,12 +1727,12 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - // If this GEP is non constant then the last operand must have been a + // If this GEP is non-constant then the last operand must have been a // dynamic index into a vector. Pop this now as it has no impact on the // constant part of the offset. if (NonConstant) Indices.pop_back(); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); + Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize)) MarkUnsafe(Info, GEPI); @@ -1750,12 +1746,12 @@ static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, Type *&EltTy) { if (ArrayType *AT = dyn_cast<ArrayType>(T)) { NumElts = AT->getNumElements(); - EltTy = (NumElts == 0 ? 0 : AT->getElementType()); + EltTy = (NumElts == 0 ? nullptr : AT->getElementType()); return true; } if (StructType *ST = dyn_cast<StructType>(T)) { NumElts = ST->getNumContainedTypes(); - EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0)); + EltTy = (NumElts == 0 ? nullptr : ST->getContainedType(0)); for (unsigned n = 1; n < NumElts; ++n) { if (ST->getContainedType(n) != EltTy) return false; @@ -1795,7 +1791,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, bool AllowWholeAccess) { // Check if this is a load/store of the entire alloca. if (Offset == 0 && AllowWholeAccess && - MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) { + MemSize == DL->getTypeAllocSize(Info.AI->getAllocatedType())) { // This can be safe for MemIntrinsics (where MemOpType is 0) and integer // loads/stores (which are essentially the same as the MemIntrinsics with // regard to copying padding between elements). 
But, if an alloca is @@ -1832,20 +1828,20 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { Type *EltTy; uint64_t EltSize; if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = TD->getStructLayout(ST); + const StructLayout *Layout = DL->getStructLayout(ST); unsigned EltIdx = Layout->getElementContainingOffset(Offset); EltTy = ST->getContainedType(EltIdx); - EltSize = TD->getTypeAllocSize(EltTy); + EltSize = DL->getTypeAllocSize(EltTy); Offset -= Layout->getElementOffset(EltIdx); } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { EltTy = AT->getElementType(); - EltSize = TD->getTypeAllocSize(EltTy); + EltSize = DL->getTypeAllocSize(EltTy); if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; } else if (VectorType *VT = dyn_cast<VectorType>(T)) { EltTy = VT->getElementType(); - EltSize = TD->getTypeAllocSize(EltTy); + EltSize = DL->getTypeAllocSize(EltTy); if (Offset >= VT->getNumElements() * EltSize) return false; Offset %= EltSize; @@ -1867,8 +1863,8 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVectorImpl<AllocaInst *> &NewElts) { for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI++); + Use &TheUse = *UI++; + Instruction *User = cast<Instruction>(TheUse.getUser()); if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { RewriteBitCast(BC, AI, Offset, NewElts); @@ -1884,7 +1880,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); if (Offset == 0 && - MemSize == TD->getTypeAllocSize(AI->getAllocatedType())) + MemSize == DL->getTypeAllocSize(AI->getAllocatedType())) RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); // Otherwise the intrinsic can only touch a single element and the // address operand will be updated, so nothing else needs to be done. @@ -1920,8 +1916,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, LI->replaceAllUsesWith(Insert); DeadInsts.push_back(LI); } else if (LIType->isIntegerTy() && - TD->getTypeAllocSize(LIType) == - TD->getTypeAllocSize(AI->getAllocatedType())) { + DL->getTypeAllocSize(LIType) == + DL->getTypeAllocSize(AI->getAllocatedType())) { // If this is a load of the entire alloca to an integer, rewrite it. RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } @@ -1947,8 +1943,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } DeadInsts.push_back(SI); } else if (SIType->isIntegerTy() && - TD->getTypeAllocSize(SIType) == - TD->getTypeAllocSize(AI->getAllocatedType())) { + DL->getTypeAllocSize(SIType) == + DL->getTypeAllocSize(AI->getAllocatedType())) { // If this is a store of the entire alloca from an integer, rewrite it. 
RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } @@ -2010,7 +2006,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy) { uint64_t Idx = 0; if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = TD->getStructLayout(ST); + const StructLayout *Layout = DL->getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); Offset -= Layout->getElementOffset(Idx); @@ -2018,7 +2014,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, return Idx; } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { T = AT->getElementType(); - uint64_t EltSize = TD->getTypeAllocSize(T); + uint64_t EltSize = DL->getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2026,7 +2022,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, } VectorType *VT = cast<VectorType>(T); T = VT->getElementType(); - uint64_t EltSize = TD->getTypeAllocSize(T); + uint64_t EltSize = DL->getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2044,10 +2040,10 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, // In this case, it must be the last GEP operand which is dynamic so keep that // aside until we've found the constant GEP offset then add it back in at the // end. - Value* NonConstantIdx = 0; + Value* NonConstantIdx = nullptr; if (!GEPI->hasAllConstantIndices()) NonConstantIdx = Indices.pop_back_val(); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); + Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); @@ -2114,11 +2110,12 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, if (NewOffset) { // Splice the first element and index 'NewOffset' bytes in. SROA will // split the alloca again later. - Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + unsigned AS = AI->getType()->getAddressSpace(); + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS)); V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset; + uint64_t EltSize = DL->getTypeAllocSize(IdxTy) - NewOffset; if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2134,7 +2131,7 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, for (; Idx != NewElts.size() && Size; ++Idx) { IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = TD->getTypeAllocSize(IdxTy); + uint64_t EltSize = DL->getTypeAllocSize(IdxTy); if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2161,7 +2158,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For // memset, this Value* stays null. - Value *OtherPtr = 0; + Value *OtherPtr = nullptr; unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy if (Inst == MTI->getRawDest()) @@ -2213,7 +2210,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address. 
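FindElementAndOffset above maps a byte offset into an aggregate to an element index plus the offset that remains inside that element; for the homogeneous array and vector cases this is plain division and remainder. A small self-contained sketch, assuming a hypothetical 4-byte element type:

#include <cassert>
#include <cstdint>

// The array/vector case of FindElementAndOffset: split a byte offset into an
// element index and the residual offset within that element.
static uint64_t splitOffset(uint64_t &Offset, uint64_t EltSize) {
  uint64_t Idx = Offset / EltSize;
  Offset -= Idx * EltSize;
  return Idx;
}

int main() {
  uint64_t Off = 10;                   // byte 10 of an [8 x i32]-style aggregate
  uint64_t Idx = splitOffset(Off, 4);  // 4-byte elements
  assert(Idx == 2 && Off == 2);        // element 2, byte 2 within it
  return 0;
}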
- Value *OtherElt = 0; + Value *OtherElt = nullptr; unsigned OtherEltAlign = MemAlignment; if (OtherPtr) { @@ -2226,10 +2223,10 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); Type *OtherTy = OtherPtrTy->getElementType(); if (StructType *ST = dyn_cast<StructType>(OtherTy)) { - EltOffset = TD->getStructLayout(ST)->getElementOffset(i); + EltOffset = DL->getStructLayout(ST)->getElementOffset(i); } else { Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); - EltOffset = TD->getTypeAllocSize(EltTy)*i; + EltOffset = DL->getTypeAllocSize(EltTy)*i; } // The alignment of the other pointer is the guaranteed alignment of the @@ -2270,7 +2267,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. - unsigned EltSize = TD->getTypeSizeInBits(ValTy); + unsigned EltSize = DL->getTypeSizeInBits(ValTy); APInt OneVal(EltSize, CI->getZExtValue()); APInt TotalVal(OneVal); // Set each byte. @@ -2300,7 +2297,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // this element. } - unsigned EltSize = TD->getTypeAllocSize(EltTy); + unsigned EltSize = DL->getTypeAllocSize(EltTy); if (!EltSize) continue; @@ -2334,12 +2331,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); // Handle tail padding by extending the operand - if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) + if (DL->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, IntegerType::get(SI->getContext(), AllocaSizeBits)); @@ -2349,15 +2346,15 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - const StructLayout *Layout = TD->getStructLayout(EltSTy); + const StructLayout *Layout = DL->getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); - if (TD->isBigEndian()) - Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy); + if (DL->isBigEndian()) + Shift = AllocaSizeBits-Shift-DL->getTypeAllocSizeInBits(FieldTy); Value *EltVal = SrcVal; if (Shift) { @@ -2366,7 +2363,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } // Truncate down to an integer of the right size. - uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. 
if (FieldSizeBits == 0) continue; @@ -2391,12 +2388,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } else { ArrayType *ATy = cast<ArrayType>(AllocaEltTy); Type *ArrayEltTy = ATy->getElementType(); - uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); - uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy); + uint64_t ElementOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); + uint64_t ElementSizeBits = DL->getTypeSizeInBits(ArrayEltTy); uint64_t Shift; - if (TD->isBigEndian()) + if (DL->isBigEndian()) Shift = AllocaSizeBits-ElementOffset; else Shift = 0; @@ -2430,7 +2427,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } new StoreInst(EltVal, DestField, SI); - if (TD->isBigEndian()) + if (DL->isBigEndian()) Shift -= ElementOffset; else Shift += ElementOffset; @@ -2448,20 +2445,20 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // Extract each element out of the NewElts according to its structure offset // and form the result value. Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. - const StructLayout *Layout = 0; + const StructLayout *Layout = nullptr; uint64_t ArrayEltBitOffset = 0; if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - Layout = TD->getStructLayout(EltSTy); + Layout = DL->getStructLayout(EltSTy); } else { Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); - ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); + ArrayEltBitOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); } Value *ResultVal = @@ -2473,7 +2470,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, Value *SrcField = NewElts[i]; Type *FieldTy = cast<PointerType>(SrcField->getType())->getElementType(); - uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; @@ -2504,7 +2501,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, else // Array case. Shift = i*ArrayEltBitOffset; - if (TD->isBigEndian()) + if (DL->isBigEndian()) Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); if (Shift) { @@ -2521,7 +2518,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, } // Handle tail padding by truncating the result - if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits) + if (DL->getTypeSizeInBits(LI->getType()) != AllocaSizeBits) ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); LI->replaceAllUsesWith(ResultVal); @@ -2531,15 +2528,15 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. 
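HasPadding, defined next, implements what that doc comment describes: walk the struct layout and report any gap between consecutive fields or after the last one. A standalone sketch of the same test over a hand-written layout, assuming a typical target where i32 is 4-byte aligned:

#include <cassert>
#include <cstdint>
#include <vector>

// A struct has padding if some field ends before the next one begins, or
// before the end of the struct itself. Offsets and sizes are in bits.
struct FieldBits { uint64_t Offset, Size; };

static bool hasPadding(const std::vector<FieldBits> &Fields,
                       uint64_t StructSizeInBits) {
  uint64_t PrevEnd = 0;
  for (unsigned i = 0; i < Fields.size(); ++i) {
    if (i && PrevEnd < Fields[i].Offset)
      return true;                               // gap between fields
    PrevEnd = Fields[i].Offset + Fields[i].Size;
  }
  return !Fields.empty() && PrevEnd < StructSizeInBits;  // tail padding
}

int main() {
  std::vector<FieldBits> I8I32  = {{0, 8}, {32, 32}};   // { i8, i32 }, 64 bits
  std::vector<FieldBits> I32I32 = {{0, 32}, {32, 32}};  // { i32, i32 }, 64 bits
  assert(hasPadding(I8I32, 64));    // 24 bits of padding after the i8
  assert(!hasPadding(I32I32, 64));  // tightly packed
  return 0;
}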
-static bool HasPadding(Type *Ty, const DataLayout &TD) { +static bool HasPadding(Type *Ty, const DataLayout &DL) { if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { Ty = ATy->getElementType(); - return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); + return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty); } // SROA currently handles only Arrays and Structs. StructType *STy = cast<StructType>(Ty); - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); unsigned PrevFieldBitOffset = 0; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { unsigned FieldBitOffset = SL->getElementOffsetInBits(i); @@ -2548,7 +2545,7 @@ static bool HasPadding(Type *Ty, const DataLayout &TD) { // previous one. if (i) { unsigned PrevFieldEnd = - PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1)); + PrevFieldBitOffset+DL.getTypeSizeInBits(STy->getElementType(i-1)); if (PrevFieldEnd < FieldBitOffset) return true; } @@ -2557,7 +2554,7 @@ static bool HasPadding(Type *Ty, const DataLayout &TD) { // Check for tail padding. if (unsigned EltCount = STy->getNumElements()) { unsigned PrevFieldEnd = PrevFieldBitOffset + - TD.getTypeSizeInBits(STy->getElementType(EltCount-1)); + DL.getTypeSizeInBits(STy->getElementType(EltCount-1)); if (PrevFieldEnd < SL->getSizeInBits()) return true; } @@ -2584,7 +2581,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // types, but may actually be used. In these cases, we refuse to promote the // struct. if (Info.isMemCpySrc && Info.isMemCpyDst && - HasPadding(AI->getAllocatedType(), *TD)) + HasPadding(AI->getAllocatedType(), *DL)) return false; // If the alloca never has an access to just *part* of it, but is accessed diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp new file mode 100644 index 000000000000..7a73f113b1d9 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -0,0 +1,663 @@ +//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts vector operations into scalar operations, in order +// to expose optimization opportunities on the individual scalar operations. +// It is mainly intended for targets that do not have vector units, but it +// may also be useful for revectorizing code to different vector widths. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "scalarizer" + +namespace { +// Used to store the scattered form of a vector. +typedef SmallVector<Value *, 8> ValueVector; + +// Used to map a vector Value to its scattered form. We use std::map +// because we want iterators to persist across insertion and because the +// values are relatively large. +typedef std::map<Value *, ValueVector> ScatterMap; + +// Lists Instructions that have been replaced with scalar implementations, +// along with a pointer to their scattered forms. 
+typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList; + +// Provides a very limited vector-like interface for lazily accessing one +// component of a scattered vector or vector pointer. +class Scatterer { +public: + Scatterer() {} + + // Scatter V into Size components. If new instructions are needed, + // insert them before BBI in BB. If Cache is nonnull, use it to cache + // the results. + Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, + ValueVector *cachePtr = nullptr); + + // Return component I, creating a new Value for it if necessary. + Value *operator[](unsigned I); + + // Return the number of components. + unsigned size() const { return Size; } + +private: + BasicBlock *BB; + BasicBlock::iterator BBI; + Value *V; + ValueVector *CachePtr; + PointerType *PtrTy; + ValueVector Tmp; + unsigned Size; +}; + +// FCmpSpliiter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp +// called Name that compares X and Y in the same way as FCI. +struct FCmpSplitter { + FCmpSplitter(FCmpInst &fci) : FCI(fci) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, + const Twine &Name) const { + return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name); + } + FCmpInst &FCI; +}; + +// ICmpSpliiter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp +// called Name that compares X and Y in the same way as ICI. +struct ICmpSplitter { + ICmpSplitter(ICmpInst &ici) : ICI(ici) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, + const Twine &Name) const { + return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name); + } + ICmpInst &ICI; +}; + +// BinarySpliiter(BO)(Builder, X, Y, Name) uses Builder to create +// a binary operator like BO called Name with operands X and Y. +struct BinarySplitter { + BinarySplitter(BinaryOperator &bo) : BO(bo) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, + const Twine &Name) const { + return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name); + } + BinaryOperator &BO; +}; + +// Information about a load or store that we're scalarizing. +struct VectorLayout { + VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {} + + // Return the alignment of element I. + uint64_t getElemAlign(unsigned I) { + return MinAlign(VecAlign, I * ElemSize); + } + + // The type of the vector. + VectorType *VecTy; + + // The type of each element. + Type *ElemTy; + + // The alignment of the vector. + uint64_t VecAlign; + + // The size of each element. + uint64_t ElemSize; +}; + +class Scalarizer : public FunctionPass, + public InstVisitor<Scalarizer, bool> { +public: + static char ID; + + Scalarizer() : + FunctionPass(ID) { + initializeScalarizerPass(*PassRegistry::getPassRegistry()); + } + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + + // InstVisitor methods. They return true if the instruction was scalarized, + // false if nothing changed. 
+ bool visitInstruction(Instruction &) { return false; } + bool visitSelectInst(SelectInst &SI); + bool visitICmpInst(ICmpInst &); + bool visitFCmpInst(FCmpInst &); + bool visitBinaryOperator(BinaryOperator &); + bool visitGetElementPtrInst(GetElementPtrInst &); + bool visitCastInst(CastInst &); + bool visitBitCastInst(BitCastInst &); + bool visitShuffleVectorInst(ShuffleVectorInst &); + bool visitPHINode(PHINode &); + bool visitLoadInst(LoadInst &); + bool visitStoreInst(StoreInst &); + +private: + Scatterer scatter(Instruction *, Value *); + void gather(Instruction *, const ValueVector &); + bool canTransferMetadata(unsigned Kind); + void transferMetadata(Instruction *, const ValueVector &); + bool getVectorLayout(Type *, unsigned, VectorLayout &); + bool finish(); + + template<typename T> bool splitBinary(Instruction &, const T &); + + ScatterMap Scattered; + GatherList Gathered; + unsigned ParallelLoopAccessMDKind; + const DataLayout *DL; +}; + +char Scalarizer::ID = 0; +} // end anonymous namespace + +// This is disabled by default because having separate loads and stores makes +// it more likely that the -combiner-alias-analysis limits will be reached. +static cl::opt<bool> ScalarizeLoadStore + ("scalarize-load-store", cl::Hidden, cl::init(false), + cl::desc("Allow the scalarizer pass to scalarize loads and store")); + +INITIALIZE_PASS(Scalarizer, "scalarizer", "Scalarize vector operations", + false, false) + +Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, + ValueVector *cachePtr) + : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) { + Type *Ty = V->getType(); + PtrTy = dyn_cast<PointerType>(Ty); + if (PtrTy) + Ty = PtrTy->getElementType(); + Size = Ty->getVectorNumElements(); + if (!CachePtr) + Tmp.resize(Size, nullptr); + else if (CachePtr->empty()) + CachePtr->resize(Size, nullptr); + else + assert(Size == CachePtr->size() && "Inconsistent vector sizes"); +} + +// Return component I, creating a new Value for it if necessary. +Value *Scatterer::operator[](unsigned I) { + ValueVector &CV = (CachePtr ? *CachePtr : Tmp); + // Try to reuse a previous value. + if (CV[I]) + return CV[I]; + IRBuilder<> Builder(BB, BBI); + if (PtrTy) { + if (!CV[0]) { + Type *Ty = + PointerType::get(PtrTy->getElementType()->getVectorElementType(), + PtrTy->getAddressSpace()); + CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0"); + } + if (I != 0) + CV[I] = Builder.CreateConstGEP1_32(CV[0], I, + V->getName() + ".i" + Twine(I)); + } else { + // Search through a chain of InsertElementInsts looking for element I. + // Record other elements in the cache. The new V is still suitable + // for all uncached indices. + for (;;) { + InsertElementInst *Insert = dyn_cast<InsertElementInst>(V); + if (!Insert) + break; + ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2)); + if (!Idx) + break; + unsigned J = Idx->getZExtValue(); + CV[J] = Insert->getOperand(1); + V = Insert->getOperand(0); + if (I == J) + return CV[J]; + } + CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I), + V->getName() + ".i" + Twine(I)); + } + return CV[I]; +} + +bool Scalarizer::doInitialization(Module &M) { + ParallelLoopAccessMDKind = + M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); + return false; +} + +bool Scalarizer::runOnFunction(Function &F) { + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? 
&DLP->getDataLayout() : nullptr; + for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { + BasicBlock *BB = BBI; + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { + Instruction *I = II; + bool Done = visit(I); + ++II; + if (Done && I->getType()->isVoidTy()) + I->eraseFromParent(); + } + } + return finish(); +} + +// Return a scattered form of V that can be accessed by Point. V must be a +// vector or a pointer to a vector. +Scatterer Scalarizer::scatter(Instruction *Point, Value *V) { + if (Argument *VArg = dyn_cast<Argument>(V)) { + // Put the scattered form of arguments in the entry block, + // so that it can be used everywhere. + Function *F = VArg->getParent(); + BasicBlock *BB = &F->getEntryBlock(); + return Scatterer(BB, BB->begin(), V, &Scattered[V]); + } + if (Instruction *VOp = dyn_cast<Instruction>(V)) { + // Put the scattered form of an instruction directly after the + // instruction. + BasicBlock *BB = VOp->getParent(); + return Scatterer(BB, std::next(BasicBlock::iterator(VOp)), + V, &Scattered[V]); + } + // In the fallback case, just put the scattered before Point and + // keep the result local to Point. + return Scatterer(Point->getParent(), Point, V); +} + +// Replace Op with the gathered form of the components in CV. Defer the +// deletion of Op and creation of the gathered form to the end of the pass, +// so that we can avoid creating the gathered form if all uses of Op are +// replaced with uses of CV. +void Scalarizer::gather(Instruction *Op, const ValueVector &CV) { + // Since we're not deleting Op yet, stub out its operands, so that it + // doesn't make anything live unnecessarily. + for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) + Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType())); + + transferMetadata(Op, CV); + + // If we already have a scattered form of Op (created from ExtractElements + // of Op itself), replace them with the new form. + ValueVector &SV = Scattered[Op]; + if (!SV.empty()) { + for (unsigned I = 0, E = SV.size(); I != E; ++I) { + Instruction *Old = cast<Instruction>(SV[I]); + CV[I]->takeName(Old); + Old->replaceAllUsesWith(CV[I]); + Old->eraseFromParent(); + } + } + SV = CV; + Gathered.push_back(GatherList::value_type(Op, &SV)); +} + +// Return true if it is safe to transfer the given metadata tag from +// vector to scalar instructions. +bool Scalarizer::canTransferMetadata(unsigned Tag) { + return (Tag == LLVMContext::MD_tbaa + || Tag == LLVMContext::MD_fpmath + || Tag == LLVMContext::MD_tbaa_struct + || Tag == LLVMContext::MD_invariant_load + || Tag == ParallelLoopAccessMDKind); +} + +// Transfer metadata from Op to the instructions in CV if it is known +// to be safe to do so. +void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { + SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; + Op->getAllMetadataOtherThanDebugLoc(MDs); + for (unsigned I = 0, E = CV.size(); I != E; ++I) { + if (Instruction *New = dyn_cast<Instruction>(CV[I])) { + for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator + MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) + if (canTransferMetadata(MI->first)) + New->setMetadata(MI->first, MI->second); + New->setDebugLoc(Op->getDebugLoc()); + } + } +} + +// Try to fill in Layout from Ty, returning true on success. Alignment is +// the alignment of the vector, or 0 if the ABI default should be used. 
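getVectorLayout, defined next, only succeeds when each element's bit size equals its store size (so a vector of i1, for instance, is rejected), and the load/store visitors then take each element's alignment from VectorLayout::getElemAlign above. A standalone sketch of that alignment arithmetic, assuming MinAlign keeps the largest power of two that divides both of its inputs:

#include <cassert>
#include <cstdint>

// MinAlign-style helper: the largest power of two dividing both values,
// i.e. the lowest set bit of (A | B).
static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (~(A | B) + 1);
}

// Mirrors VectorLayout::getElemAlign: alignment of element I of a vector
// with alignment VecAlign and element size ElemSize (both in bytes).
static uint64_t elemAlign(uint64_t VecAlign, uint64_t ElemSize, unsigned I) {
  return minAlign(VecAlign, I * ElemSize);
}

int main() {
  // Hypothetical <4 x float> access aligned to 16 bytes, 4-byte elements.
  assert(elemAlign(16, 4, 0) == 16);
  assert(elemAlign(16, 4, 1) == 4);
  assert(elemAlign(16, 4, 2) == 8);
  assert(elemAlign(16, 4, 3) == 4);
  return 0;
}

The alignment can only shrink for the interior elements, which is why the scalarized loads and stores carry per-element alignments instead of reusing the vector's.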
+bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, + VectorLayout &Layout) { + if (!DL) + return false; + + // Make sure we're dealing with a vector. + Layout.VecTy = dyn_cast<VectorType>(Ty); + if (!Layout.VecTy) + return false; + + // Check that we're dealing with full-byte elements. + Layout.ElemTy = Layout.VecTy->getElementType(); + if (DL->getTypeSizeInBits(Layout.ElemTy) != + DL->getTypeStoreSizeInBits(Layout.ElemTy)) + return false; + + if (Alignment) + Layout.VecAlign = Alignment; + else + Layout.VecAlign = DL->getABITypeAlignment(Layout.VecTy); + Layout.ElemSize = DL->getTypeStoreSize(Layout.ElemTy); + return true; +} + +// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name) +// to create an instruction like I with operands X and Y and name Name. +template<typename Splitter> +bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { + VectorType *VT = dyn_cast<VectorType>(I.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(I.getParent(), &I); + Scatterer Op0 = scatter(&I, I.getOperand(0)); + Scatterer Op1 = scatter(&I, I.getOperand(1)); + assert(Op0.size() == NumElems && "Mismatched binary operation"); + assert(Op1.size() == NumElems && "Mismatched binary operation"); + ValueVector Res; + Res.resize(NumElems); + for (unsigned Elem = 0; Elem < NumElems; ++Elem) + Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem], + I.getName() + ".i" + Twine(Elem)); + gather(&I, Res); + return true; +} + +bool Scalarizer::visitSelectInst(SelectInst &SI) { + VectorType *VT = dyn_cast<VectorType>(SI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(SI.getParent(), &SI); + Scatterer Op1 = scatter(&SI, SI.getOperand(1)); + Scatterer Op2 = scatter(&SI, SI.getOperand(2)); + assert(Op1.size() == NumElems && "Mismatched select"); + assert(Op2.size() == NumElems && "Mismatched select"); + ValueVector Res; + Res.resize(NumElems); + + if (SI.getOperand(0)->getType()->isVectorTy()) { + Scatterer Op0 = scatter(&SI, SI.getOperand(0)); + assert(Op0.size() == NumElems && "Mismatched select"); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I], + SI.getName() + ".i" + Twine(I)); + } else { + Value *Op0 = SI.getOperand(0); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I], + SI.getName() + ".i" + Twine(I)); + } + gather(&SI, Res); + return true; +} + +bool Scalarizer::visitICmpInst(ICmpInst &ICI) { + return splitBinary(ICI, ICmpSplitter(ICI)); +} + +bool Scalarizer::visitFCmpInst(FCmpInst &FCI) { + return splitBinary(FCI, FCmpSplitter(FCI)); +} + +bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) { + return splitBinary(BO, BinarySplitter(BO)); +} + +bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { + VectorType *VT = dyn_cast<VectorType>(GEPI.getType()); + if (!VT) + return false; + + IRBuilder<> Builder(GEPI.getParent(), &GEPI); + unsigned NumElems = VT->getNumElements(); + unsigned NumIndices = GEPI.getNumIndices(); + + Scatterer Base = scatter(&GEPI, GEPI.getOperand(0)); + + SmallVector<Scatterer, 8> Ops; + Ops.resize(NumIndices); + for (unsigned I = 0; I < NumIndices; ++I) + Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1)); + + ValueVector Res; + Res.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) { + SmallVector<Value *, 8> Indices; + Indices.resize(NumIndices); + for (unsigned J = 0; J < NumIndices; ++J) + Indices[J] = Ops[J][I]; + 
Res[I] = Builder.CreateGEP(Base[I], Indices, + GEPI.getName() + ".i" + Twine(I)); + if (GEPI.isInBounds()) + if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I])) + NewGEPI->setIsInBounds(); + } + gather(&GEPI, Res); + return true; +} + +bool Scalarizer::visitCastInst(CastInst &CI) { + VectorType *VT = dyn_cast<VectorType>(CI.getDestTy()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(CI.getParent(), &CI); + Scatterer Op0 = scatter(&CI, CI.getOperand(0)); + assert(Op0.size() == NumElems && "Mismatched cast"); + ValueVector Res; + Res.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(), + CI.getName() + ".i" + Twine(I)); + gather(&CI, Res); + return true; +} + +bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { + VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy()); + VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy()); + if (!DstVT || !SrcVT) + return false; + + unsigned DstNumElems = DstVT->getNumElements(); + unsigned SrcNumElems = SrcVT->getNumElements(); + IRBuilder<> Builder(BCI.getParent(), &BCI); + Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); + ValueVector Res; + Res.resize(DstNumElems); + + if (DstNumElems == SrcNumElems) { + for (unsigned I = 0; I < DstNumElems; ++I) + Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(), + BCI.getName() + ".i" + Twine(I)); + } else if (DstNumElems > SrcNumElems) { + // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the + // individual elements to the destination. + unsigned FanOut = DstNumElems / SrcNumElems; + Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut); + unsigned ResI = 0; + for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) { + Value *V = Op0[Op0I]; + Instruction *VI; + // Look through any existing bitcasts before converting to <N x t2>. + // In the best case, the resulting conversion might be a no-op. + while ((VI = dyn_cast<Instruction>(V)) && + VI->getOpcode() == Instruction::BitCast) + V = VI->getOperand(0); + V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast"); + Scatterer Mid = scatter(&BCI, V); + for (unsigned MidI = 0; MidI < FanOut; ++MidI) + Res[ResI++] = Mid[MidI]; + } + } else { + // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2. 
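The branch above (element count grows) and the one that follows (element count shrinks) are mostly index bookkeeping: widening fans each source component out into several results, narrowing packs several source components into one result. A printf sketch of that mapping for hypothetical <2 x i64> and <4 x i32> bitcasts:

#include <cstdio>

// Index bookkeeping used when scalarizing a bitcast between vector types with
// different element counts (the two branches of visitBitCastInst).
static void mapIndices(unsigned SrcN, unsigned DstN) {
  if (DstN > SrcN) {
    unsigned FanOut = DstN / SrcN;        // each source component expands
    unsigned ResI = 0;
    for (unsigned SrcI = 0; SrcI < SrcN; ++SrcI)
      for (unsigned MidI = 0; MidI < FanOut; ++MidI)
        std::printf("dst[%u] <- piece %u of src[%u]\n", ResI++, MidI, SrcI);
  } else {
    unsigned FanIn = SrcN / DstN;         // source components are grouped
    unsigned SrcI = 0;
    for (unsigned ResI = 0; ResI < DstN; ++ResI)
      for (unsigned MidI = 0; MidI < FanIn; ++MidI)
        std::printf("dst[%u] <- src[%u] as lane %u\n", ResI, SrcI++, MidI);
  }
}

int main() {
  mapIndices(2, 4);  // e.g. <2 x i64> -> <4 x i32>: fan-out of 2
  mapIndices(4, 2);  // e.g. <4 x i32> -> <2 x i64>: fan-in of 2
  return 0;
}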
+ unsigned FanIn = SrcNumElems / DstNumElems; + Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn); + unsigned Op0I = 0; + for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) { + Value *V = UndefValue::get(MidTy); + for (unsigned MidI = 0; MidI < FanIn; ++MidI) + V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI), + BCI.getName() + ".i" + Twine(ResI) + + ".upto" + Twine(MidI)); + Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(), + BCI.getName() + ".i" + Twine(ResI)); + } + } + gather(&BCI, Res); + return true; +} + +bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) { + VectorType *VT = dyn_cast<VectorType>(SVI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + Scatterer Op0 = scatter(&SVI, SVI.getOperand(0)); + Scatterer Op1 = scatter(&SVI, SVI.getOperand(1)); + ValueVector Res; + Res.resize(NumElems); + + for (unsigned I = 0; I < NumElems; ++I) { + int Selector = SVI.getMaskValue(I); + if (Selector < 0) + Res[I] = UndefValue::get(VT->getElementType()); + else if (unsigned(Selector) < Op0.size()) + Res[I] = Op0[Selector]; + else + Res[I] = Op1[Selector - Op0.size()]; + } + gather(&SVI, Res); + return true; +} + +bool Scalarizer::visitPHINode(PHINode &PHI) { + VectorType *VT = dyn_cast<VectorType>(PHI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(PHI.getParent(), &PHI); + ValueVector Res; + Res.resize(NumElems); + + unsigned NumOps = PHI.getNumOperands(); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps, + PHI.getName() + ".i" + Twine(I)); + + for (unsigned I = 0; I < NumOps; ++I) { + Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I)); + BasicBlock *IncomingBlock = PHI.getIncomingBlock(I); + for (unsigned J = 0; J < NumElems; ++J) + cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock); + } + gather(&PHI, Res); + return true; +} + +bool Scalarizer::visitLoadInst(LoadInst &LI) { + if (!ScalarizeLoadStore) + return false; + if (!LI.isSimple()) + return false; + + VectorLayout Layout; + if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout)) + return false; + + unsigned NumElems = Layout.VecTy->getNumElements(); + IRBuilder<> Builder(LI.getParent(), &LI); + Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); + ValueVector Res; + Res.resize(NumElems); + + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateAlignedLoad(Ptr[I], Layout.getElemAlign(I), + LI.getName() + ".i" + Twine(I)); + gather(&LI, Res); + return true; +} + +bool Scalarizer::visitStoreInst(StoreInst &SI) { + if (!ScalarizeLoadStore) + return false; + if (!SI.isSimple()) + return false; + + VectorLayout Layout; + Value *FullValue = SI.getValueOperand(); + if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout)) + return false; + + unsigned NumElems = Layout.VecTy->getNumElements(); + IRBuilder<> Builder(SI.getParent(), &SI); + Scatterer Ptr = scatter(&SI, SI.getPointerOperand()); + Scatterer Val = scatter(&SI, FullValue); + + ValueVector Stores; + Stores.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) { + unsigned Align = Layout.getElemAlign(I); + Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align); + } + transferMetadata(&SI, Stores); + return true; +} + +// Delete the instructions that we scalarized. If a full vector result +// is still needed, recreate it using InsertElements. 
+bool Scalarizer::finish() { + if (Gathered.empty()) + return false; + for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); + GMI != GME; ++GMI) { + Instruction *Op = GMI->first; + ValueVector &CV = *GMI->second; + if (!Op->use_empty()) { + // The value is still needed, so recreate it using a series of + // InsertElements. + Type *Ty = Op->getType(); + Value *Res = UndefValue::get(Ty); + BasicBlock *BB = Op->getParent(); + unsigned Count = Ty->getVectorNumElements(); + IRBuilder<> Builder(BB, Op); + if (isa<PHINode>(Op)) + Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); + for (unsigned I = 0; I < Count; ++I) + Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I), + Op->getName() + ".upto" + Twine(I)); + Res->takeName(Op); + Op->replaceAllUsesWith(Res); + } + Op->eraseFromParent(); + } + Gathered.clear(); + Scattered.clear(); + return true; +} + +FunctionPass *llvm::createScalarizerPass() { + return new Scalarizer(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp new file mode 100644 index 000000000000..6557ce4575dd --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -0,0 +1,776 @@ +//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Loop unrolling may create many similar GEPs for array accesses. +// e.g., a 2-level loop +// +// float a[32][32]; // global variable +// +// for (int i = 0; i < 2; ++i) { +// for (int j = 0; j < 2; ++j) { +// ... +// ... = a[x + i][y + j]; +// ... +// } +// } +// +// will probably be unrolled to: +// +// gep %a, 0, %x, %y; load +// gep %a, 0, %x, %y + 1; load +// gep %a, 0, %x + 1, %y; load +// gep %a, 0, %x + 1, %y + 1; load +// +// LLVM's GVN does not use partial redundancy elimination yet, and is thus +// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs +// significant slowdown in targets with limited addressing modes. For instance, +// because the PTX target does not support the reg+reg addressing mode, the +// NVPTX backend emits PTX code that literally computes the pointer address of +// each GEP, wasting tons of registers. It emits the following PTX for the +// first load and similar PTX for other loads. +// +// mov.u32 %r1, %x; +// mov.u32 %r2, %y; +// mul.wide.u32 %rl2, %r1, 128; +// mov.u64 %rl3, a; +// add.s64 %rl4, %rl3, %rl2; +// mul.wide.u32 %rl5, %r2, 4; +// add.s64 %rl6, %rl4, %rl5; +// ld.global.f32 %f1, [%rl6]; +// +// To reduce the register pressure, the optimization implemented in this file +// merges the common part of a group of GEPs, so we can compute each pointer +// address by adding a simple offset to the common part, saving many registers. +// +// It works by splitting each GEP into a variadic base and a constant offset. +// The variadic base can be computed once and reused by multiple GEPs, and the +// constant offsets can be nicely folded into the reg+immediate addressing mode +// (supported by most targets) without using any extra register. 
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+// load base + 1 * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetic into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetic into the
+// ld.global.f32 instructions, and the resultant PTX uses far fewer registers.
+//
+// mov.u32 %r1, %tid.x;
+// mov.u32 %r2, %tid.y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32 %f2, [%rl6+4]; // much better
+// ld.global.f32 %f3, [%rl6+128]; // much better
+// ld.global.f32 %f4, [%rl6+132]; // much better
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
+    "disable-separate-const-offset-from-gep", cl::init(false),
+    cl::desc("Do not separate the constant offset from a GEP instruction"),
+    cl::Hidden);
+
+namespace {
+
+/// \brief A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer which can be trivially split. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index to (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
+/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5); however, in this case,
+/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
+class ConstantOffsetExtractor {
+ public:
+  /// Extracts a constant offset from the given GEP index. It outputs the
+  /// numeric value of the extracted constant offset (0 if failed), and a
+  /// new index representing the remainder (equal to the original index minus
+  /// the constant offset).
+  /// \p Idx    The given GEP index
+  /// \p NewIdx The new index to replace (output)
+  /// \p DL     The datalayout of the module
+  /// \p GEP    The given GEP
+  static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL,
+                         GetElementPtrInst *GEP);
+  /// Looks for a constant offset without extracting it. The meaning of the
+  /// arguments and the return value are the same as Extract.
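+  /// For example (an illustrative sketch based on the class comment above),
+  /// given the index ((a << 3) | 5) + b, Find reports the constant 5 without
+  /// touching the IR, whereas Extract additionally rewrites the index to
+  /// (a << 3) + b and returns it through NewIdx.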
+  static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP);
+
+ private:
+  ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt)
+      : DL(Layout), IP(InsertionPt) {}
+  /// Searches the expression that computes V for a non-zero constant C s.t.
+  /// V can be reassociated into the form V' + C. If the search is
+  /// successful, returns C and updates UserChain as a def-use chain from C to V;
+  /// otherwise, UserChain is empty.
+  ///
+  /// \p V            The given expression
+  /// \p SignExtended Whether V will be sign-extended in the computation of the
+  ///                 GEP index
+  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+  ///                 GEP index
+  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
+  ///                 an index of an inbounds GEP is guaranteed to be
+  ///                 non-negative. Leveraging this, we can better split
+  ///                 inbounds GEPs.
+  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+  /// A helper function to look into both operands of a binary operator.
+  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+                            bool ZeroExtended);
+  /// After finding the constant offset C from the GEP index I, we build a new
+  /// index I' s.t. I' + C = I. This function builds and returns the new
+  /// index I' according to UserChain produced by function "find".
+  ///
+  /// The building conceptually takes two steps:
+  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+  ///    that computes I
+  /// 2) reassociate the expression tree to the form I' + C.
+  ///
+  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+  /// sext to a, b and 5 so that we have
+  ///   sext(a) + (sext(b) + 5).
+  /// Then, we reassociate it to
+  ///   (sext(a) + sext(b)) + 5.
+  /// Given this form, we know I' is sext(a) + sext(b).
+  Value *rebuildWithoutConstOffset();
+  /// After the first step of rebuilding the GEP index without the constant
+  /// offset, distribute s/zext to the operands of all operators in UserChain.
+  /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+  /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+  ///
+  /// The function also updates UserChain to point to new subexpressions after
+  /// distributing s/zext. e.g., the old UserChain of the above example is
+  /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+  /// and the new UserChain is
+  /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+  /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+  ///
+  /// \p ChainIndex The index to UserChain. ChainIndex is initially
+  ///               UserChain.size() - 1, and is decremented during
+  ///               the recursion.
+  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+  /// Reassociates the GEP index to the form I' + C and returns I'.
+  Value *removeConstOffset(unsigned ChainIndex);
+  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+  /// returns "sext i32 (zext i16 V to i32) to i64".
+  Value *applyExts(Value *V);
+
+  /// Returns true if LHS and RHS have no bits in common, i.e., LHS & RHS == 0.
+  bool NoCommonBits(Value *LHS, Value *RHS) const;
+  /// Computes which bits are known to be one or zero.
+  /// \p KnownOne  Mask of all bits that are known to be one.
+  /// \p KnownZero Mask of all bits that are known to be zero.
+ void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; + /// A helper function that returns whether we can trace into the operands + /// of binary operator BO for a constant offset. + /// + /// \p SignExtended Whether BO is surrounded by sext + /// \p ZeroExtended Whether BO is surrounded by zext + /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound + /// array index. + bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, + bool NonNegative); + + /// The path from the constant offset to the old GEP index. e.g., if the GEP + /// index is "a * b + (c + 5)". After running function find, UserChain[0] will + /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and + /// UserChain[2] will be the entire expression "a * b + (c + 5)". + /// + /// This path helps to rebuild the new GEP index. + SmallVector<User *, 8> UserChain; + /// A data structure used in rebuildWithoutConstOffset. Contains all + /// sext/zext instructions along UserChain. + SmallVector<CastInst *, 16> ExtInsts; + /// The data layout of the module. Used in ComputeKnownBits. + const DataLayout *DL; + Instruction *IP; /// Insertion position of cloned instructions. +}; + +/// \brief A pass that tries to split every GEP in the function into a variadic +/// base and a constant offset. It is a FunctionPass because searching for the +/// constant offset may inspect other basic blocks. +class SeparateConstOffsetFromGEP : public FunctionPass { + public: + static char ID; + SeparateConstOffsetFromGEP() : FunctionPass(ID) { + initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DataLayoutPass>(); + AU.addRequired<TargetTransformInfo>(); + } + + bool doInitialization(Module &M) override { + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (DLP == nullptr) + report_fatal_error("data layout missing"); + DL = &DLP->getDataLayout(); + return false; + } + + bool runOnFunction(Function &F) override; + + private: + /// Tries to split the given GEP into a variadic base and a constant offset, + /// and returns true if the splitting succeeds. + bool splitGEP(GetElementPtrInst *GEP); + /// Finds the constant offset within each index, and accumulates them. This + /// function only inspects the GEP without changing it. The output + /// NeedsExtraction indicates whether we can extract a non-zero constant + /// offset from any index. + int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); + /// Canonicalize array indices to pointer-size integers. This helps to + /// simplify the logic of splitting a GEP. For example, if a + b is a + /// pointer-size integer, we have + /// gep base, a + b = gep (gep base, a), b + /// However, this equality may not hold if the size of a + b is smaller than + /// the pointer size, because LLVM conceptually sign-extends GEP indices to + /// pointer size before computing the address + /// (http://llvm.org/docs/LangRef.html#id181). + /// + /// This canonicalization is very likely already done in clang and + /// instcombine. Therefore, the program will probably remain the same. + /// + /// Returns true if the module changes. 
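+  ///
+  /// For example (illustrative), on a target with 64-bit pointers an i32
+  /// index %i would be widened to "sext i32 %i to i64" (named "idxprom"),
+  /// matching what clang and instcombine normally produce.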
+  ///
+  /// Verified in @i32_add in split-gep.ll
+  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
+  const DataLayout *DL;
+};
+} // anonymous namespace
+
+char SeparateConstOffsetFromGEP::ID = 0;
+INITIALIZE_PASS_BEGIN(
+    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+    "Split GEPs to a variadic base and a constant offset for better CSE", false,
+    false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DataLayoutPass)
+INITIALIZE_PASS_END(
+    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+    "Split GEPs to a variadic base and a constant offset for better CSE", false,
+    false)
+
+FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
+  return new SeparateConstOffsetFromGEP();
+}
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+                                           bool ZeroExtended,
+                                           BinaryOperator *BO,
+                                           bool NonNegative) {
+  // We only consider ADD, SUB and OR, because a non-zero constant found in
+  // expressions composed of these operations can be easily hoisted as a
+  // constant offset by reassociation.
+  if (BO->getOpcode() != Instruction::Add &&
+      BO->getOpcode() != Instruction::Sub &&
+      BO->getOpcode() != Instruction::Or) {
+    return false;
+  }
+
+  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+  if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS))
+    return false;
+
+  // In addition, tracing into BO requires that its surrounding s/zext (if
+  // any) is distributable to both operands.
+  //
+  // Suppose BO = A op B.
+  //  SignExtended | ZeroExtended | Distributable?
+  // --------------+--------------+----------------------------------
+  //        0      |       0      | true because no s/zext exists
+  //        0      |       1      | zext(BO) == zext(A) op zext(B)
+  //        1      |       0      | sext(BO) == sext(A) op sext(B)
+  //        1      |       1      | zext(sext(BO)) ==
+  //               |              |   zext(sext(A)) op zext(sext(B))
+  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+    // If a + b >= 0 and (a >= 0 or b >= 0), then
+    //   sext(a + b) = sext(a) + sext(b)
+    // even if the addition is not marked nsw.
+    //
+    // Leveraging this invariant, we can trace into an sext'ed inbounds GEP
+    // index if the constant offset is non-negative.
+    //
+    // Verified in @sext_add in split-gep.ll.
+    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+      if (!ConstLHS->isNegative())
+        return true;
+    }
+    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+      if (!ConstRHS->isNegative())
+        return true;
+    }
+  }
+
+  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+  if (BO->getOpcode() == Instruction::Add ||
+      BO->getOpcode() == Instruction::Sub) {
+    if (SignExtended && !BO->hasNoSignedWrap())
+      return false;
+    if (ZeroExtended && !BO->hasNoUnsignedWrap())
+      return false;
+  }
+
+  return true;
+}
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+                                                   bool SignExtended,
+                                                   bool ZeroExtended) {
+  // BO being non-negative does not shed light on whether its operands are
+  // non-negative. Clear the NonNegative flag here.
+  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+                              /* NonNegative */ false);
+  // If we found a constant offset in the left operand, stop and return that.
+ // This shortcut might cause us to miss opportunities of combining the + // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9. + // However, such cases are probably already handled by -instcombine, + // given this pass runs after the standard optimizations. + if (ConstantOffset != 0) return ConstantOffset; + ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended, + /* NonNegative */ false); + // If U is a sub operator, negate the constant offset found in the right + // operand. + if (BO->getOpcode() == Instruction::Sub) + ConstantOffset = -ConstantOffset; + return ConstantOffset; +} + +APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, + bool ZeroExtended, bool NonNegative) { + // TODO(jingyue): We could trace into integer/pointer casts, such as + // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only + // integers because it gives good enough results for our benchmarks. + unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth(); + + // We cannot do much with Values that are not a User, such as an Argument. + User *U = dyn_cast<User>(V); + if (U == nullptr) return APInt(BitWidth, 0); + + APInt ConstantOffset(BitWidth, 0); + if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { + // Hooray, we found it! + ConstantOffset = CI->getValue(); + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) { + // Trace into subexpressions for more hoisting opportunities. + if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { + ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); + } + } else if (isa<SExtInst>(V)) { + ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, + ZeroExtended, NonNegative).sext(BitWidth); + } else if (isa<ZExtInst>(V)) { + // As an optimization, we can clear the SignExtended flag because + // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. + // + // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0. + ConstantOffset = + find(U->getOperand(0), /* SignExtended */ false, + /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth); + } + + // If we found a non-zero constant offset, add it to the path for + // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't + // help this optimization. + if (ConstantOffset != 0) + UserChain.push_back(U); + return ConstantOffset; +} + +Value *ConstantOffsetExtractor::applyExts(Value *V) { + Value *Current = V; + // ExtInsts is built in the use-def order. Therefore, we apply them to V + // in the reversed order. + for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) { + if (Constant *C = dyn_cast<Constant>(Current)) { + // If Current is a constant, apply s/zext using ConstantExpr::getCast. + // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt. + Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType()); + } else { + Instruction *Ext = (*I)->clone(); + Ext->setOperand(0, Current); + Ext->insertBefore(IP); + Current = Ext; + } + } + return Current; +} + +Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { + distributeExtsAndCloneChain(UserChain.size() - 1); + // Remove all nullptrs (used to be s/zext) from UserChain. 
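+  //
+  // For example (an illustrative sketch based on the comment on
+  // distributeExtsAndCloneChain above): for zext(sext(a + (b + 5))), the two
+  // trailing entries that used to hold the sext and the zext are now nullptr,
+  // and the loop below simply compacts them away.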
+  unsigned NewSize = 0;
+  for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) {
+    if (*I != nullptr) {
+      UserChain[NewSize] = *I;
+      NewSize++;
+    }
+  }
+  UserChain.resize(NewSize);
+  return removeConstOffset(UserChain.size() - 1);
+}
+
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+  User *U = UserChain[ChainIndex];
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(U));
+    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+  }
+
+  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+    assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+           "We only traced into two types of CastInst: sext and zext");
+    ExtInsts.push_back(Cast);
+    UserChain[ChainIndex] = nullptr;
+    return distributeExtsAndCloneChain(ChainIndex - 1);
+  }
+
+  // Function find only traces into BinaryOperator and CastInst.
+  BinaryOperator *BO = cast<BinaryOperator>(U);
+  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+  BinaryOperator *NewBO = nullptr;
+  if (OpNo == 0) {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+                                   BO->getName(), IP);
+  } else {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+                                   BO->getName(), IP);
+  }
+  return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(UserChain[ChainIndex]));
+    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+  }
+
+  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+  Value *NextInChain = removeConstOffset(ChainIndex - 1);
+  Value *TheOther = BO->getOperand(1 - OpNo);
+
+  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+  // sub-expression to be just TheOther.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+      return TheOther;
+  }
+
+  if (BO->getOpcode() == Instruction::Or) {
+    // Rebuild "or" as "add", because "or" may be invalid for the new
+    // expression.
+    //
+    // For instance, given
+    //   a | (b + 5) where a and b + 5 have no common bits,
+    // we can extract 5 as the constant offset.
+    //
+    // However, reusing the "or" in the new index would give us
+    //   (a | b) + 5
+    // which does not equal a | (b + 5).
+    //
+    // Replacing the "or" with "add" is fine, because
+    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
+    return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1),
+                                     BO->getName(), IP);
+  }
+
+  // We can reuse BO in this case, because the new expression shares the same
+  // instruction type and BO is used at most once.
+  assert(BO->getNumUses() <= 1 &&
+         "distributeExtsAndCloneChain clones each BinaryOperator in "
+         "UserChain, so no one should be used more than "
+         "once");
+  BO->setOperand(OpNo, NextInChain);
+  BO->setHasNoSignedWrap(false);
+  BO->setHasNoUnsignedWrap(false);
+  // Make sure it appears after all instructions we've inserted so far.
+ BO->moveBefore(IP); + return BO; +} + +int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx, + const DataLayout *DL, + GetElementPtrInst *GEP) { + ConstantOffsetExtractor Extractor(DL, GEP); + // Find a non-zero constant offset first. + APInt ConstantOffset = + Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, + GEP->isInBounds()); + if (ConstantOffset != 0) { + // Separates the constant offset from the GEP index. + NewIdx = Extractor.rebuildWithoutConstOffset(); + } + return ConstantOffset.getSExtValue(); +} + +int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL, + GetElementPtrInst *GEP) { + // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. + return ConstantOffsetExtractor(DL, GEP) + .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, + GEP->isInBounds()) + .getSExtValue(); +} + +void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, + APInt &KnownZero) const { + IntegerType *IT = cast<IntegerType>(V->getType()); + KnownOne = APInt(IT->getBitWidth(), 0); + KnownZero = APInt(IT->getBitWidth(), 0); + llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); +} + +bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { + assert(LHS->getType() == RHS->getType() && + "LHS and RHS should have the same type"); + APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; + ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); + ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); + return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); +} + +bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( + GetElementPtrInst *GEP) { + bool Changed = false; + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + gep_type_iterator GTI = gep_type_begin(*GEP); + for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); + I != E; ++I, ++GTI) { + // Skip struct member indices which must be i32. + if (isa<SequentialType>(*GTI)) { + if ((*I)->getType() != IntPtrTy) { + *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP); + Changed = true; + } + } + } + return Changed; +} + +int64_t +SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, + bool &NeedsExtraction) { + NeedsExtraction = false; + int64_t AccumulativeByteOffset = 0; + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + // Tries to extract a constant offset from this GEP index. + int64_t ConstantOffset = + ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP); + if (ConstantOffset != 0) { + NeedsExtraction = true; + // A GEP may have multiple indices. We accumulate the extracted + // constant offset to a byte offset, and later offset the remainder of + // the original GEP with this byte offset. + AccumulativeByteOffset += + ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); + } + } + } + return AccumulativeByteOffset; +} + +bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { + // Skip vector GEPs. + if (GEP->getType()->isVectorTy()) + return false; + + // The backend can already nicely handle the case where all indices are + // constant. 
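+  // For example (illustrative), "gep [32 x [32 x float]]* @a, i64 0, i64 1,
+  // i64 2" already folds to a fixed byte offset, so splitting it would not
+  // save anything.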
+  if (GEP->hasAllConstantIndices())
+    return false;
+
+  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+  bool NeedsExtraction;
+  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+  if (!NeedsExtraction)
+    return Changed;
+  // Before really splitting the GEP, check whether the backend supports the
+  // addressing mode we are about to produce. If not, this splitting probably
+  // won't be beneficial.
+  TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+  if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
+                                 /*BaseGV=*/nullptr, AccumulativeByteOffset,
+                                 /*HasBaseReg=*/true, /*Scale=*/0)) {
+    return Changed;
+  }
+
+  // Remove the constant offset in each GEP index. The resultant GEP computes
+  // the variadic base.
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      Value *NewIdx = nullptr;
+      // Tries to extract a constant offset from this GEP index.
+      int64_t ConstantOffset =
+          ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP);
+      if (ConstantOffset != 0) {
+        assert(NewIdx != nullptr &&
+               "ConstantOffset != 0 implies NewIdx is set");
+        GEP->setOperand(I, NewIdx);
+      }
+    }
+  }
+  // Clear the inbounds attribute because the new index may be off-bound.
+  // e.g.,
+  //
+  //   b = add i64 a, 5
+  //   addr = gep inbounds float* p, i64 b
+  //
+  // is transformed to:
+  //
+  //   addr2 = gep float* p, i64 a
+  //   addr  = gep float* addr2, i64 5
+  //
+  // If a is -4, although the old index b is in bounds, the new index a is
+  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+  // inbounds keyword is not present, the offsets are added to the base
+  // address with silently-wrapping two's complement arithmetic".
+  // Therefore, the final code will be semantically equivalent.
+  //
+  // TODO(jingyue): do some range analysis to keep as many inbounds as
+  // possible. GEPs with inbounds are more friendly to alias analysis.
+  GEP->setIsInBounds(false);
+
+  // Offsets the base with the accumulative byte offset.
+  //
+  //   %gep                        ; the base
+  //   ... %gep ...
+  //
+  // => add the offset
+  //
+  //   %gep2                       ; clone of %gep
+  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+  //   %gep                        ; will be removed
+  //   ... %gep ...
+  //
+  // => replace all uses of %gep with %new.gep and remove %gep
+  //
+  //   %gep2                       ; clone of %gep
+  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+  //   ... %new.gep ...
+  //
+  // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+  // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+  // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+  // type of %gep.
+  //
+  //   %gep2                       ; clone of %gep
+  //   %0       = bitcast %gep2 to i8*
+  //   %uglygep = gep %0, <offset>
+  //   %new.gep = bitcast %uglygep to <type of %gep>
+  //   ... %new.gep ...
+  Instruction *NewGEP = GEP->clone();
+  NewGEP->insertBefore(GEP);
+
+  uint64_t ElementTypeSizeOfGEP =
+      DL->getTypeAllocSize(GEP->getType()->getElementType());
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+    // Very likely. As long as %gep is naturally aligned, the byte offset we
+    // extracted should be a multiple of sizeof(*%gep).
+    // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we
+    // cast ElementTypeSizeOfGEP to signed.
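+    //
+    // For example (illustrative, reusing the a[32][32] example from the file
+    // header): a byte offset of 132 over float elements (4 bytes each) yields
+    // Index = 33, i.e. "gep %gep2, i64 33".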
+ int64_t Index = + AccumulativeByteOffset / static_cast<int64_t>(ElementTypeSizeOfGEP); + NewGEP = GetElementPtrInst::Create( + NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + } else { + // Unlikely but possible. For example, + // #pragma pack(1) + // struct S { + // int a[3]; + // int64 b[8]; + // }; + // #pragma pack() + // + // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After + // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is + // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of + // sizeof(int64). + // + // Emit an uglygep in this case. + Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(), + GEP->getPointerAddressSpace()); + NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); + NewGEP = GetElementPtrInst::Create( + NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), + "uglygep", GEP); + if (GEP->getType() != I8PtrTy) + NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); + } + + GEP->replaceAllUsesWith(NewGEP); + GEP->eraseFromParent(); + + return true; +} + +bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { + if (DisableSeparateConstOffsetFromGEP) + return false; + + bool Changed = false; + for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) { + Changed |= splitGEP(GEP); + } + // No need to split GEP ConstantExprs because all its indices are constant + // already. + } + } + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 8371f6d35279..5d5606ba47b0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -21,23 +21,24 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "simplifycfg" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "simplifycfg" + STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { @@ -46,9 +47,9 @@ struct CFGSimplifyPass : public FunctionPass { CFGSimplifyPass() : FunctionPass(ID) { initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetTransformInfo>(); } }; @@ -71,7 +72,7 @@ FunctionPass *llvm::createCFGSimplificationPass() { static bool mergeEmptyReturnBlocks(Function &F) { bool Changed = false; - BasicBlock *RetBlock = 0; + BasicBlock *RetBlock = nullptr; // Scan all the blocks in the function, looking for empty return blocks. for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) { @@ -79,7 +80,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { // Only look at return blocks. 
ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); - if (Ret == 0) continue; + if (!Ret) continue; // Only look at the block if it is empty or the only other thing in it is a // single PHI node that is the operand to the return. @@ -98,7 +99,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { } // If this is the first returning block, remember it and keep going. - if (RetBlock == 0) { + if (!RetBlock) { RetBlock = &BB; continue; } @@ -119,7 +120,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { // If the canonical return block has no PHI node, create one now. PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin()); - if (RetBlockPHI == 0) { + if (!RetBlockPHI) { Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0); pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock); RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(), @@ -145,7 +146,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - const DataLayout *TD) { + const DataLayout *DL) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -154,7 +155,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, TD)) { + if (SimplifyCFG(BBIt++, TTI, DL)) { LocalChange = true; ++NumSimpl; } @@ -168,11 +169,15 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // simplify the CFG. // bool CFGSimplifyPass::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, TD); + EverChanged |= iterativelySimplifyCFG(F, TTI, DL); // If neither pass changed anything, we're done. 
if (!EverChanged) return false; @@ -186,7 +191,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, TD); + EverChanged = iterativelySimplifyCFG(F, TTI, DL); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index d4595bb373e6..7348c45c5d37 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -12,20 +12,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sink" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Assembly/Writer.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "sink" + STATISTIC(NumSunk, "Number of instructions sunk"); STATISTIC(NumSinkIter, "Number of sinking iterations"); @@ -34,6 +35,7 @@ namespace { DominatorTree *DT; LoopInfo *LI; AliasAnalysis *AA; + const DataLayout *DL; public: static char ID; // Pass identification @@ -41,15 +43,15 @@ namespace { initializeSinkingPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfo>(); } private: @@ -63,7 +65,7 @@ namespace { char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) @@ -77,15 +79,14 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, // This may leave a referencing dbg_value in the original block, before // the definition of the vreg. Dwarf generator handles this although the // user might not get the right info at runtime. - for (Value::use_iterator I = Inst->use_begin(), - E = Inst->use_end(); I != E; ++I) { + for (Use &U : Inst->uses()) { // Determine the block of the use. - Instruction *UseInst = cast<Instruction>(*I); + Instruction *UseInst = cast<Instruction>(U.getUser()); BasicBlock *UseBlock = UseInst->getParent(); if (PHINode *PN = dyn_cast<PHINode>(UseInst)) { // PHI nodes use the operand in the predecessor block, not the block with // the PHI. - unsigned Num = PHINode::getIncomingValueNumForOperand(I.getOperandNo()); + unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo()); UseBlock = PN->getIncomingBlock(Num); } // Check that it dominates. 
@@ -96,9 +97,11 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, } bool Sinking::runOnFunction(Function &F) { - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; bool MadeChange, EverMadeChange = false; @@ -194,7 +197,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. - if (!isSafeToSpeculativelyExecute(Inst)) + if (!isSafeToSpeculativelyExecute(Inst, DL)) return false; // We don't want to sink across a critical edge if we don't dominate the @@ -205,7 +208,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, // Don't sink instructions into a loop. Loop *succ = LI->getLoopFor(SuccToSinkTo); Loop *cur = LI->getLoopFor(Inst->getParent()); - if (succ != 0 && succ != cur) + if (succ != nullptr && succ != cur) return false; } @@ -218,6 +221,13 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, /// instruction out of its current block into a successor. bool Sinking::SinkInstruction(Instruction *Inst, SmallPtrSet<Instruction *, 8> &Stores) { + + // Don't sink static alloca instructions. CodeGen assumes allocas outside the + // entry block are dynamically sized stack objects. + if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst)) + if (AI->isStaticAlloca()) + return false; + // Check if it's safe to move the instruction. if (!isSafeToMove(Inst, AA, Stores)) return false; @@ -232,14 +242,14 @@ bool Sinking::SinkInstruction(Instruction *Inst, // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. - BasicBlock *SuccToSinkTo = 0; + BasicBlock *SuccToSinkTo = nullptr; // Instructions can only be sunk if all their uses are in blocks // dominated by one of the successors. // Look at all the postdominators and see if we can sink it in one. DomTreeNode *DTN = DT->getNode(Inst->getParent()); for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); - I != E && SuccToSinkTo == 0; ++I) { + I != E && SuccToSinkTo == nullptr; ++I) { BasicBlock *Candidate = (*I)->getBlock(); if ((*I)->getIDom()->getBlock() == Inst->getParent() && IsAcceptableTarget(Inst, Candidate)) @@ -249,19 +259,19 @@ bool Sinking::SinkInstruction(Instruction *Inst, // If no suitable postdominator was found, look at all the successors and // decide which one we should sink to, if any. for (succ_iterator I = succ_begin(Inst->getParent()), - E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) { + E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) { if (IsAcceptableTarget(Inst, *I)) SuccToSinkTo = *I; } // If we couldn't find a block to sink to, ignore this instruction. - if (SuccToSinkTo == 0) + if (!SuccToSinkTo) return false; DEBUG(dbgs() << "Sink" << *Inst << " ("; - WriteAsOperand(dbgs(), Inst->getParent(), false); + Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> "; - WriteAsOperand(dbgs(), SuccToSinkTo, false); + SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n"); // Move the instruction. 
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5045ff8fdfda..b9673ed655e0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "structurizecfg" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" @@ -15,12 +14,14 @@ #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/Module.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "structurizecfg" + namespace { // Definition of the complex types used in this pass. @@ -64,14 +65,14 @@ public: /// \brief Start a new query NearestCommonDominator(DominatorTree *DomTree) { DT = DomTree; - Result = 0; + Result = nullptr; } /// \brief Add BB to the resulting dominator void addBlock(BasicBlock *BB, bool Remember = true) { DomTreeNode *Node = DT->getNode(BB); - if (Result == 0) { + if (!Result) { unsigned Numbering = 0; for (;Node;Node = Node->getIDom()) IndexMap[Node] = ++Numbering; @@ -235,18 +236,18 @@ public: } using Pass::doInitialization; - virtual bool doInitialization(Region *R, RGPassManager &RGM); + bool doInitialization(Region *R, RGPassManager &RGM) override; - virtual bool runOnRegion(Region *R, RGPassManager &RGM); + bool runOnRegion(Region *R, RGPassManager &RGM) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "Structurize control flow"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LowerSwitchID); - AU.addRequired<DominatorTree>(); - AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); RegionPass::getAnalysisUsage(AU); } }; @@ -258,8 +259,8 @@ char StructurizeCFG::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) INITIALIZE_PASS_DEPENDENCY(LowerSwitch) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) -INITIALIZE_PASS_DEPENDENCY(RegionInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) @@ -277,10 +278,9 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { /// \brief Build up the general order of nodes void StructurizeCFG::orderNodes() { - scc_iterator<Region *> I = scc_begin(ParentRegion), - E = scc_end(ParentRegion); - for (Order.clear(); I != E; ++I) { - std::vector<RegionNode *> &Nodes = *I; + scc_iterator<Region *> I = scc_begin(ParentRegion); + for (Order.clear(); !I.isAtEnd(); ++I) { + const std::vector<RegionNode *> &Nodes = *I; Order.append(Nodes.begin(), Nodes.end()); } } @@ -326,16 +326,10 @@ Value *StructurizeCFG::invert(Value *Condition) { if (Instruction *Inst = dyn_cast<Instruction>(Condition)) { // Third: Check all the users for an invert BasicBlock *Parent = Inst->getParent(); - for (Value::use_iterator I = Condition->use_begin(), - E = Condition->use_end(); I != E; ++I) { - - Instruction *User = dyn_cast<Instruction>(*I); - if (!User || User->getParent() != Parent) - continue; - - if (match(*I, 
m_Not(m_Specific(Condition)))) - return *I; - } + for (User *U : Condition->users()) + if (Instruction *I = dyn_cast<Instruction>(U)) + if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition)))) + return I; // Last option: Create a new instruction return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); @@ -412,11 +406,11 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { } else { // It's an exit from a sub region - while(R->getParent() != ParentRegion) + while (R->getParent() != ParentRegion) R = R->getParent(); // Edge from inside a subregion to its entry, ignore it - if (R == N) + if (*R == *N) continue; BasicBlock *Entry = R->getEntry(); @@ -460,10 +454,7 @@ void StructurizeCFG::insertConditions(bool Loops) { Value *Default = Loops ? BoolTrue : BoolFalse; SSAUpdater PhiInserter; - for (BranchVector::iterator I = Conds.begin(), - E = Conds.end(); I != E; ++I) { - - BranchInst *Term = *I; + for (BranchInst *Term : Conds) { assert(Term->isConditional()); BasicBlock *Parent = Term->getParent(); @@ -479,7 +470,7 @@ void StructurizeCFG::insertConditions(bool Loops) { NearestCommonDominator Dominator(DT); Dominator.addBlock(Parent, false); - Value *ParentValue = 0; + Value *ParentValue = nullptr; for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); PI != PE; ++PI) { @@ -598,7 +589,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, if (Node->isSubRegion()) { Region *SubRegion = Node->getNodeAs<Region>(); BasicBlock *OldExit = SubRegion->getExit(); - BasicBlock *Dominator = 0; + BasicBlock *Dominator = nullptr; // Find all the edges from the sub region to the exit for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); @@ -685,7 +676,8 @@ BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow, /// \brief Set the previous node void StructurizeCFG::setPrevNode(BasicBlock *BB) { - PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; + PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) + : nullptr; } /// \brief Does BB dominate all the predicates of Node ? @@ -706,7 +698,7 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) { bool Dominated = false; // Regionentry is always true - if (PrevNode == 0) + if (!PrevNode) return true; for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); @@ -813,11 +805,11 @@ void StructurizeCFG::createFlow() { Conditions.clear(); LoopConds.clear(); - PrevNode = 0; + PrevNode = nullptr; Visited.clear(); while (!Order.empty()) { - handleLoops(EntryDominatesExit, 0); + handleLoops(EntryDominatesExit, nullptr); } if (PrevNode) @@ -830,25 +822,19 @@ void StructurizeCFG::createFlow() { /// no longer dominate all their uses. 
Not sure if this is really nessasary void StructurizeCFG::rebuildSSA() { SSAUpdater Updater; - for (Region::block_iterator I = ParentRegion->block_begin(), - E = ParentRegion->block_end(); - I != E; ++I) { - - BasicBlock *BB = *I; + for (const auto &BB : ParentRegion->blocks()) for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { bool Initialized = false; - for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { - - Next = I->getNext(); - - Instruction *User = cast<Instruction>(I->getUser()); + for (auto I = II->use_begin(), E = II->use_end(); I != E;) { + Use &U = *I++; + Instruction *User = cast<Instruction>(U.getUser()); if (User->getParent() == BB) { continue; } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(*I) == BB) + if (UserPN->getIncomingBlock(U) == BB) continue; } @@ -862,10 +848,9 @@ void StructurizeCFG::rebuildSSA() { Updater.AddAvailableValue(BB, II); Initialized = true; } - Updater.RewriteUseAfterInsertions(*I); + Updater.RewriteUseAfterInsertions(U); } } - } } /// \brief Run the transformation for each region found @@ -876,7 +861,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { Func = R->getEntry()->getParent(); ParentRegion = R; - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); orderNodes(); collectInfos(); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 9fb8ddc3d2c1..b7580255150c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -50,32 +50,35 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "tailcallelim" + STATISTIC(NumEliminated, "Number of tail calls removed"); STATISTIC(NumRetDuped, "Number of return duplicated"); STATISTIC(NumAccumAdded, "Number of accumulators introduced"); @@ -89,11 +92,14 @@ namespace { initializeTailCallElimPass(*PassRegistry::getPassRegistry()); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const; + void getAnalysisUsage(AnalysisUsage &AU) const override; - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; private: + bool runTRE(Function &F); + bool markTails(Function &F, bool &AllCallsAreTailCalls); + CallInst *FindTRECandidate(Instruction *I, bool CannotTailCallElimCallsMarkedTail); bool 
EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, @@ -131,52 +137,253 @@ void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfo>(); } -/// CanTRE - Scan the specified basic block for alloca instructions. -/// If it contains any that are variable-sized or not in the entry block, -/// returns false. -static bool CanTRE(AllocaInst *AI) { - // Because of PR962, we don't TRE allocas outside the entry block. - - // If this alloca is in the body of the function, or if it is a variable - // sized allocation, we cannot tail call eliminate calls marked 'tail' - // with this mechanism. - BasicBlock *BB = AI->getParent(); - return BB == &BB->getParent()->getEntryBlock() && - isa<ConstantInt>(AI->getArraySize()); +/// \brief Scan the specified function for alloca instructions. +/// If it contains any dynamic allocas, returns false. +static bool CanTRE(Function &F) { + // Because of PR962, we don't TRE dynamic allocas. + for (auto &BB : F) { + for (auto &I : BB) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { + if (!AI->isStaticAlloca()) + return false; + } + } + } + + return true; } -namespace { -struct AllocaCaptureTracker : public CaptureTracker { - AllocaCaptureTracker() : Captured(false) {} +bool TailCallElim::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; - void tooManyUses() LLVM_OVERRIDE { Captured = true; } + bool AllCallsAreTailCalls = false; + bool Modified = markTails(F, AllCallsAreTailCalls); + if (AllCallsAreTailCalls) + Modified |= runTRE(F); + return Modified; +} - bool shouldExplore(Use *U) LLVM_OVERRIDE { - Value *V = U->getUser(); - if (isa<CallInst>(V) || isa<InvokeInst>(V)) - UsesAlloca.insert(V); - return true; +namespace { +struct AllocaDerivedValueTracker { + // Start at a root value and walk its use-def chain to mark calls that use the + // value or a derived value in AllocaUsers, and places where it may escape in + // EscapePoints. + void walk(Value *Root) { + SmallVector<Use *, 32> Worklist; + SmallPtrSet<Use *, 32> Visited; + + auto AddUsesToWorklist = [&](Value *V) { + for (auto &U : V->uses()) { + if (!Visited.insert(&U)) + continue; + Worklist.push_back(&U); + } + }; + + AddUsesToWorklist(Root); + + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + Instruction *I = cast<Instruction>(U->getUser()); + + switch (I->getOpcode()) { + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(I); + bool IsNocapture = !CS.isCallee(U) && + CS.doesNotCapture(CS.getArgumentNo(U)); + callUsesLocalStack(CS, IsNocapture); + if (IsNocapture) { + // If the alloca-derived argument is passed in as nocapture, then it + // can't propagate to the call's return. That would be capturing. + continue; + } + break; + } + case Instruction::Load: { + // The result of a load is not alloca-derived (unless an alloca has + // otherwise escaped, but this is a local analysis). + continue; + } + case Instruction::Store: { + if (U->getOperandNo() == 0) + EscapePoints.insert(I); + continue; // Stores have no users to analyze. + } + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::PHI: + case Instruction::Select: + case Instruction::AddrSpaceCast: + break; + default: + EscapePoints.insert(I); + break; + } + + AddUsesToWorklist(I); + } } - bool captured(Use *U) LLVM_OVERRIDE { - if (isa<ReturnInst>(U->getUser())) - return false; - Captured = true; - return true; + void callUsesLocalStack(CallSite CS, bool IsNocapture) { + // Add it to the list of alloca users. 
+ AllocaUsers.insert(CS.getInstruction()); + + // If it's nocapture then it can't capture this alloca. + if (IsNocapture) + return; + + // If it can write to memory, it can leak the alloca value. + if (!CS.onlyReadsMemory()) + EscapePoints.insert(CS.getInstruction()); } - bool Captured; - SmallPtrSet<const Value *, 16> UsesAlloca; + SmallPtrSet<Instruction *, 32> AllocaUsers; + SmallPtrSet<Instruction *, 32> EscapePoints; }; -} // end anonymous namespace +} -bool TailCallElim::runOnFunction(Function &F) { +bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { + if (F.callsFunctionThatReturnsTwice()) + return false; + AllCallsAreTailCalls = true; + + // The local stack holds all alloca instructions and all byval arguments. + AllocaDerivedValueTracker Tracker; + for (Argument &Arg : F.args()) { + if (Arg.hasByValAttr()) + Tracker.walk(&Arg); + } + for (auto &BB : F) { + for (auto &I : BB) + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) + Tracker.walk(AI); + } + + bool Modified = false; + + // Track whether a block is reachable after an alloca has escaped. Blocks that + // contain the escaping instruction will be marked as being visited without an + // escaped alloca, since that is how the block began. + enum VisitType { + UNVISITED, + UNESCAPED, + ESCAPED + }; + DenseMap<BasicBlock *, VisitType> Visited; + + // We propagate the fact that an alloca has escaped from block to successor. + // Visit the blocks that are propagating the escapedness first. To do this, we + // maintain two worklists. + SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped; + + // We may enter a block and visit it thinking that no alloca has escaped yet, + // then see an escape point and go back around a loop edge and come back to + // the same block twice. Because of this, we defer setting tail on calls when + // we first encounter them in a block. Every entry in this list does not + // statically use an alloca via use-def chain analysis, but may find an alloca + // through other means if the block turns out to be reachable after an escape + // point. + SmallVector<CallInst *, 32> DeferredTails; + + BasicBlock *BB = &F.getEntryBlock(); + VisitType Escaped = UNESCAPED; + do { + for (auto &I : *BB) { + if (Tracker.EscapePoints.count(&I)) + Escaped = ESCAPED; + + CallInst *CI = dyn_cast<CallInst>(&I); + if (!CI || CI->isTailCall()) + continue; + + if (CI->doesNotAccessMemory()) { + // A call to a readnone function whose arguments are all things computed + // outside this function can be marked tail. Even if you stored the + // alloca address into a global, a readnone function can't load the + // global anyhow. + // + // Note that this runs whether we know an alloca has escaped or not. If + // it has, then we can't trust Tracker.AllocaUsers to be accurate. 
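+        //
+        // For example (illustrative), a readnone call whose operands are
+        // constants or incoming function arguments, such as
+        //   %r = call i32 @llvm.ctpop.i32(i32 %n)
+        // where %n is a parameter of F, can be marked tail here even if some
+        // alloca has escaped earlier in the function.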
+ bool SafeToTail = true; + for (auto &Arg : CI->arg_operands()) { + if (isa<Constant>(Arg.getUser())) + continue; + if (Argument *A = dyn_cast<Argument>(Arg.getUser())) + if (!A->hasByValAttr()) + continue; + SafeToTail = false; + break; + } + if (SafeToTail) { + emitOptimizationRemark( + F.getContext(), "tailcallelim", F, CI->getDebugLoc(), + "marked this readnone call a tail call candidate"); + CI->setTailCall(); + Modified = true; + continue; + } + } + + if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + DeferredTails.push_back(CI); + } else { + AllCallsAreTailCalls = false; + } + } + + for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) { + auto &State = Visited[SuccBB]; + if (State < Escaped) { + State = Escaped; + if (State == ESCAPED) + WorklistEscaped.push_back(SuccBB); + else + WorklistUnescaped.push_back(SuccBB); + } + } + + if (!WorklistEscaped.empty()) { + BB = WorklistEscaped.pop_back_val(); + Escaped = ESCAPED; + } else { + BB = nullptr; + while (!WorklistUnescaped.empty()) { + auto *NextBB = WorklistUnescaped.pop_back_val(); + if (Visited[NextBB] == UNESCAPED) { + BB = NextBB; + Escaped = UNESCAPED; + break; + } + } + } + } while (BB); + + for (CallInst *CI : DeferredTails) { + if (Visited[CI->getParent()] != ESCAPED) { + // If the escape point was part way through the block, calls after the + // escape point wouldn't have been put into DeferredTails. + emitOptimizationRemark(F.getContext(), "tailcallelim", F, + CI->getDebugLoc(), + "marked this call a tail call candidate"); + CI->setTailCall(); + Modified = true; + } else { + AllCallsAreTailCalls = false; + } + } + + return Modified; +} + +bool TailCallElim::runTRE(Function &F) { // If this function is a varargs function, we won't be able to PHI the args // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return false; TTI = &getAnalysis<TargetTransformInfo>(); - BasicBlock *OldEntry = 0; + BasicBlock *OldEntry = nullptr; bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; bool MadeChange = false; @@ -185,39 +392,23 @@ bool TailCallElim::runOnFunction(Function &F) { // marked with the 'tail' attribute, because doing so would cause the stack // size to increase (real TRE would deallocate variable sized allocas, TRE // doesn't). - bool CanTRETailMarkedCall = true; - - // Find calls that can be marked tail. - AllocaCaptureTracker ACT; - for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - CanTRETailMarkedCall &= CanTRE(AI); - PointerMayBeCaptured(AI, &ACT); - // If any allocas are captured, exit. - if (ACT.Captured) - return false; - } - } - } + bool CanTRETailMarkedCall = CanTRE(F); - // Second pass, change any tail recursive calls to loops. + // Change any tail recursive calls to loops. // // FIXME: The code generator produces really bad code when an 'escaping // alloca' is changed from being a static alloca to being a dynamic alloca. // Until this is resolved, disable this transformation if that would ever // happen. This bug is PR962. 
- if (ACT.UsesAlloca.empty()) { - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { - bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall); - if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, - TailCallsAreMarkedTail, ArgumentPHIs, - !CanTRETailMarkedCall); - MadeChange |= Change; - } + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, !CanTRETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall); + MadeChange |= Change; } } @@ -226,34 +417,13 @@ bool TailCallElim::runOnFunction(Function &F) { // with themselves. Check to see if we did and clean up our mess if so. This // occurs when a function passes an argument straight through to its tail // call. - if (!ArgumentPHIs.empty()) { - for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { - PHINode *PN = ArgumentPHIs[i]; - - // If the PHI Node is a dynamic constant, replace it with the value it is. - if (Value *PNV = SimplifyInstruction(PN)) { - PN->replaceAllUsesWith(PNV); - PN->eraseFromParent(); - } - } - } + for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { + PHINode *PN = ArgumentPHIs[i]; - // At this point, we know that the function does not have any captured - // allocas. If additionally the function does not call setjmp, mark all calls - // in the function that do not access stack memory with the tail keyword. This - // implies ensuring that there does not exist any path from a call that takes - // in an alloca but does not capture it and the call which we wish to mark - // with "tail". - if (!F.callsFunctionThatReturnsTwice()) { - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - if (CallInst *CI = dyn_cast<CallInst>(I)) { - if (!ACT.UsesAlloca.count(CI)) { - CI->setTailCall(); - MadeChange = true; - } - } - } + // If the PHI Node is a dynamic constant, replace it with the value it is. + if (Value *PNV = SimplifyInstruction(PN)) { + PN->replaceAllUsesWith(PNV); + PN->eraseFromParent(); } } @@ -340,11 +510,11 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { // static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { Function *F = CI->getParent()->getParent(); - Value *ReturnedValue = 0; + Value *ReturnedValue = nullptr; for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) { ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()); - if (RI == 0 || RI == IgnoreRI) continue; + if (RI == nullptr || RI == IgnoreRI) continue; // We can only perform this transformation if the value returned is // evaluatable at the start of the initial invocation of the function, @@ -352,10 +522,10 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { // Value *RetOp = RI->getOperand(0); if (!isDynamicConstant(RetOp, CI, RI)) - return 0; + return nullptr; if (ReturnedValue && RetOp != ReturnedValue) - return 0; // Cannot transform if differing values are returned. + return nullptr; // Cannot transform if differing values are returned. 
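As an illustration of the argument-PHI clean-up above, consider this hypothetical function (not taken from the patch): an argument that is forwarded unchanged on every recursive call produces a PHI whose incoming values are all equivalent once the recursion has been turned into a loop, and SimplifyInstruction folds that PHI back to the original argument.

// After TRE, 'N' gets a PHI node whose value genuinely varies around the
// loop, while the PHI created for 'Limit' only ever sees the incoming
// argument and is removed by the SimplifyInstruction clean-up of ArgumentPHIs.
int countDown(int N, int Limit) {
  if (N <= Limit)
    return Limit;
  return countDown(N - 1, Limit); // 'Limit' is passed straight through
}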
ReturnedValue = RetOp; } return ReturnedValue; @@ -367,23 +537,23 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { /// Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { - if (!I->isAssociative() || !I->isCommutative()) return 0; + if (!I->isAssociative() || !I->isCommutative()) return nullptr; assert(I->getNumOperands() == 2 && "Associative/commutative operations should have 2 args!"); // Exactly one operand should be the result of the call instruction. if ((I->getOperand(0) == CI && I->getOperand(1) == CI) || (I->getOperand(0) != CI && I->getOperand(1) != CI)) - return 0; + return nullptr; // The only user of this instruction we allow is a single return instruction. - if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back())) - return 0; + if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back())) + return nullptr; // Ok, now we have to check all of the other return instructions in this // function. If they return non-constants or differing values, then we cannot // transform the function safely. - return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI); + return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI); } static Instruction *FirstNonDbg(BasicBlock::iterator I) { @@ -399,11 +569,11 @@ TailCallElim::FindTRECandidate(Instruction *TI, Function *F = BB->getParent(); if (&BB->front() == TI) // Make sure there is something before the terminator. - return 0; + return nullptr; // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. - CallInst *CI = 0; + CallInst *CI = nullptr; BasicBlock::iterator BBI = TI; while (true) { CI = dyn_cast<CallInst>(BBI); @@ -411,14 +581,14 @@ TailCallElim::FindTRECandidate(Instruction *TI, break; if (BBI == BB->begin()) - return 0; // Didn't find a potential tail call. + return nullptr; // Didn't find a potential tail call. --BBI; } // If this call is marked as a tail call, and if there are dynamic allocas in // the function, we cannot perform this optimization. if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) - return 0; + return nullptr; // As a special case, detect code like this: // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call @@ -426,7 +596,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && FirstNonDbg(BB->front()) == CI && - FirstNonDbg(llvm::next(BB->begin())) == TI && + FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that @@ -438,7 +608,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, for (; I != E && FI != FE; ++I, ++FI) if (*I != &*FI) break; if (I == E && FI == FE) - return 0; + return nullptr; } return CI; @@ -459,8 +629,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // which is different to the constant returned by other return instructions // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a // special case of accumulator recursion, the operation being "return C". - Value *AccumulatorRecursionEliminationInitVal = 0; - Instruction *AccumulatorRecursionInstr = 0; + Value *AccumulatorRecursionEliminationInitVal = nullptr; + Instruction *AccumulatorRecursionInstr = nullptr; // Ok, we found a potential tail call. 
   // We can currently only transform the
   // tail call if all of the instructions between the call and the return are
@@ -490,8 +660,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   // accumulator recursion variable eliminated.
   if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
       !isa<UndefValue>(Ret->getReturnValue()) &&
-      AccumulatorRecursionEliminationInitVal == 0 &&
-      !getCommonReturnValue(0, CI)) {
+      AccumulatorRecursionEliminationInitVal == nullptr &&
+      !getCommonReturnValue(nullptr, CI)) {
     // One case remains that we are able to handle: the current return
     // instruction returns a constant, and all other return instructions
     // return a different constant.
@@ -507,9 +677,12 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   BasicBlock *BB = Ret->getParent();
   Function *F = BB->getParent();
 
+  emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(),
+                         "transforming tail recursion to loop");
+
   // OK! We can transform this tail call. If this is the first one found,
   // create the new entry block, allowing us to branch back to the old entry.
-  if (OldEntry == 0) {
+  if (!OldEntry) {
     OldEntry = &F->getEntryBlock();
     BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
     NewEntry->takeName(OldEntry);
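To make the accumulator-recursion path above concrete, here is a hypothetical example, not part of this diff: the multiply is associative and commutative, its only user is the return, and every other return produces the same constant, so CanTransformAccumulatorRecursion accepts it and the constant 1 becomes the accumulator's initial value.

// Before: the recursive call feeds an associative, commutative binary op.
unsigned factorialRec(unsigned N) {
  if (N <= 1)
    return 1;                      // common return value = accumulator init
  return N * factorialRec(N - 1);  // accumulator recursion
}

// Roughly the shape of the function after the transformation: a branch back
// to the old entry block with the accumulator carried in a PHI node.
unsigned factorialLoop(unsigned N) {
  unsigned Acc = 1;
  for (; N > 1; --N)
    Acc *= N;
  return Acc;
}

Note that the loop accumulates the factors in the opposite order from the recursion (N, N-1, ... instead of 1, 2, ..., N), which is only legal because the operation is required to be associative and commutative.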