Diffstat (limited to 'lib/Transforms')
43 files changed, 2074 insertions, 1959 deletions
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 40a87e880d7b..89f213e2ac36 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -64,7 +64,7 @@ namespace { CallGraphSCCPass::getAnalysisUsage(AU); } - virtual bool runOnSCC(std::vector<CallGraphNode *> &SCC); + virtual bool runOnSCC(CallGraphSCC &SCC); static char ID; // Pass identification, replacement for typeid explicit ArgPromotion(unsigned maxElements = 3) : CallGraphSCCPass(&ID), maxElements(maxElements) {} @@ -91,20 +91,21 @@ Pass *llvm::createArgumentPromotionPass(unsigned maxElements) { return new ArgPromotion(maxElements); } -bool ArgPromotion::runOnSCC(std::vector<CallGraphNode *> &SCC) { +bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { bool Changed = false, LocalChange; do { // Iterate until we stop promoting from this SCC. LocalChange = false; // Attempt to promote arguments from all functions in this SCC. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - if (CallGraphNode *CGN = PromoteArguments(SCC[i])) { + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + if (CallGraphNode *CGN = PromoteArguments(*I)) { LocalChange = true; - SCC[i] = CGN; + SCC.ReplaceNode(*I, CGN); } + } Changed |= LocalChange; // Remember that we changed something. } while (LocalChange); - + return Changed; } @@ -873,8 +874,14 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, NF_CGN->stealCalledFunctionsFrom(CG[F]); - // Now that the old function is dead, delete it. - delete CG.removeFunctionFromModule(F); + // Now that the old function is dead, delete it. If there is a dangling + // reference to the CallgraphNode, just leave the dead function around for + // someone else to nuke. + CallGraphNode *CGN = CG[F]; + if (CGN->getNumReferences() == 0) + delete CG.removeFunctionFromModule(CGN); + else + F->setLinkage(Function::ExternalLinkage); return NF_CGN; } diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 92bef3bb75e9..65483e8fed63 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -23,3 +23,5 @@ add_llvm_library(LLVMipo StripSymbols.cpp StructRetPromotion.cpp ) + +target_link_libraries (LLVMipo LLVMScalarOpts LLVMInstCombine) diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 227602d19f56..6443dd4f47a6 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -243,6 +243,9 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } + if (MDNode *N = Call->getDbgMetadata()) + New->setDbgMetadata(N); + Args.clear(); if (!Call->use_empty()) @@ -694,18 +697,6 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()); - // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which - // have zero fixed arguments. - // - // Note that we apply this hack for a vararg fuction that does not have any - // arguments anymore, but did have them before (so don't bother fixing - // functions that were already broken wrt CWriter). - bool ExtraArgHack = false; - if (Params.empty() && FTy->isVarArg() && FTy->getNumParams() != 0) { - ExtraArgHack = true; - Params.push_back(Type::getInt32Ty(F->getContext())); - } - // Create the new function type based on the recomputed parameters. 
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); @@ -755,9 +746,6 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); } - if (ExtraArgHack) - Args.push_back(UndefValue::get(Type::getInt32Ty(F->getContext()))); - // Push any varargs arguments on the list. Don't forget their attributes. for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) { Args.push_back(*I); @@ -785,6 +773,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } + if (MDNode *N = Call->getDbgMetadata()) + New->setDbgMetadata(N); + Args.clear(); if (!Call->use_empty()) { diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 298d5cf39162..9bd7af61c531 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -44,20 +44,20 @@ namespace { FunctionAttrs() : CallGraphSCCPass(&ID) {} // runOnSCC - Analyze the SCC, performing the transformation if possible. - bool runOnSCC(std::vector<CallGraphNode *> &SCC); + bool runOnSCC(CallGraphSCC &SCC); // AddReadAttrs - Deduce readonly/readnone attributes for the SCC. - bool AddReadAttrs(const std::vector<CallGraphNode *> &SCC); + bool AddReadAttrs(const CallGraphSCC &SCC); // AddNoCaptureAttrs - Deduce nocapture attributes for the SCC. - bool AddNoCaptureAttrs(const std::vector<CallGraphNode *> &SCC); + bool AddNoCaptureAttrs(const CallGraphSCC &SCC); // IsFunctionMallocLike - Does this function allocate new memory? bool IsFunctionMallocLike(Function *F, SmallPtrSet<Function*, 8> &) const; // AddNoAliasAttrs - Deduce noalias attributes for the SCC. - bool AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC); + bool AddNoAliasAttrs(const CallGraphSCC &SCC); virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -123,19 +123,19 @@ bool FunctionAttrs::PointsToLocalMemory(Value *V) { } /// AddReadAttrs - Deduce readonly/readnone attributes for the SCC. -bool FunctionAttrs::AddReadAttrs(const std::vector<CallGraphNode *> &SCC) { +bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { SmallPtrSet<Function*, 8> SCCNodes; // Fill SCCNodes with the elements of the SCC. Used for quickly // looking up whether a given CallGraphNode is in this SCC. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - SCCNodes.insert(SCC[i]->getFunction()); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + SCCNodes.insert((*I)->getFunction()); // Check if any of the functions in the SCC read or write memory. If they // write memory then they can't be marked readnone or readonly. bool ReadsMemory = false; - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - Function *F = SCC[i]->getFunction(); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); if (F == 0) // External node - may write memory. Just give up. @@ -210,8 +210,8 @@ bool FunctionAttrs::AddReadAttrs(const std::vector<CallGraphNode *> &SCC) { // Success! Functions in this SCC do not access memory, or only read memory. // Give them the appropriate attribute. bool MadeChange = false; - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - Function *F = SCC[i]->getFunction(); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); if (F->doesNotAccessMemory()) // Already perfect! 
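A number of passes in this diff (ArgPromotion above, FunctionAttrs here, and later PruneEH, SRETPromotion, and the Inliner) migrate from runOnSCC(std::vector<CallGraphNode*>&) to the new CallGraphSCC interface. Below is a minimal sketch of the migrated shape; it relies only on members that appear in the hunks themselves (begin, end, ReplaceNode, isSingular), while the header paths and pass boilerplate are assumptions based on the LLVM API of that era, not taken from this patch.

// Sketch of a CallGraphSCCPass using the new CallGraphSCC interface.
// Header locations and registration details are assumed (LLVM ~2.7 era).
#include "llvm/CallGraphSCCPass.h"
#include "llvm/Analysis/CallGraph.h"
using namespace llvm;

namespace {
struct ExampleSCCPass : public CallGraphSCCPass {
  static char ID;
  ExampleSCCPass() : CallGraphSCCPass(&ID) {}

  virtual bool runOnSCC(CallGraphSCC &SCC) {
    bool Changed = false;
    for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
      CallGraphNode *CGN = *I;
      Function *F = CGN->getFunction();
      if (F == 0)
        continue;                 // External/indirect node, nothing to do.
      // ... transform F here; if a replacement node was created, tell the
      // SCC about it instead of assigning through the old vector:
      // SCC.ReplaceNode(CGN, NewCGN);  Changed = true;
    }
    // The old "SCC.size() == 1" special case becomes SCC.isSingular().
    return Changed;
  }
};
}
char ExampleSCCPass::ID = 0;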
@@ -239,13 +239,13 @@ bool FunctionAttrs::AddReadAttrs(const std::vector<CallGraphNode *> &SCC) { } /// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC. -bool FunctionAttrs::AddNoCaptureAttrs(const std::vector<CallGraphNode *> &SCC) { +bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { bool Changed = false; // Check each function in turn, determining which pointer arguments are not // captured. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - Function *F = SCC[i]->getFunction(); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); if (F == 0) // External node - skip it; @@ -334,18 +334,18 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, } /// AddNoAliasAttrs - Deduce noalias attributes for the SCC. -bool FunctionAttrs::AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC) { +bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { SmallPtrSet<Function*, 8> SCCNodes; // Fill SCCNodes with the elements of the SCC. Used for quickly // looking up whether a given CallGraphNode is in this SCC. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - SCCNodes.insert(SCC[i]->getFunction()); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + SCCNodes.insert((*I)->getFunction()); // Check each function in turn, determining which functions return noalias // pointers. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - Function *F = SCC[i]->getFunction(); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); if (F == 0) // External node - skip it; @@ -370,8 +370,8 @@ bool FunctionAttrs::AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC) { } bool MadeChange = false; - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - Function *F = SCC[i]->getFunction(); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy()) continue; @@ -383,7 +383,7 @@ bool FunctionAttrs::AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC) { return MadeChange; } -bool FunctionAttrs::runOnSCC(std::vector<CallGraphNode *> &SCC) { +bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) { bool Changed = AddReadAttrs(SCC); Changed |= AddNoCaptureAttrs(SCC); Changed |= AddNoAliasAttrs(SCC); diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index ddff5ef8b36c..b429213a7db3 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -143,7 +143,8 @@ struct GlobalStatus { static bool SafeToDestroyConstant(const Constant *C) { if (isa<GlobalValue>(C)) return false; - for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; ++UI) + for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; + ++UI) if (const Constant *CU = dyn_cast<Constant>(*UI)) { if (!SafeToDestroyConstant(CU)) return false; } else @@ -158,7 +159,8 @@ static bool SafeToDestroyConstant(const Constant *C) { /// static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, SmallPtrSet<const PHINode*, 16> &PHIUsers) { - for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI) + for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; + ++UI) if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(*UI)) { GS.HasNonInstructionUser = true; @@ -185,7 +187,8 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, // 
value, not an aggregate), keep more specific information about // stores. if (GS.StoredType != GlobalStatus::isStored) { - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(SI->getOperand(1))){ + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>( + SI->getOperand(1))) { Value *StoredVal = SI->getOperand(0); if (StoredVal == GV->getInitializer()) { if (GS.StoredType < GlobalStatus::isInitializerStored) @@ -610,62 +613,69 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { /// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified /// value will trap if the value is dynamically null. PHIs keeps track of any /// phi nodes we've seen to avoid reprocessing them. -static bool AllUsesOfValueWillTrapIfNull(Value *V, - SmallPtrSet<PHINode*, 8> &PHIs) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI) - if (isa<LoadInst>(*UI)) { +static bool AllUsesOfValueWillTrapIfNull(const Value *V, + SmallPtrSet<const PHINode*, 8> &PHIs) { + for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; + ++UI) { + const User *U = *UI; + + if (isa<LoadInst>(U)) { // Will trap. - } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) { + } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(0) == V) { - //cerr << "NONTRAPPING USE: " << **UI; + //cerr << "NONTRAPPING USE: " << *U; return false; // Storing the value. } - } else if (CallInst *CI = dyn_cast<CallInst>(*UI)) { + } else if (const CallInst *CI = dyn_cast<CallInst>(U)) { if (CI->getCalledValue() != V) { - //cerr << "NONTRAPPING USE: " << **UI; + //cerr << "NONTRAPPING USE: " << *U; return false; // Not calling the ptr } - } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) { + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) { if (II->getCalledValue() != V) { - //cerr << "NONTRAPPING USE: " << **UI; + //cerr << "NONTRAPPING USE: " << *U; return false; // Not calling the ptr } - } else if (BitCastInst *CI = dyn_cast<BitCastInst>(*UI)) { + } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) { if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false; - } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(*UI)) { + } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false; - } else if (PHINode *PN = dyn_cast<PHINode>(*UI)) { + } else if (const PHINode *PN = dyn_cast<PHINode>(U)) { // If we've already seen this phi node, ignore it, it has already been // checked. if (PHIs.insert(PN) && !AllUsesOfValueWillTrapIfNull(PN, PHIs)) return false; - } else if (isa<ICmpInst>(*UI) && + } else if (isa<ICmpInst>(U) && isa<ConstantPointerNull>(UI->getOperand(1))) { // Ignore icmp X, null } else { - //cerr << "NONTRAPPING USE: " << **UI; + //cerr << "NONTRAPPING USE: " << *U; return false; } + } return true; } /// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads /// from GV will trap if the loaded value is null. Note that this also permits /// comparisons of the loaded value against null, as a special case. 
-static bool AllUsesOfLoadedValueWillTrapIfNull(GlobalVariable *GV) { - for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI!=E; ++UI) - if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) { - SmallPtrSet<PHINode*, 8> PHIs; +static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { + for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E; ++UI) { + const User *U = *UI; + + if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + SmallPtrSet<const PHINode*, 8> PHIs; if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) return false; - } else if (isa<StoreInst>(*UI)) { + } else if (isa<StoreInst>(U)) { // Ignore stores to the global. } else { // We don't know or understand this user, bail out. - //cerr << "UNKNOWN USER OF GLOBAL!: " << **UI; + //cerr << "UNKNOWN USER OF GLOBAL!: " << *U; return false; } - + } return true; } @@ -682,16 +692,17 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { Changed = true; } } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) { - if (I->getOperand(0) == V) { + CallSite CS(I); + if (CS.getCalledValue() == V) { // Calling through the pointer! Turn into a direct call, but be careful // that the pointer is not also being passed as an argument. - I->setOperand(0, NewV); + CS.setCalledFunction(NewV); Changed = true; bool PassedAsArg = false; - for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i) - if (I->getOperand(i) == V) { + for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) + if (CS.getArgument(i) == V) { PassedAsArg = true; - I->setOperand(i, NewV); + CS.setArgument(i, NewV); } if (PassedAsArg) { @@ -938,29 +949,31 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, /// to make sure that there are no complex uses of V. We permit simple things /// like dereferencing the pointer, but not storing through the address, unless /// it is to the specified global. -static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Instruction *V, - GlobalVariable *GV, - SmallPtrSet<PHINode*, 8> &PHIs) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ - Instruction *Inst = cast<Instruction>(*UI); - +static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, + const GlobalVariable *GV, + SmallPtrSet<const PHINode*, 8> &PHIs) { + for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + const Instruction *Inst = cast<Instruction>(*UI); + if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) { continue; // Fine, ignore. } - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (SI->getOperand(0) == V && SI->getOperand(1) != GV) return false; // Storing the pointer itself... bad. continue; // Otherwise, storing through it, or storing into GV... fine. } - if (isa<GetElementPtrInst>(Inst)) { + // Must index into the array and into the struct. + if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) { if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs)) return false; continue; } - if (PHINode *PN = dyn_cast<PHINode>(Inst)) { + if (const PHINode *PN = dyn_cast<PHINode>(Inst)) { // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI // cycles. 
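The recursive use-walkers in GlobalOpt above all guard against PHI cycles by threading a visited set: PHIs.insert(PN) reports true only the first time a node is seen, so already-visited PHIs are treated as fine instead of being recursed into forever. A self-contained sketch of that pattern, using stand-in node types rather than the real LLVM classes:

// Self-contained sketch of the visited-set recursion used above.
// Node is a stand-in for LLVM's Value/PHINode.
#include <set>
#include <vector>

struct Node { std::vector<Node*> Users; bool IsPhi; };

static bool allUsesAreOk(const Node *V, std::set<const Node*> &VisitedPhis) {
  for (std::vector<Node*>::const_iterator I = V->Users.begin(),
       E = V->Users.end(); I != E; ++I) {
    const Node *U = *I;
    if (U->IsPhi) {
      // insert().second is true only on the first visit; a PHI we have
      // already examined (or are in the middle of examining) is assumed OK,
      // which breaks cycles exactly like `PHIs.insert(PN) && !AllUses...`.
      if (VisitedPhis.insert(U).second && !allUsesAreOk(U, VisitedPhis))
        return false;
    } else {
      // ... classify the user; return false on anything unsafe ...
    }
  }
  return true;
}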
if (PHIs.insert(PN)) @@ -969,7 +982,7 @@ static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Instruction *V, continue; } - if (BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) { + if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) { if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs)) return false; continue; @@ -1029,11 +1042,12 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, /// of a load) are simple enough to perform heap SRA on. This permits GEP's /// that index through the array and struct field, icmps of null, and PHIs. static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, - SmallPtrSet<const PHINode*, 32> &LoadUsingPHIs, - SmallPtrSet<const PHINode*, 32> &LoadUsingPHIsPerLoad) { + SmallPtrSet<const PHINode*, 32> &LoadUsingPHIs, + SmallPtrSet<const PHINode*, 32> &LoadUsingPHIsPerLoad) { // We permit two users of the load: setcc comparing against the null // pointer, and a getelementptr of a specific form. - for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ + for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; + ++UI) { const Instruction *User = cast<Instruction>(*UI); // Comparison against null is ok. @@ -1084,8 +1098,8 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, Instruction *StoredVal) { SmallPtrSet<const PHINode*, 32> LoadUsingPHIs; SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad; - for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E; - ++UI) + for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E; ++UI) if (const LoadInst *LI = dyn_cast<LoadInst>(*UI)) { if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs, LoadUsingPHIsPerLoad)) @@ -1098,8 +1112,8 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, // that all inputs the to the PHI nodes are in the same equivalence sets. // Check to verify that all operands of the PHIs are either PHIS that can be // transformed, loads from GV, or MI itself. - for (SmallPtrSet<const PHINode*, 32>::const_iterator I = LoadUsingPHIs.begin(), - E = LoadUsingPHIs.end(); I != E; ++I) { + for (SmallPtrSet<const PHINode*, 32>::const_iterator I = LoadUsingPHIs.begin() + , E = LoadUsingPHIs.end(); I != E; ++I) { const PHINode *PN = *I; for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) { Value *InVal = PN->getIncomingValue(op); @@ -1448,6 +1462,9 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, const Type *AllocTy, Module::global_iterator &GVI, TargetData *TD) { + if (!TD) + return false; + // If this is a malloc of an abstract type, don't touch it. if (!AllocTy->isSized()) return false; @@ -1466,66 +1483,66 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // malloc to be stored into the specified global, loaded setcc'd, and // GEP'd. These are all things we could transform to using the global // for. - { - SmallPtrSet<PHINode*, 8> PHIs; - if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs)) - return false; - } + SmallPtrSet<const PHINode*, 8> PHIs; + if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs)) + return false; // If we have a global that is only initialized with a fixed size malloc, // transform the program to use global memory instead of malloc'd memory. // This eliminates dynamic allocation, avoids an indirection accessing the // data, and exposes the resultant global to further GlobalOpt. 
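To make the rationale just stated concrete, here is a hypothetical source-level picture of the malloc-to-global rewrite; the pass works on IR, not C++ source, and the names below are made up.

// Hypothetical illustration of turning a global that is only ever
// initialized from one fixed-size malloc into plain global storage.
#include <cstdlib>

struct Cache { int Slots[256]; };          // 1024 bytes, under the 2048 limit

// Before: an internal global holding a pointer that is stored exactly once
// from a fixed-size malloc, so every access pays an extra indirection.
static Cache *TheCache;
void Init()     { TheCache = (Cache *)std::malloc(sizeof(Cache)); }
int  Get(int i) { return TheCache->Slots[i]; }

// After (roughly): the storage itself becomes a global and the pointer is
// gone; a boolean "was it initialized" global is only introduced when the
// original code also compared the pointer against null.
static Cache TheCacheStorage;
void InitOpt()     { /* allocation eliminated */ }
int  GetOpt(int i) { return TheCacheStorage.Slots[i]; }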
// We cannot optimize the malloc if we cannot determine malloc array size. - if (Value *NElems = getMallocArraySize(CI, TD, true)) { - if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) - // Restrict this transformation to only working on small allocations - // (2048 bytes currently), as we don't want to introduce a 16M global or - // something. - if (TD && - NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) { - GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, TD); - return true; - } + Value *NElems = getMallocArraySize(CI, TD, true); + if (!NElems) + return false; + + if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) + // Restrict this transformation to only working on small allocations + // (2048 bytes currently), as we don't want to introduce a 16M global or + // something. + if (NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) { + GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, TD); + return true; + } - // If the allocation is an array of structures, consider transforming this - // into multiple malloc'd arrays, one for each field. This is basically - // SRoA for malloc'd memory. - - // If this is an allocation of a fixed size array of structs, analyze as a - // variable size array. malloc [100 x struct],1 -> malloc struct, 100 - if (NElems == ConstantInt::get(CI->getOperand(1)->getType(), 1)) - if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy)) - AllocTy = AT->getElementType(); + // If the allocation is an array of structures, consider transforming this + // into multiple malloc'd arrays, one for each field. This is basically + // SRoA for malloc'd memory. + + // If this is an allocation of a fixed size array of structs, analyze as a + // variable size array. malloc [100 x struct],1 -> malloc struct, 100 + if (NElems == ConstantInt::get(CI->getOperand(1)->getType(), 1)) + if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy)) + AllocTy = AT->getElementType(); - if (const StructType *AllocSTy = dyn_cast<StructType>(AllocTy)) { - // This the structure has an unreasonable number of fields, leave it - // alone. - if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 && - AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) { - - // If this is a fixed size array, transform the Malloc to be an alloc of - // structs. malloc [100 x struct],1 -> malloc struct, 100 - if (const ArrayType *AT = - dyn_cast<ArrayType>(getMallocAllocatedType(CI))) { - const Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); - unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); - Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); - Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); - Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, - AllocSize, NumElements, - CI->getName()); - Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); - CI->replaceAllUsesWith(Cast); - CI->eraseFromParent(); - CI = dyn_cast<BitCastInst>(Malloc) ? - extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc); - } - - GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true),TD); - return true; - } + const StructType *AllocSTy = dyn_cast<StructType>(AllocTy); + if (!AllocSTy) + return false; + + // This the structure has an unreasonable number of fields, leave it + // alone. 
+ if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 && + AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) { + + // If this is a fixed size array, transform the Malloc to be an alloc of + // structs. malloc [100 x struct],1 -> malloc struct, 100 + if (const ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) { + const Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); + unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); + Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); + Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); + Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, + AllocSize, NumElements, + CI->getName()); + Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); + CI->replaceAllUsesWith(Cast); + CI->eraseFromParent(); + CI = dyn_cast<BitCastInst>(Malloc) ? + extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc); } + + GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true),TD); + return true; } return false; @@ -1689,8 +1706,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (GS.StoredType == GlobalStatus::isStoredOnce && GS.StoredOnceValue) DEBUG(dbgs() << " StoredOnceValue = " << *GS.StoredOnceValue << "\n"); if (GS.AccessingFunction && !GS.HasMultipleAccessingFunctions) - DEBUG(dbgs() << " AccessingFunction = " << GS.AccessingFunction->getName() - << "\n"); + DEBUG(dbgs() << " AccessingFunction = " + << GS.AccessingFunction->getName() << "\n"); DEBUG(dbgs() << " HasMultipleAccessingFunctions = " << GS.HasMultipleAccessingFunctions << "\n"); DEBUG(dbgs() << " HasNonInstructionUser = " @@ -2278,10 +2295,10 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal, } // Cannot handle inline asm. - if (isa<InlineAsm>(CI->getOperand(0))) return false; + if (isa<InlineAsm>(CI->getCalledValue())) return false; // Resolve function pointers. - Function *Callee = dyn_cast<Function>(getVal(Values, CI->getOperand(0))); + Function *Callee = dyn_cast<Function>(getVal(Values, CI->getCalledValue())); if (!Callee) return false; // Cannot resolve. SmallVector<Constant*, 8> Formals; @@ -2500,7 +2517,7 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { continue; // Do not perform the transform if multiple aliases potentially target the - // aliasee. This check also ensures that it is safe to replace the section + // aliasee. This check also ensures that it is safe to replace the section // and other attributes of the aliasee with those of the alias. if (!hasOneUse) continue; diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp index 83e8624fe09d..340b70eb0268 100644 --- a/lib/Transforms/IPO/IPO.cpp +++ b/lib/Transforms/IPO/IPO.cpp @@ -62,6 +62,15 @@ void LLVMAddPruneEHPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPruneEHPass()); } +void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createIPSCCPPass()); +} + +void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { + unwrap(PM)->add(createInternalizePass(AllButMain != 0)); +} + + void LLVMAddRaiseAllocationsPass(LLVMPassManagerRef PM) { // FIXME: Remove in LLVM 3.0. } diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 03ec72c03043..b785bb0a9390 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -73,16 +73,14 @@ InlinedArrayAllocasTy; /// available from other functions inlined into the caller. 
If we are able to /// inline this call site we attempt to reuse already available allocas or add /// any new allocas to the set if not possible. -static bool InlineCallIfPossible(CallSite CS, CallGraph &CG, - const TargetData *TD, +static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, InlinedArrayAllocasTy &InlinedArrayAllocas) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); // Try to inline the function. Get the list of static allocas that were // inlined. - SmallVector<AllocaInst*, 16> StaticAllocas; - if (!InlineFunction(CS, &CG, TD, &StaticAllocas)) + if (!InlineFunction(CS, IFI)) return false; // If the inlined function had a higher stack protection level than the @@ -119,9 +117,9 @@ static bool InlineCallIfPossible(CallSite CS, CallGraph &CG, // Loop over all the allocas we have so far and see if they can be merged with // a previously inlined alloca. If not, remember that we had it. - for (unsigned AllocaNo = 0, e = StaticAllocas.size(); + for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size(); AllocaNo != e; ++AllocaNo) { - AllocaInst *AI = StaticAllocas[AllocaNo]; + AllocaInst *AI = IFI.StaticAllocas[AllocaNo]; // Don't bother trying to merge array allocations (they will usually be // canonicalized to be an allocation *of* an array), or allocations whose @@ -292,14 +290,29 @@ bool Inliner::shouldInline(CallSite CS) { return true; } -bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) { +/// InlineHistoryIncludes - Return true if the specified inline history ID +/// indicates an inline history that includes the specified function. +static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, + const SmallVectorImpl<std::pair<Function*, int> > &InlineHistory) { + while (InlineHistoryID != -1) { + assert(unsigned(InlineHistoryID) < InlineHistory.size() && + "Invalid inline history ID"); + if (InlineHistory[InlineHistoryID].first == F) + return true; + InlineHistoryID = InlineHistory[InlineHistoryID].second; + } + return false; +} + + +bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraph>(); const TargetData *TD = getAnalysisIfAvailable<TargetData>(); SmallPtrSet<Function*, 8> SCCFunctions; DEBUG(dbgs() << "Inliner visiting SCC:"); - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - Function *F = SCC[i]->getFunction(); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); if (F) SCCFunctions.insert(F); DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE")); } @@ -307,10 +320,16 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) { // Scan through and identify all call sites ahead of time so that we only // inline call sites in the original functions, not call sites that result // from inlining other functions. - SmallVector<CallSite, 16> CallSites; - - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - Function *F = SCC[i]->getFunction(); + SmallVector<std::pair<CallSite, int>, 16> CallSites; + + // When inlining a callee produces new call sites, we want to keep track of + // the fact that they were inlined from the callee. This allows us to avoid + // infinite inlining in some obscure cases. To represent this, we use an + // index into the InlineHistory vector. 
+ SmallVector<std::pair<Function*, int>, 8> InlineHistory; + + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); if (!F) continue; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) @@ -327,22 +346,27 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) { if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration()) continue; - CallSites.push_back(CS); + CallSites.push_back(std::make_pair(CS, -1)); } } DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n"); + // If there are no calls in this function, exit early. + if (CallSites.empty()) + return false; + // Now that we have all of the call sites, move the ones to functions in the // current SCC to the end of the list. unsigned FirstCallInSCC = CallSites.size(); for (unsigned i = 0; i < FirstCallInSCC; ++i) - if (Function *F = CallSites[i].getCalledFunction()) + if (Function *F = CallSites[i].first.getCalledFunction()) if (SCCFunctions.count(F)) std::swap(CallSites[i--], CallSites[--FirstCallInSCC]); InlinedArrayAllocasTy InlinedArrayAllocas; + InlineFunctionInfo InlineInfo(&CG, TD); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -353,7 +377,7 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) { // Iterate over the outer loop because inlining functions can cause indirect // calls to become direct calls. for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) { - CallSite CS = CallSites[CSi]; + CallSite CS = CallSites[CSi].first; Function *Caller = CS.getCaller(); Function *Callee = CS.getCalledFunction(); @@ -375,16 +399,42 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) { // We can only inline direct calls to non-declarations. if (Callee == 0 || Callee->isDeclaration()) continue; + // If this call sites was obtained by inlining another function, verify + // that the include path for the function did not include the callee + // itself. If so, we'd be recursively inlinling the same function, + // which would provide the same callsites, which would cause us to + // infinitely inline. + int InlineHistoryID = CallSites[CSi].second; + if (InlineHistoryID != -1 && + InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) + continue; + + // If the policy determines that we should inline this function, // try to do so. if (!shouldInline(CS)) continue; - // Attempt to inline the function... - if (!InlineCallIfPossible(CS, CG, TD, InlinedArrayAllocas)) + // Attempt to inline the function. + if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas)) continue; ++NumInlined; - + + // If inlining this function gave us any new call sites, throw them + // onto our worklist to process. They are useful inline candidates. + if (!InlineInfo.InlinedCalls.empty()) { + // Create a new inline history entry for this, so that we remember + // that these new callsites came about due to inlining Callee. + int NewHistoryID = InlineHistory.size(); + InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID)); + + for (unsigned i = 0, e = InlineInfo.InlinedCalls.size(); + i != e; ++i) { + Value *Ptr = InlineInfo.InlinedCalls[i]; + CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID)); + } + } + // Update the cached cost info with the inlined call. 
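The inline-history bookkeeping introduced above is easy to misread, so here is a self-contained sketch of the data structure on its own: each entry records which function was inlined to produce a group of call sites plus the index of the entry it was itself produced from, so the history behind any call site is a short parent-pointer chain ending at -1. The Function type below is a stand-in, not llvm::Function.

// Self-contained sketch of the inline-history chain built above.
#include <cassert>
#include <utility>
#include <vector>

struct Function {};
typedef std::vector<std::pair<Function *, int> > InlineHistoryTy;

// Mirrors InlineHistoryIncludes from the patch: walk the parent chain and
// report whether F was already inlined somewhere on the path to this site.
static bool historyIncludes(Function *F, int ID, const InlineHistoryTy &H) {
  while (ID != -1) {
    assert(unsigned(ID) < H.size() && "Invalid inline history ID");
    if (H[ID].first == F)
      return true;
    ID = H[ID].second;
  }
  return false;
}

When inlining a callee whose call site carried history ID CurID exposes new call sites, the patch pushes a fresh entry (Callee, CurID) and queues the new sites with that entry's index, so a later attempt to inline Callee again through one of them fails the walk above and is skipped, which is what prevents the infinite-inlining cases mentioned in the comments.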
growCachedCostInfo(Caller, Callee); } @@ -417,7 +467,7 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) { // swap/pop_back for efficiency, but do not use it if doing so would // move a call site to a function in this SCC before the // 'FirstCallInSCC' barrier. - if (SCC.size() == 1) { + if (SCC.isSingular()) { std::swap(CallSites[CSi], CallSites.back()); CallSites.pop_back(); } else { diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index f8ec72227380..07525eaada5e 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -120,15 +120,17 @@ Function* PartialInliner::unswitchFunction(Function* F) { // Extract the body of the if. Function* extractedFunction = ExtractCodeRegion(DT, toExtract); + InlineFunctionInfo IFI; + // Inline the top-level if test into all callers. std::vector<User*> Users(duplicateFunction->use_begin(), duplicateFunction->use_end()); for (std::vector<User*>::iterator UI = Users.begin(), UE = Users.end(); UI != UE; ++UI) - if (CallInst* CI = dyn_cast<CallInst>(*UI)) - InlineFunction(CI); - else if (InvokeInst* II = dyn_cast<InvokeInst>(*UI)) - InlineFunction(II); + if (CallInst *CI = dyn_cast<CallInst>(*UI)) + InlineFunction(CI, IFI); + else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) + InlineFunction(II, IFI); // Ditch the duplicate, since we're done with it, and rewrite all remaining // users (function pointers, etc.) back to the original function. diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index 161246bc2598..de6099cc1daa 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -40,7 +40,7 @@ namespace { PruneEH() : CallGraphSCCPass(&ID) {} // runOnSCC - Analyze the SCC, performing the transformation if possible. - bool runOnSCC(std::vector<CallGraphNode *> &SCC); + bool runOnSCC(CallGraphSCC &SCC); bool SimplifyFunction(Function *F); void DeleteBasicBlock(BasicBlock *BB); @@ -54,20 +54,20 @@ X("prune-eh", "Remove unused exception handling info"); Pass *llvm::createPruneEHPass() { return new PruneEH(); } -bool PruneEH::runOnSCC(std::vector<CallGraphNode *> &SCC) { +bool PruneEH::runOnSCC(CallGraphSCC &SCC) { SmallPtrSet<CallGraphNode *, 8> SCCNodes; CallGraph &CG = getAnalysis<CallGraph>(); bool MadeChange = false; // Fill SCCNodes with the elements of the SCC. Used for quickly // looking up whether a given CallGraphNode is in this SCC. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - SCCNodes.insert(SCC[i]); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + SCCNodes.insert(*I); // First pass, scan all of the functions in the SCC, simplifying them // according to what we know. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - if (Function *F = SCC[i]->getFunction()) + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + if (Function *F = (*I)->getFunction()) MadeChange |= SimplifyFunction(F); // Next, check to see if any callees might throw or if there are any external @@ -78,9 +78,9 @@ bool PruneEH::runOnSCC(std::vector<CallGraphNode *> &SCC) { // obviously the SCC might throw. 
// bool SCCMightUnwind = false, SCCMightReturn = false; - for (unsigned i = 0, e = SCC.size(); - (!SCCMightUnwind || !SCCMightReturn) && i != e; ++i) { - Function *F = SCC[i]->getFunction(); + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); + (!SCCMightUnwind || !SCCMightReturn) && I != E; ++I) { + Function *F = (*I)->getFunction(); if (F == 0) { SCCMightUnwind = true; SCCMightReturn = true; @@ -132,7 +132,7 @@ bool PruneEH::runOnSCC(std::vector<CallGraphNode *> &SCC) { // If the SCC doesn't unwind or doesn't throw, note this fact. if (!SCCMightUnwind || !SCCMightReturn) - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { Attributes NewAttributes = Attribute::None; if (!SCCMightUnwind) @@ -140,19 +140,20 @@ bool PruneEH::runOnSCC(std::vector<CallGraphNode *> &SCC) { if (!SCCMightReturn) NewAttributes |= Attribute::NoReturn; - const AttrListPtr &PAL = SCC[i]->getFunction()->getAttributes(); + Function *F = (*I)->getFunction(); + const AttrListPtr &PAL = F->getAttributes(); const AttrListPtr &NPAL = PAL.addAttr(~0, NewAttributes); if (PAL != NPAL) { MadeChange = true; - SCC[i]->getFunction()->setAttributes(NPAL); + F->setAttributes(NPAL); } } - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { // Convert any invoke instructions to non-throwing functions in this node // into call instructions with a branch. This makes the exception blocks // dead. - if (Function *F = SCC[i]->getFunction()) + if (Function *F = (*I)->getFunction()) MadeChange |= SimplifyFunction(F); } diff --git a/lib/Transforms/IPO/StructRetPromotion.cpp b/lib/Transforms/IPO/StructRetPromotion.cpp index dda32d02c873..473e83cec45e 100644 --- a/lib/Transforms/IPO/StructRetPromotion.cpp +++ b/lib/Transforms/IPO/StructRetPromotion.cpp @@ -48,7 +48,7 @@ namespace { CallGraphSCCPass::getAnalysisUsage(AU); } - virtual bool runOnSCC(std::vector<CallGraphNode *> &SCC); + virtual bool runOnSCC(CallGraphSCC &SCC); static char ID; // Pass identification, replacement for typeid SRETPromotion() : CallGraphSCCPass(&ID) {} @@ -69,12 +69,12 @@ Pass *llvm::createStructRetPromotionPass() { return new SRETPromotion(); } -bool SRETPromotion::runOnSCC(std::vector<CallGraphNode *> &SCC) { +bool SRETPromotion::runOnSCC(CallGraphSCC &SCC) { bool Changed = false; - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - if (CallGraphNode *NewNode = PromoteReturn(SCC[i])) { - SCC[i] = NewNode; + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + if (CallGraphNode *NewNode = PromoteReturn(*I)) { + SCC.ReplaceNode(*I, NewNode); Changed = true; } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 3fb3de75075e..8586054fce0e 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1735,16 +1735,12 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { - if (RHS->isOne() && Op0->hasOneUse()) { + if (RHS->isOne() && Op0->hasOneUse()) // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B - if (ICmpInst *ICI = dyn_cast<ICmpInst>(Op0)) - return new ICmpInst(ICI->getInversePredicate(), - ICI->getOperand(0), ICI->getOperand(1)); - - if (FCmpInst *FCI = dyn_cast<FCmpInst>(Op0)) - return new FCmpInst(FCI->getInversePredicate(), - FCI->getOperand(0), FCI->getOperand(1)); - } + if (CmpInst 
*CI = dyn_cast<CmpInst>(Op0)) + return CmpInst::Create(CI->getOpcode(), + CI->getInversePredicate(), + CI->getOperand(0), CI->getOperand(1)); // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp). if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index e025b053765b..38e7b6ec2d1f 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -59,29 +59,32 @@ static unsigned EnforceKnownAlignment(Value *V, // Treat this like a bitcast. return EnforceKnownAlignment(U->getOperand(0), Align, PrefAlign); } - break; + return Align; + } + case Instruction::Alloca: { + AllocaInst *AI = cast<AllocaInst>(V); + // If there is a requested alignment and if this is an alloca, round up. + if (AI->getAlignment() >= PrefAlign) + return AI->getAlignment(); + AI->setAlignment(PrefAlign); + return PrefAlign; } } if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) { // If there is a large requested alignment and we can, bump up the alignment // of the global. - if (!GV->isDeclaration()) { - if (GV->getAlignment() >= PrefAlign) - Align = GV->getAlignment(); - else { - GV->setAlignment(PrefAlign); - Align = PrefAlign; - } - } - } else if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) { - // If there is a requested alignment and if this is an alloca, round up. - if (AI->getAlignment() >= PrefAlign) - Align = AI->getAlignment(); - else { - AI->setAlignment(PrefAlign); - Align = PrefAlign; - } + if (GV->isDeclaration()) return Align; + + if (GV->getAlignment() >= PrefAlign) + return GV->getAlignment(); + // We can only increase the alignment of the global if it has no alignment + // specified or if it is not assigned a section. If it is assigned a + // section, the global could be densely packed with other objects in the + // section, increasing the alignment could cause padding issues. + if (!GV->hasSection() || GV->getAlignment() == 0) + GV->setAlignment(PrefAlign); + return GV->getAlignment(); } return Align; @@ -287,7 +290,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { const Type *Tys[3] = { CI.getOperand(1)->getType(), CI.getOperand(2)->getType(), CI.getOperand(3)->getType() }; - CI.setOperand(0, + CI.setCalledFunction( Intrinsic::getDeclaration(M, MemCpyID, Tys, 3)); Changed = true; } @@ -526,7 +529,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // X + 0 -> {X, false} if (RHS->isZero()) { Constant *V[] = { - UndefValue::get(II->getOperand(0)->getType()), + UndefValue::get(II->getCalledValue()->getType()), ConstantInt::getFalse(II->getContext()) }; Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index a68fc6df4768..eb7628e741bd 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1323,7 +1323,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) { // Okay, we have (bitcast (shuffle ..)). Check to see if this is - // a bitconvert to a vector with the same # elts. + // a bitcast to a vector with the same # elts. 
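The restructured EnforceKnownAlignment above also adds a new rule for globals: an explicit section means the object may be packed tightly against its neighbours, so its alignment is not raised. A self-contained sketch of just that decision, written against stand-in fields rather than the real llvm::GlobalValue accessors:

// Sketch of the global-alignment rule described above.
struct GlobalStandIn {
  unsigned Alignment;      // 0 means "no explicit alignment requested"
  bool IsDeclaration;
  bool HasSection;
};

static unsigned enforceGlobalAlignment(GlobalStandIn &GV, unsigned PrefAlign,
                                       unsigned Align) {
  if (GV.IsDeclaration)
    return Align;                    // We don't define it, so don't touch it.
  if (GV.Alignment >= PrefAlign)
    return GV.Alignment;             // Already aligned enough.
  // Only bump the alignment when no explicit alignment was requested or the
  // global is not pinned to a section: objects in an explicit section may be
  // densely packed, and raising alignment there could introduce padding.
  if (!GV.HasSection || GV.Alignment == 0)
    GV.Alignment = PrefAlign;
  return GV.Alignment;
}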
if (SVI->hasOneUse() && DestTy->isVectorTy() && cast<VectorType>(DestTy)->getNumElements() == SVI->getType()->getNumElements() && diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 72fd5588d120..861cf92d281e 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1325,7 +1325,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one. if (RHSV == 0 && isa<ConstantInt>(BO->getOperand(1)) &&BO->hasOneUse()){ const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue(); - if (V.sgt(APInt(V.getBitWidth(), 1)) && V.isPowerOf2()) { + if (V.sgt(1) && V.isPowerOf2()) { Value *NewRem = Builder->CreateURem(BO->getOperand(0), BO->getOperand(1), BO->getName()); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 2fc932594f7b..c958cde0917b 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -13,6 +13,7 @@ #include "InstCombine.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Analysis/InstructionSimplify.h" using namespace llvm; using namespace PatternMatch; @@ -421,49 +422,30 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *TrueVal = SI.getTrueValue(); Value *FalseVal = SI.getFalseValue(); - // select true, X, Y -> X - // select false, X, Y -> Y - if (ConstantInt *C = dyn_cast<ConstantInt>(CondVal)) - return ReplaceInstUsesWith(SI, C->getZExtValue() ? TrueVal : FalseVal); - - // select C, X, X -> X - if (TrueVal == FalseVal) - return ReplaceInstUsesWith(SI, TrueVal); - - if (isa<UndefValue>(TrueVal)) // select C, undef, X -> X - return ReplaceInstUsesWith(SI, FalseVal); - if (isa<UndefValue>(FalseVal)) // select C, X, undef -> X - return ReplaceInstUsesWith(SI, TrueVal); - if (isa<UndefValue>(CondVal)) { // select undef, X, Y -> X or Y - if (isa<Constant>(TrueVal)) - return ReplaceInstUsesWith(SI, TrueVal); - else - return ReplaceInstUsesWith(SI, FalseVal); - } + if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, TD)) + return ReplaceInstUsesWith(SI, V); if (SI.getType()->isIntegerTy(1)) { if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) { if (C->getZExtValue()) { // Change: A = select B, true, C --> A = or B, C return BinaryOperator::CreateOr(CondVal, FalseVal); - } else { - // Change: A = select B, false, C --> A = and !B, C - Value *NotCond = - InsertNewInstBefore(BinaryOperator::CreateNot(CondVal, - "not."+CondVal->getName()), SI); - return BinaryOperator::CreateAnd(NotCond, FalseVal); } + // Change: A = select B, false, C --> A = and !B, C + Value *NotCond = + InsertNewInstBefore(BinaryOperator::CreateNot(CondVal, + "not."+CondVal->getName()), SI); + return BinaryOperator::CreateAnd(NotCond, FalseVal); } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) { if (C->getZExtValue() == false) { // Change: A = select B, C, false --> A = and B, C return BinaryOperator::CreateAnd(CondVal, TrueVal); - } else { - // Change: A = select B, C, true --> A = or !B, C - Value *NotCond = - InsertNewInstBefore(BinaryOperator::CreateNot(CondVal, - "not."+CondVal->getName()), SI); - return BinaryOperator::CreateOr(NotCond, TrueVal); } + // Change: A = select B, C, true --> A = or !B, C + Value *NotCond = + InsertNewInstBefore(BinaryOperator::CreateNot(CondVal, + "not."+CondVal->getName()), SI); + 
return BinaryOperator::CreateOr(NotCond, TrueVal); } // select a, b, a -> a&b diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile index ea4a1158acc7..e527be25decb 100644 --- a/lib/Transforms/Makefile +++ b/lib/Transforms/Makefile @@ -13,7 +13,7 @@ PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello include $(LEVEL)/Makefile.config # No support for plugins on windows targets -ifeq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW)) +ifeq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW Minix)) PARALLEL_DIRS := $(filter-out Hello, $(PARALLEL_DIRS)) endif diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 683c1c2fd708..57788642f23f 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_library(LLVMScalarOpts Reassociate.cpp Reg2Mem.cpp SCCP.cpp - SCCVN.cpp Scalar.cpp ScalarReplAggregates.cpp SimplifyCFGPass.cpp diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 642d59da044f..321def7eb619 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1217,7 +1217,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, return ConstantFoldLoadFromConstPtr(Src, &TD); } - +namespace { struct AvailableValueInBlock { /// BB - The basic block in question. @@ -1291,6 +1291,8 @@ struct AvailableValueInBlock { } }; +} + /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site. @@ -1333,8 +1335,8 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } -static bool isLifetimeStart(Instruction *Inst) { - if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) +static bool isLifetimeStart(const Instruction *Inst) { + if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) return II->getIntrinsicID() == Intrinsic::lifetime_start; return false; } diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 6605666e45d1..36bea6795542 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -97,6 +97,8 @@ namespace { private: + void EliminateIVComparisons(); + void EliminateIVRemainders(); void RewriteNonIntegerIVs(Loop *L); ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, @@ -133,6 +135,24 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, BasicBlock *ExitingBlock, BranchInst *BI, SCEVExpander &Rewriter) { + // Special case: If the backedge-taken count is a UDiv, it's very likely a + // UDiv that ScalarEvolution produced in order to compute a precise + // expression, rather than a UDiv from the user's code. If we can't find a + // UDiv in the code with some simple searching, assume the former and forego + // rewriting the loop. 
+ if (isa<SCEVUDivExpr>(BackedgeTakenCount)) { + ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!OrigCond) return 0; + const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); + R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); + if (R != BackedgeTakenCount) { + const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); + L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); + if (L != BackedgeTakenCount) + return 0; + } + } + // If the exiting block is not the same as the backedge block, we must compare // against the preincremented value, otherwise we prefer to compare against // the post-incremented value. @@ -142,12 +162,12 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, // Add one to the "backedge-taken" count to get the trip count. // If this addition may overflow, we have to be more pessimistic and // cast the induction variable before doing the add. - const SCEV *Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType()); + const SCEV *Zero = SE->getConstant(BackedgeTakenCount->getType(), 0); const SCEV *N = SE->getAddExpr(BackedgeTakenCount, - SE->getIntegerSCEV(1, BackedgeTakenCount->getType())); + SE->getConstant(BackedgeTakenCount->getType(), 1)); if ((isa<SCEVConstant>(N) && !N->isZero()) || - SE->isLoopGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { + SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { // No overflow. Cast the sum. RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType()); } else { @@ -155,7 +175,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, IndVar->getType()); RHS = SE->getAddExpr(RHS, - SE->getIntegerSCEV(1, IndVar->getType())); + SE->getConstant(IndVar->getType(), 1)); } // The BackedgeTaken expression contains the number of times that the @@ -336,6 +356,116 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { SE->forgetLoop(L); } +void IndVarSimplify::EliminateIVComparisons() { + SmallVector<WeakVH, 16> DeadInsts; + + // Look for ICmp users. + for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + IVStrideUse &UI = *I; + ICmpInst *ICmp = dyn_cast<ICmpInst>(UI.getUser()); + if (!ICmp) continue; + + bool Swapped = UI.getOperandValToReplace() == ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + if (Swapped) Pred = ICmpInst::getSwappedPredicate(Pred); + + // Get the SCEVs for the ICmp operands. + const SCEV *S = IU->getReplacementExpr(UI); + const SCEV *X = SE->getSCEV(ICmp->getOperand(!Swapped)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // If the condition is always true or always false, replace it with + // a constant value. + if (SE->isKnownPredicate(Pred, S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); + else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); + else + continue; + + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + DeadInsts.push_back(ICmp); + } + + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. 
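As a hypothetical illustration of what EliminateIVComparisons above achieves: with a constant trip count, ScalarEvolution's range for the induction variable covers [0, 50], so the inner comparison is known true, is replaced by a constant, and the now-dead branch folds away later.

// Hypothetical source-level before/after for the comparison elimination.
void before(int *A) {
  for (int i = 0; i != 50; ++i)
    if (i < 100)             // provably true for every value i takes here
      A[i] = 0;
}
void after(int *A) {
  for (int i = 0; i != 50; ++i)
    A[i] = 0;                // icmp replaced with 'true', branch removed
}

Note also why the victims go into a SmallVector<WeakVH> rather than being erased on the spot: deleting during the IVUsers walk would invalidate the iteration, and a WeakVH nulls itself out if its instruction is destroyed while an earlier entry is being recursively deleted, which is exactly what the dyn_cast_or_null in the cleanup loop below tolerates.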
+ while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); +} + +void IndVarSimplify::EliminateIVRemainders() { + SmallVector<WeakVH, 16> DeadInsts; + + // Look for SRem and URem users. + for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + IVStrideUse &UI = *I; + BinaryOperator *Rem = dyn_cast<BinaryOperator>(UI.getUser()); + if (!Rem) continue; + + bool isSigned = Rem->getOpcode() == Instruction::SRem; + if (!isSigned && Rem->getOpcode() != Instruction::URem) + continue; + + // We're only interested in the case where we know something about + // the numerator. + if (UI.getOperandValToReplace() != Rem->getOperand(0)) + continue; + + // Get the SCEVs for the ICmp operands. + const SCEV *S = SE->getSCEV(Rem->getOperand(0)); + const SCEV *X = SE->getSCEV(Rem->getOperand(1)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // i % n --> i if i is in [0,n). + if ((!isSigned || SE->isKnownNonNegative(S)) && + SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + S, X)) + Rem->replaceAllUsesWith(Rem->getOperand(0)); + else { + // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). + const SCEV *LessOne = + SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + if ((!isSigned || SE->isKnownNonNegative(LessOne)) && + SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + LessOne, X)) { + ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, + Rem->getOperand(0), Rem->getOperand(1), + "tmp"); + SelectInst *Sel = + SelectInst::Create(ICmp, + ConstantInt::get(Rem->getType(), 0), + Rem->getOperand(0), "tmp", Rem); + Rem->replaceAllUsesWith(Sel); + } else + continue; + } + + // Inform IVUsers about the new users. + if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) + IU->AddUsersIfInteresting(I); + + DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); + DeadInsts.push_back(Rem); + } + + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); +} + bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); @@ -362,6 +492,12 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) RewriteLoopExitValues(L, Rewriter); + // Simplify ICmp IV users. + EliminateIVComparisons(); + + // Simplify SRem and URem IV users. + EliminateIVRemainders(); + // Compute the type of the largest recurrence expression, and decide whether // a canonical induction variable should be inserted. const Type *LargestType = 0; @@ -454,6 +590,46 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } +// FIXME: It is an extremely bad idea to indvar substitute anything more +// complex than affine induction variables. Doing so will put expensive +// polynomial evaluations inside of the loop, and the str reduction pass +// currently can only reduce affine polynomials. For now just disable +// indvar subst on anything more complex than an affine addrec, unless +// it can be expanded to a trivial value. 
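Stepping back to EliminateIVRemainders above, here is a hypothetical source-level picture of the "i % n --> i if i is in [0,n)" rewrite. Whether ScalarEvolution can actually prove the needed range for a particular loop depends on what it knows about n, so this only illustrates the intent.

// Hypothetical before/after for the remainder elimination.
void before(unsigned *A, unsigned n) {
  for (unsigned i = 0; i < n; ++i)
    A[i % n] = i;            // numerator is an IV already known to be < n
}
void after(unsigned *A, unsigned n) {
  for (unsigned i = 0; i < n; ++i)
    A[i] = i;                // the urem disappears entirely
}

The second case handled in the code, (i+1) % n with i in [0,n), becomes an equality test plus a select rather than a division, which is still far cheaper than a urem on most targets.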
+static bool isSafe(const SCEV *S, const Loop *L) { + // Loop-invariant values are safe. + if (S->isLoopInvariant(L)) return true; + + // Affine addrecs are safe. Non-affine are not, because LSR doesn't know how + // to transform them into efficient code. + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) + return AR->isAffine(); + + // An add is safe it all its operands are safe. + if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) { + for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), + E = Commutative->op_end(); I != E; ++I) + if (!isSafe(*I, L)) return false; + return true; + } + + // A cast is safe if its operand is. + if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) + return isSafe(C->getOperand(), L); + + // A udiv is safe if its operands are. + if (const SCEVUDivExpr *UD = dyn_cast<SCEVUDivExpr>(S)) + return isSafe(UD->getLHS(), L) && + isSafe(UD->getRHS(), L); + + // SCEVUnknown is always safe. + if (isa<SCEVUnknown>(S)) + return true; + + // Nothing else is safe. + return false; +} + void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { SmallVector<WeakVH, 16> DeadInsts; @@ -465,7 +641,6 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // the need for the code evaluation methods to insert induction variables // of different sizes. for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { - const SCEV *Stride = UI->getStride(); Value *Op = UI->getOperandValToReplace(); const Type *UseTy = Op->getType(); Instruction *User = UI->getUser(); @@ -486,7 +661,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // currently can only reduce affine polynomials. For now just disable // indvar subst on anything more complex than an affine addrec, unless // it can be expanded to a trivial value. - if (!AR->isLoopInvariant(L) && !Stride->isLoopInvariant(L)) + if (!isSafe(AR, L)) continue; // Determine the insertion point for this user. By default, insert diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index a6489ecc2dcf..df05b712dcbf 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -670,8 +670,10 @@ bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB, Value *OldCond = DestBI->getCondition(); DestBI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()), BranchDir)); - ConstantFoldTerminator(BB); + // Delete dead instructions before we fold the branch. Folding the branch + // can eliminate edges from the CFG which can end up deleting OldCond. RecursivelyDeleteTriviallyDeadInstructions(OldCond); + ConstantFoldTerminator(BB); return true; } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index d7ace342fcba..73473952912e 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -683,16 +683,18 @@ void LICM::PromoteValuesInLoop() { // to LI as we are loading or storing. Since we know that the value is // stored in this loop, this will always succeed. 
for (Value::use_iterator UI = Ptr->use_begin(), E = Ptr->use_end(); - UI != E; ++UI) - if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) { + UI != E; ++UI) { + User *U = *UI; + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { LoadValue = LI; break; - } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(1) == Ptr) { LoadValue = SI->getOperand(0); break; } } + } assert(LoadValue && "No store through the pointer found!"); PointerValueNumbers.push_back(LoadValue); // Remember this for later. } diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp index 16d3f2f703ff..101ff5b20019 100644 --- a/lib/Transforms/Scalar/LoopIndexSplit.cpp +++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp @@ -948,6 +948,25 @@ bool LoopIndexSplit::splitLoop() { if (!IVBasedValues.count(SplitCondition->getOperand(!SVOpNum))) return false; + // Check for side effects. + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) { + BasicBlock *BB = *I; + + assert(DT->dominates(Header, BB)); + if (DT->properlyDominates(SplitCondition->getParent(), BB)) + continue; + + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); + BI != BE; ++BI) { + Instruction *Inst = BI; + + if (!Inst->isSafeToSpeculativelyExecute() && !isa<PHINode>(Inst) + && !isa<BranchInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) + return false; + } + } + // Normalize loop conditions so that it is easier to calculate new loop // bounds. if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) { diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 625a75d6cca9..cf3d16f05dbb 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -221,7 +221,7 @@ static void DoInitialMatch(const SCEV *S, Loop *L, if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) if (!AR->getStart()->isZero()) { DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); - DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), + DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), AR->getLoop()), L, Good, Bad, SE, DT); @@ -262,11 +262,15 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, SmallVector<const SCEV *, 4> Bad; DoInitialMatch(S, L, Good, Bad, SE, DT); if (!Good.empty()) { - BaseRegs.push_back(SE.getAddExpr(Good)); + const SCEV *Sum = SE.getAddExpr(Good); + if (!Sum->isZero()) + BaseRegs.push_back(Sum); AM.HasBaseReg = true; } if (!Bad.empty()) { - BaseRegs.push_back(SE.getAddExpr(Bad)); + const SCEV *Sum = SE.getAddExpr(Bad); + if (!Sum->isZero()) + BaseRegs.push_back(Sum); AM.HasBaseReg = true; } } @@ -375,7 +379,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, bool IgnoreSignificantBits = false) { // Handle the trivial case, which works for any SCEV type. if (LHS == RHS) - return SE.getIntegerSCEV(1, LHS->getType()); + return SE.getConstant(LHS->getType(), 1); // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some // folding. 
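For illustration only (not part of this patch), here is a plain-integer
analogue of getExactSDiv: a hypothetical helper that returns the quotient only
when the division is exact and otherwise declines, mirroring how the SCEV
version gives up rather than introduce rounding.

#include <cstdint>
#include <optional>

// Return LHS/RHS only when the division is exact; otherwise return nothing.
static std::optional<int64_t> exactSDiv(int64_t LHS, int64_t RHS) {
  if (RHS == 0) return std::nullopt;                        // no sensible answer
  if (LHS == RHS) return 1;                                 // trivial case
  if (LHS == INT64_MIN && RHS == -1) return std::nullopt;   // quotient overflows
  if (LHS % RHS != 0) return std::nullopt;                  // not exact; give up
  return LHS / RHS;
}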
@@ -450,7 +454,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { if (C->getValue()->getValue().getMinSignedBits() <= 64) { - S = SE.getIntegerSCEV(0, C->getType()); + S = SE.getConstant(C->getType(), 0); return C->getValue()->getSExtValue(); } } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -473,7 +477,7 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { - S = SE.getIntegerSCEV(0, GV->getType()); + S = SE.getConstant(GV->getType(), 0); return GV; } } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -781,10 +785,10 @@ struct LSRFixup { /// will be replaced. Value *OperandValToReplace; - /// PostIncLoop - If this user is to use the post-incremented value of an + /// PostIncLoops - If this user is to use the post-incremented value of an /// induction variable, this variable is non-null and holds the loop /// associated with the induction variable. - const Loop *PostIncLoop; + PostIncLoopSet PostIncLoops; /// LUIdx - The index of the LSRUse describing the expression which /// this fixup needs, minus an offset (below). @@ -795,6 +799,8 @@ struct LSRFixup { /// offsets, for example in an unrolled loop. int64_t Offset; + bool isUseFullyOutsideLoop(const Loop *L) const; + LSRFixup(); void print(raw_ostream &OS) const; @@ -804,9 +810,24 @@ struct LSRFixup { } LSRFixup::LSRFixup() - : UserInst(0), OperandValToReplace(0), PostIncLoop(0), + : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} +/// isUseFullyOutsideLoop - Test whether this fixup always uses its +/// value outside of the given loop. +bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { + // PHI nodes use their value in their incoming blocks. + if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == OperandValToReplace && + L->contains(PN->getIncomingBlock(i))) + return false; + return true; + } + + return !L->contains(UserInst); +} + void LSRFixup::print(raw_ostream &OS) const { OS << "UserInst="; // Store is common and interesting enough to be worth special-casing. 
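To illustrate the PHI special case in isUseFullyOutsideLoop above (a toy model,
not part of this patch): a PHI node in an exit block is itself outside the
loop, but it consumes each incoming value at the end of the corresponding
predecessor block, so an incoming edge that leaves from inside the loop still
counts as an in-loop use. The real code asks Loop::contains() about each
incoming block; the standalone sketch below models just that decision.

#include <vector>

struct ToyUse {
  bool IsPhi;
  bool UserBlockInLoop;                   // block of a non-PHI user
  std::vector<bool> IncomingBlockInLoop;  // for a PHI: one entry per incoming
                                          // edge that carries the value
};

static bool isUseFullyOutsideLoop(const ToyUse &U) {
  if (U.IsPhi) {
    for (bool InLoop : U.IncomingBlockInLoop)
      if (InLoop)
        return false;                     // consumed on an edge inside the loop
    return true;
  }
  return !U.UserBlockInLoop;
}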
@@ -821,9 +842,10 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", OperandValToReplace="; WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); - if (PostIncLoop) { + for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), + E = PostIncLoops.end(); I != E; ++I) { OS << ", PostIncLoop="; - WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false); + WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false); } if (LUIdx != ~size_t(0)) @@ -1135,6 +1157,7 @@ class LSRInstance { IVUsers &IU; ScalarEvolution &SE; DominatorTree &DT; + LoopInfo &LI; const TargetLowering *const TLI; Loop *const L; bool Changed; @@ -1214,6 +1237,13 @@ public: DenseSet<const SCEV *> &VisitedRegs) const; void Solve(SmallVectorImpl<const Formula *> &Solution) const; + BasicBlock::iterator + HoistInsertPosition(BasicBlock::iterator IP, + const SmallVectorImpl<Instruction *> &Inputs) const; + BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP, + const LSRFixup &LF, + const LSRUse &LU) const; + Value *Expand(const LSRFixup &LF, const Formula &F, BasicBlock::iterator IP, @@ -1427,16 +1457,30 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) return Cond; - const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType()); + const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); // Add one to the backedge-taken count to get the trip count. const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One); - - // Check for a max calculation that matches the pattern. - if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount)) + if (IterationCount != SE.getSCEV(Sel)) return Cond; + + // Check for a max calculation that matches the pattern. There's no check + // for ICMP_ULE here because the comparison would be with zero, which + // isn't interesting. + CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + const SCEVNAryExpr *Max = 0; + if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { + Pred = ICmpInst::ICMP_SLE; + Max = S; + } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { + Pred = ICmpInst::ICMP_SLT; + Max = S; + } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { + Pred = ICmpInst::ICMP_ULT; + Max = U; + } else { + // No match; bail. return Cond; - const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount); - if (Max != SE.getSCEV(Sel)) return Cond; + } // To handle a max with more than two operands, this optimization would // require additional checking and setup. @@ -1445,7 +1489,13 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { const SCEV *MaxLHS = Max->getOperand(0); const SCEV *MaxRHS = Max->getOperand(1); - if (!MaxLHS || MaxLHS != One) return Cond; + + // ScalarEvolution canonicalizes constants to the left. For < and >, look + // for a comparison with 1. For <= and >=, a comparison with zero. + if (!MaxLHS || + (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One))) + return Cond; + // Check the relevant induction variable for conformance to // the pattern. const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); @@ -1461,16 +1511,29 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Check the right operand of the select, and remember it, as it will // be used in the new comparison instruction. 
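// Illustrative aside (not part of this patch): the max typically comes from a
// loop that is known to execute at least once, e.g.
//
//   i = 0; do { p[i] = 0.0; } while (++i < n);
//
// for which scalar evolution reports a trip count of max(n, 1). OptimizeMax
// recognizes the max and rewrites the exit test back into a simple compare
// against n so the max/select computation can die. For the newly handled
// <= / >= forms the interesting select arm is offset by one, which is why the
// code below looks for an "n+1" add and grabs its "n" operand.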
Value *NewRHS = 0; - if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) + if (ICmpInst::isTrueWhenEqual(Pred)) { + // Look for n+1, and grab n. + if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) + if (isa<ConstantInt>(BO->getOperand(1)) && + cast<ConstantInt>(BO->getOperand(1))->isOne() && + SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); + if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) + if (isa<ConstantInt>(BO->getOperand(1)) && + cast<ConstantInt>(BO->getOperand(1))->isOne() && + SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); + if (!NewRHS) + return Cond; + } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) NewRHS = Sel->getOperand(1); else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) NewRHS = Sel->getOperand(2); - if (!NewRHS) return Cond; + else + llvm_unreachable("Max doesn't match expected pattern!"); // Determine the new comparison opcode. It may be signed or unsigned, // and the original comparison may be either equality or inequality. - CmpInst::Predicate Pred = - isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; if (Cond->getPredicate() == CmpInst::ICMP_EQ) Pred = CmpInst::getInversePredicate(Pred); @@ -1545,8 +1608,9 @@ LSRInstance::OptimizeLoopTermCond() { !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) { // Conservatively assume there may be reuse if the quotient of their // strides could be a legal scale. - const SCEV *A = CondUse->getStride(); - const SCEV *B = UI->getStride(); + const SCEV *A = IU.getStride(*CondUse, L); + const SCEV *B = IU.getStride(*UI, L); + if (!A || !B) continue; if (SE.getTypeSizeInBits(A->getType()) != SE.getTypeSizeInBits(B->getType())) { if (SE.getTypeSizeInBits(A->getType()) > @@ -1598,8 +1662,7 @@ LSRInstance::OptimizeLoopTermCond() { ExitingBlock->getInstList().insert(TermBr, Cond); // Clone the IVUse, as the old use still exists! - CondUse = &IU.AddUser(CondUse->getStride(), CondUse->getOffset(), - Cond, CondUse->getOperandValToReplace()); + CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); TermBr->replaceUsesOfWith(OldCond, Cond); } } @@ -1607,9 +1670,7 @@ LSRInstance::OptimizeLoopTermCond() { // If we get to here, we know that we can transform the setcc instruction to // use the post-incremented version of the IV, allowing us to coalesce the // live ranges for the IV correctly. - CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), - CondUse->getStride())); - CondUse->setIsUseOfPostIncrementedValue(true); + CondUse->transformToPostInc(L); Changed = true; PostIncs.insert(Cond); @@ -1717,19 +1778,24 @@ void LSRInstance::CollectInterestingTypesAndFactors() { SmallSetVector<const SCEV *, 4> Strides; // Collect interesting types and strides. + SmallVector<const SCEV *, 4> Worklist; for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { - const SCEV *Stride = UI->getStride(); + const SCEV *Expr = IU.getExpr(*UI); // Collect interesting types. - Types.insert(SE.getEffectiveSCEVType(Stride->getType())); - - // Add the stride for this loop. - Strides.insert(Stride); - - // Add strides for other mentioned loops. - for (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(UI->getOffset()); - AR; AR = dyn_cast<SCEVAddRecExpr>(AR->getStart())) - Strides.insert(AR->getStepRecurrence(SE)); + Types.insert(SE.getEffectiveSCEVType(Expr->getType())); + + // Add strides for mentioned loops. 
+ Worklist.push_back(Expr); + do { + const SCEV *S = Worklist.pop_back_val(); + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + Strides.insert(AR->getStepRecurrence(SE)); + Worklist.push_back(AR->getStart()); + } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end()); + } + } while (!Worklist.empty()); } // Compute interesting factors from the set of interesting strides. @@ -1776,8 +1842,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = UI->getUser(); LF.OperandValToReplace = UI->getOperandValToReplace(); - if (UI->isUseOfPostIncrementedValue()) - LF.PostIncLoop = L; + LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; const Type *AccessTy = 0; @@ -1786,7 +1851,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { AccessTy = getAccessType(LF.UserInst); } - const SCEV *S = IU.getCanonicalExpr(*UI); + const SCEV *S = IU.getExpr(*UI); // Equality (== and !=) ICmps are special. We can rewrite (i == N) as // (N - i == 0), and this allows (N - i) to be the expression that we work @@ -1824,7 +1889,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; - LU.AllFixupsOutsideLoop &= !L->contains(LF.UserInst); + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); // If this is the first use of this LSRUse, give it a formula. if (LU.Formulae.empty()) { @@ -1918,9 +1983,17 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { continue; // Ignore uses which are part of other SCEV expressions, to avoid // analyzing them multiple times. - if (SE.isSCEVable(UserInst->getType()) && - !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst)))) - continue; + if (SE.isSCEVable(UserInst->getType())) { + const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); + // If the user is a no-op, look through to its uses. + if (!isa<SCEVUnknown>(UserS)) + continue; + if (UserS == U) { + Worklist.push_back( + SE.getUnknown(const_cast<Instruction *>(UserInst))); + continue; + } + } // Ignore icmp instructions which are already being analyzed. if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { unsigned OtherIdx = !UI.getOperandNo(); @@ -1936,7 +2009,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; - LU.AllFixupsOutsideLoop &= L->contains(LF.UserInst); + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); InsertSupplementalFormula(U, LU, LF.LUIdx); CountRegisters(LU.Formulae.back(), Uses.size() - 1); break; @@ -1959,7 +2032,7 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. 
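// Illustrative aside (not part of this patch): the split below relies on the
// identity {Start,+,Step} == Start + {0,+,Step}. Pulling the loop-invariant
// Start out lets it fold with other constant or symbolic offsets, while the
// zero-based recurrence remains a clean register that many formulae can share.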
if (!AR->getStart()->isZero()) { - CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), + CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), AR->getLoop()), C, Ops, SE); CollectSubexprs(AR->getStart(), C, Ops, SE); @@ -2020,8 +2093,11 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, LU.Kind, LU.AccessTy, TLI, SE)) continue; + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; Formula F = Base; - F.BaseRegs[i] = SE.getAddExpr(InnerAddOps); + F.BaseRegs[i] = InnerSum; F.BaseRegs.push_back(*J); if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like @@ -2102,7 +2178,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, LU.AccessTy, TLI)) { - F.BaseRegs[i] = SE.getAddExpr(G, SE.getIntegerSCEV(*I, G->getType())); + F.BaseRegs[i] = SE.getAddExpr(G, SE.getConstant(G->getType(), *I)); (void)InsertFormula(LU, LUIdx, F); } @@ -2165,7 +2241,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Compensate for the use having MinOffset built into it. F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset; - const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy); + const SCEV *FactorS = SE.getConstant(IntTy, Factor); // Check that multiplying with each base register doesn't overflow. for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { @@ -2227,7 +2303,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { - const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy); + const SCEV *FactorS = SE.getConstant(IntTy, Factor); if (FactorS->isZero()) continue; // Divide out the factor, ignoring high bits, since we'll be @@ -2426,7 +2502,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (C->getValue()->getValue().isNegative() != (NewF.AM.BaseOffs < 0) && (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) - .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs())) + .ule(abs64(NewF.AM.BaseOffs))) continue; // OK, looks good. @@ -2454,7 +2530,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (C->getValue()->getValue().isNegative() != (NewF.AM.BaseOffs < 0) && C->getValue()->getValue().abs() - .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs())) + .ule(abs64(NewF.AM.BaseOffs))) goto skip_formula; // Ok, looks good. @@ -2776,37 +2852,33 @@ static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) { return Node->getBlock(); } -Value *LSRInstance::Expand(const LSRFixup &LF, - const Formula &F, - BasicBlock::iterator IP, - SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const { - const LSRUse &LU = Uses[LF.LUIdx]; - - // Then, collect some instructions which we will remain dominated by when - // expanding the replacement. These must be dominated by any operands that - // will be required in the expansion. 
- SmallVector<Instruction *, 4> Inputs; - if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) - Inputs.push_back(I); - if (LU.Kind == LSRUse::ICmpZero) - if (Instruction *I = - dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) - Inputs.push_back(I); - if (LF.PostIncLoop) { - if (!L->contains(LF.UserInst)) - Inputs.push_back(L->getLoopLatch()->getTerminator()); - else - Inputs.push_back(IVIncInsertPos); - } - - // Then, climb up the immediate dominator tree as far as we can go while - // still being dominated by the input positions. +/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up +/// the dominator tree far as we can go while still being dominated by the +/// input positions. This helps canonicalize the insert position, which +/// encourages sharing. +BasicBlock::iterator +LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, + const SmallVectorImpl<Instruction *> &Inputs) + const { for (;;) { + const Loop *IPLoop = LI.getLoopFor(IP->getParent()); + unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; + + BasicBlock *IDom; + for (BasicBlock *Rung = IP->getParent(); ; Rung = IDom) { + IDom = getImmediateDominator(Rung, DT); + if (!IDom) return IP; + + // Don't climb into a loop though. + const Loop *IDomLoop = LI.getLoopFor(IDom); + unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; + if (IDomDepth <= IPLoopDepth && + (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) + break; + } + bool AllDominate = true; Instruction *BetterPos = 0; - BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT); - if (!IDom) break; Instruction *Tentative = IDom->getTerminator(); for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), E = Inputs.end(); I != E; ++I) { @@ -2815,6 +2887,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, AllDominate = false; break; } + // Attempt to find an insert position in the middle of the block, + // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || DT.dominates(BetterPos, Inst))) BetterPos = next(BasicBlock::iterator(Inst)); @@ -2826,12 +2900,77 @@ Value *LSRInstance::Expand(const LSRFixup &LF, else IP = Tentative; } + + return IP; +} + +/// AdjustInsertPositionForExpand - Determine an input position which will be +/// dominated by the operands and which will dominate the result. +BasicBlock::iterator +LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, + const LSRFixup &LF, + const LSRUse &LU) const { + // Collect some instructions which must be dominated by the + // expanding replacement. These must be dominated by any operands that + // will be required in the expansion. + SmallVector<Instruction *, 4> Inputs; + if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) + Inputs.push_back(I); + if (LU.Kind == LSRUse::ICmpZero) + if (Instruction *I = + dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) + Inputs.push_back(I); + if (LF.PostIncLoops.count(L)) { + if (LF.isUseFullyOutsideLoop(L)) + Inputs.push_back(L->getLoopLatch()->getTerminator()); + else + Inputs.push_back(IVIncInsertPos); + } + // The expansion must also be dominated by the increment positions of any + // loops it for which it is using post-inc mode. + for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(), + E = LF.PostIncLoops.end(); I != E; ++I) { + const Loop *PIL = *I; + if (PIL == L) continue; + + // Be dominated by the loop exit. 
+ SmallVector<BasicBlock *, 4> ExitingBlocks; + PIL->getExitingBlocks(ExitingBlocks); + if (!ExitingBlocks.empty()) { + BasicBlock *BB = ExitingBlocks[0]; + for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i) + BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]); + Inputs.push_back(BB->getTerminator()); + } + } + + // Then, climb up the immediate dominator tree as far as we can go while + // still being dominated by the input positions. + IP = HoistInsertPosition(IP, Inputs); + + // Don't insert instructions before PHI nodes. while (isa<PHINode>(IP)) ++IP; + + // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; + return IP; +} + +Value *LSRInstance::Expand(const LSRFixup &LF, + const Formula &F, + BasicBlock::iterator IP, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts) const { + const LSRUse &LU = Uses[LF.LUIdx]; + + // Determine an input position which will be dominated by the operands and + // which will dominate the result. + IP = AdjustInsertPositionForExpand(IP, LF, LU); + // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. - Rewriter.setPostInc(LF.PostIncLoop); + Rewriter.setPostInc(LF.PostIncLoops); // This is the type that the user actually needs. const Type *OpTy = LF.OperandValToReplace->getType(); @@ -2855,24 +2994,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *Reg = *I; assert(!Reg->isZero() && "Zero allocated in a base register!"); - // If we're expanding for a post-inc user for the add-rec's loop, make the - // post-inc adjustment. - const SCEV *Start = Reg; - while (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Start)) { - if (AR->getLoop() == LF.PostIncLoop) { - Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE)); - // If the user is inside the loop, insert the code after the increment - // so that it is dominated by its operand. If the original insert point - // was already dominated by the increment, keep it, because there may - // be loop-variant operands that need to be respected also. - if (L->contains(LF.UserInst) && !DT.dominates(IVIncInsertPos, IP)) { - IP = IVIncInsertPos; - while (isa<DbgInfoIntrinsic>(IP)) ++IP; - } - break; - } - Start = AR->getStart(); - } + // If we're expanding for a post-inc user, make the post-inc adjustment. + PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + Reg = TransformForPostIncUse(Denormalize, Reg, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); } @@ -2889,11 +3015,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.AM.Scale != 0) { const SCEV *ScaledS = F.ScaledReg; - // If we're expanding for a post-inc user for the add-rec's loop, make the - // post-inc adjustment. - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS)) - if (AR->getLoop() == LF.PostIncLoop) - ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE)); + // If we're expanding for a post-inc user, make the post-inc adjustment. + PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + ScaledS = TransformForPostIncUse(Denormalize, ScaledS, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); if (LU.Kind == LSRUse::ICmpZero) { // An interesting way of "folding" with an icmp is to use a negated @@ -2907,8 +3033,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // which is expected to be matched as part of the address. 
ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); ScaledS = SE.getMulExpr(ScaledS, - SE.getIntegerSCEV(F.AM.Scale, - ScaledS->getType())); + SE.getConstant(ScaledS->getType(), F.AM.Scale)); Ops.push_back(ScaledS); // Flush the operand list to suppress SCEVExpander hoisting. @@ -2949,12 +3074,12 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Emit instructions summing all the operands. const SCEV *FullS = Ops.empty() ? - SE.getIntegerSCEV(0, IntTy) : + SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); // We're done expanding now, so reset the rewriter. - Rewriter.setPostInc(0); + Rewriter.clearPostInc(); // An ICmpZero Formula represents an ICmp which we're handling as a // comparison against zero. Now that we've expanded an expression for that @@ -3118,6 +3243,7 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), DT(P->getAnalysis<DominatorTree>()), + LI(P->getAnalysis<LoopInfo>()), TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { // If LoopSimplify form is not available, stay out of trouble. @@ -3274,9 +3400,10 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // We split critical edges, so we change the CFG. However, we do update // many analyses if they are around. AU.addPreservedID(LoopSimplifyID); - AU.addPreserved<LoopInfo>(); AU.addPreserved("domfrontier"); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTree>(); AU.addPreserved<DominatorTree>(); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 3918738cc8be..ae7bf40e0e1a 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -34,6 +34,7 @@ #include "llvm/Instructions.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/Dominators.h" @@ -677,15 +678,22 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, LoopProcessWorklist.push_back(NewLoop); redoLoop = true; + // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody + // deletes the instruction (for example by simplifying a PHI that feeds into + // the condition that we're unswitching on), we don't rewrite the second + // iteration. + WeakVH LICHandle(LIC); + // Now we rewrite the original code to know that the condition is true and the // new code to know that the condition is false. RewriteLoopBodyWithConditionConstant(L, LIC, Val, false); - - // It's possible that simplifying one loop could cause the other to be - // deleted. If so, don't simplify it. - if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop) - RewriteLoopBodyWithConditionConstant(NewLoop, LIC, Val, true); + // It's possible that simplifying one loop could cause the other to be + // changed to another value or a constant. If its a constant, don't simplify + // it. 
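// Illustrative aside (not part of this patch): WeakVH is a value handle that
// follows replaceAllUsesWith and becomes null when the instruction it tracks
// is deleted, so testing LICHandle below is how the second rewrite notices
// that the first RewriteLoopBodyWithConditionConstant call already destroyed
// or replaced LIC.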
+ if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop && + LICHandle && !isa<Constant>(LICHandle)) + RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); } /// RemoveFromWorklist - Remove all instances of I from the worklist vector @@ -981,45 +989,16 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { continue; } + // See if instruction simplification can hack this up. This is common for + // things like "select false, X, Y" after unswitching made the condition be + // 'false'. + if (Value *V = SimplifyInstruction(I)) { + ReplaceUsesOfWith(I, V, Worklist, L, LPM); + continue; + } + // Special case hacks that appear commonly in unswitched code. - switch (I->getOpcode()) { - case Instruction::Select: - if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(0))) { - ReplaceUsesOfWith(I, I->getOperand(!CB->getZExtValue()+1), Worklist, L, - LPM); - continue; - } - break; - case Instruction::And: - if (isa<ConstantInt>(I->getOperand(0)) && - // constant -> RHS - I->getOperand(0)->getType()->isIntegerTy(1)) - cast<BinaryOperator>(I)->swapOperands(); - if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1))) - if (CB->getType()->isIntegerTy(1)) { - if (CB->isOne()) // X & 1 -> X - ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM); - else // X & 0 -> 0 - ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM); - continue; - } - break; - case Instruction::Or: - if (isa<ConstantInt>(I->getOperand(0)) && - // constant -> RHS - I->getOperand(0)->getType()->isIntegerTy(1)) - cast<BinaryOperator>(I)->swapOperands(); - if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1))) - if (CB->getType()->isIntegerTy(1)) { - if (CB->isOne()) // X | 1 -> 1 - ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM); - else // X | 0 -> X - ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM); - continue; - } - break; - case Instruction::Br: { - BranchInst *BI = cast<BranchInst>(I); + if (BranchInst *BI = dyn_cast<BranchInst>(I)) { if (BI->isUnconditional()) { // If BI's parent is the only pred of the successor, fold the two blocks // together. @@ -1052,13 +1031,13 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { LPM->deleteSimpleAnalysisValue(Succ, L); Succ->eraseFromParent(); ++NumSimplify; - break; + continue; } if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ // Conditional branch. Turn it into an unconditional branch, then // remove dead blocks. - break; // FIXME: Enable. + continue; // FIXME: Enable. DEBUG(dbgs() << "Folded branch: " << *BI); BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); @@ -1072,8 +1051,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { RemoveBlockIfDead(DeadSucc, Worklist, L); } - break; - } + continue; } } } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3b305ae766e8..3611b8ebe562 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -744,7 +744,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { const Type *ArgTys[3] = { M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType() }; - M->setOperand(0,Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, ArgTys, 3)); + M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, ArgTys, 3)); // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. 
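For illustration (not part of this patch), the source-level effect of the
transform that processMemMove (touched above) performs: when the optimizer can
show that the source and destination of a memmove cannot overlap, the call may
be retargeted to the memcpy intrinsic, whose stronger no-overlap guarantee is
easier for later passes and code generation to exploit.

#include <string.h>

// If dst and src are known not to overlap, this memmove ...
void copyRow(double *dst, const double *src, size_t n) {
  memmove(dst, src, n * sizeof *src);
  // ... is equivalent to, and may be replaced by:
  //   memcpy(dst, src, n * sizeof *src);
}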
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 7a6eec3d46b4..13222ac22004 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -46,10 +46,11 @@ namespace { bool valueEscapes(const Instruction *Inst) const { const BasicBlock *BB = Inst->getParent(); for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end(); - UI != E; ++UI) - if (cast<Instruction>(*UI)->getParent() != BB || - isa<PHINode>(*UI)) + UI != E; ++UI) { + const Instruction *I = cast<Instruction>(*UI); + if (I->getParent() != BB || isa<PHINode>(I)) return true; + } return false; } diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 4f09bee53a81..907ece8fcce9 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -317,7 +317,10 @@ private: void markConstant(LatticeVal &IV, Value *V, Constant *C) { if (!IV.markConstant(C)) return; DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); - InstWorkList.push_back(V); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); } void markConstant(Value *V, Constant *C) { @@ -327,9 +330,13 @@ private: void markForcedConstant(Value *V, Constant *C) { assert(!V->getType()->isStructTy() && "Should use other method"); - ValueState[V].markForcedConstant(C); + LatticeVal &IV = ValueState[V]; + IV.markForcedConstant(C); DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n'); - InstWorkList.push_back(V); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); } @@ -1445,6 +1452,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // After a zero extend, we know the top part is zero. SExt doesn't have // to be handled here, because we don't know whether the top part is 1's // or 0's. + case Instruction::SIToFP: // some FP values are not possible, just use 0. + case Instruction::UIToFP: // some FP values are not possible, just use 0. markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: diff --git a/lib/Transforms/Scalar/SCCVN.cpp b/lib/Transforms/Scalar/SCCVN.cpp deleted file mode 100644 index 9685a2945f8c..000000000000 --- a/lib/Transforms/Scalar/SCCVN.cpp +++ /dev/null @@ -1,716 +0,0 @@ -//===- SCCVN.cpp - Eliminate redundant values -----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass performs global value numbering to eliminate fully redundant -// instructions. This is based on the paper "SCC-based Value Numbering" -// by Cooper, et al. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "sccvn" -#include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Operator.h" -#include "llvm/Value.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SparseBitVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -using namespace llvm; - -STATISTIC(NumSCCVNInstr, "Number of instructions deleted by SCCVN"); -STATISTIC(NumSCCVNPhi, "Number of phis deleted by SCCVN"); - -//===----------------------------------------------------------------------===// -// ValueTable Class -//===----------------------------------------------------------------------===// - -/// This class holds the mapping between values and value numbers. It is used -/// as an efficient mechanism to determine the expression-wise equivalence of -/// two values. -namespace { - struct Expression { - enum ExpressionOpcode { ADD, FADD, SUB, FSUB, MUL, FMUL, - UDIV, SDIV, FDIV, UREM, SREM, - FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ, - ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, - ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, - FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE, - FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE, - FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT, - SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI, - FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT, - PTRTOINT, INTTOPTR, BITCAST, GEP, CALL, CONSTANT, - INSERTVALUE, EXTRACTVALUE, EMPTY, TOMBSTONE }; - - ExpressionOpcode opcode; - const Type* type; - SmallVector<uint32_t, 4> varargs; - - Expression() { } - Expression(ExpressionOpcode o) : opcode(o) { } - - bool operator==(const Expression &other) const { - if (opcode != other.opcode) - return false; - else if (opcode == EMPTY || opcode == TOMBSTONE) - return true; - else if (type != other.type) - return false; - else { - if (varargs.size() != other.varargs.size()) - return false; - - for (size_t i = 0; i < varargs.size(); ++i) - if (varargs[i] != other.varargs[i]) - return false; - - return true; - } - } - - bool operator!=(const Expression &other) const { - return !(*this == other); - } - }; - - class ValueTable { - private: - DenseMap<Value*, uint32_t> valueNumbering; - DenseMap<Expression, uint32_t> expressionNumbering; - DenseMap<Value*, uint32_t> constantsNumbering; - - uint32_t nextValueNumber; - - Expression::ExpressionOpcode getOpcode(BinaryOperator* BO); - Expression::ExpressionOpcode getOpcode(CmpInst* C); - Expression::ExpressionOpcode getOpcode(CastInst* C); - Expression create_expression(BinaryOperator* BO); - Expression create_expression(CmpInst* C); - Expression create_expression(ShuffleVectorInst* V); - Expression create_expression(ExtractElementInst* C); - Expression create_expression(InsertElementInst* V); - Expression create_expression(SelectInst* V); - Expression create_expression(CastInst* C); - Expression create_expression(GetElementPtrInst* G); - Expression create_expression(CallInst* C); - Expression create_expression(Constant* C); - Expression create_expression(ExtractValueInst* C); - Expression 
create_expression(InsertValueInst* C); - public: - ValueTable() : nextValueNumber(1) { } - uint32_t computeNumber(Value *V); - uint32_t lookup(Value *V); - void add(Value *V, uint32_t num); - void clear(); - void clearExpressions(); - void erase(Value *v); - unsigned size(); - void verifyRemoved(const Value *) const; - }; -} - -namespace llvm { -template <> struct DenseMapInfo<Expression> { - static inline Expression getEmptyKey() { - return Expression(Expression::EMPTY); - } - - static inline Expression getTombstoneKey() { - return Expression(Expression::TOMBSTONE); - } - - static unsigned getHashValue(const Expression e) { - unsigned hash = e.opcode; - - hash = ((unsigned)((uintptr_t)e.type >> 4) ^ - (unsigned)((uintptr_t)e.type >> 9)); - - for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(), - E = e.varargs.end(); I != E; ++I) - hash = *I + hash * 37; - - return hash; - } - static bool isEqual(const Expression &LHS, const Expression &RHS) { - return LHS == RHS; - } -}; -template <> -struct isPodLike<Expression> { static const bool value = true; }; - -} - -//===----------------------------------------------------------------------===// -// ValueTable Internal Functions -//===----------------------------------------------------------------------===// -Expression::ExpressionOpcode ValueTable::getOpcode(BinaryOperator* BO) { - switch(BO->getOpcode()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Binary operator with unknown opcode?"); - case Instruction::Add: return Expression::ADD; - case Instruction::FAdd: return Expression::FADD; - case Instruction::Sub: return Expression::SUB; - case Instruction::FSub: return Expression::FSUB; - case Instruction::Mul: return Expression::MUL; - case Instruction::FMul: return Expression::FMUL; - case Instruction::UDiv: return Expression::UDIV; - case Instruction::SDiv: return Expression::SDIV; - case Instruction::FDiv: return Expression::FDIV; - case Instruction::URem: return Expression::UREM; - case Instruction::SRem: return Expression::SREM; - case Instruction::FRem: return Expression::FREM; - case Instruction::Shl: return Expression::SHL; - case Instruction::LShr: return Expression::LSHR; - case Instruction::AShr: return Expression::ASHR; - case Instruction::And: return Expression::AND; - case Instruction::Or: return Expression::OR; - case Instruction::Xor: return Expression::XOR; - } -} - -Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) { - if (isa<ICmpInst>(C)) { - switch (C->getPredicate()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Comparison with unknown predicate?"); - case ICmpInst::ICMP_EQ: return Expression::ICMPEQ; - case ICmpInst::ICMP_NE: return Expression::ICMPNE; - case ICmpInst::ICMP_UGT: return Expression::ICMPUGT; - case ICmpInst::ICMP_UGE: return Expression::ICMPUGE; - case ICmpInst::ICMP_ULT: return Expression::ICMPULT; - case ICmpInst::ICMP_ULE: return Expression::ICMPULE; - case ICmpInst::ICMP_SGT: return Expression::ICMPSGT; - case ICmpInst::ICMP_SGE: return Expression::ICMPSGE; - case ICmpInst::ICMP_SLT: return Expression::ICMPSLT; - case ICmpInst::ICMP_SLE: return Expression::ICMPSLE; - } - } else { - switch (C->getPredicate()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Comparison with unknown predicate?"); - case FCmpInst::FCMP_OEQ: return Expression::FCMPOEQ; - case FCmpInst::FCMP_OGT: return Expression::FCMPOGT; - case FCmpInst::FCMP_OGE: return Expression::FCMPOGE; - case FCmpInst::FCMP_OLT: return Expression::FCMPOLT; - case 
FCmpInst::FCMP_OLE: return Expression::FCMPOLE; - case FCmpInst::FCMP_ONE: return Expression::FCMPONE; - case FCmpInst::FCMP_ORD: return Expression::FCMPORD; - case FCmpInst::FCMP_UNO: return Expression::FCMPUNO; - case FCmpInst::FCMP_UEQ: return Expression::FCMPUEQ; - case FCmpInst::FCMP_UGT: return Expression::FCMPUGT; - case FCmpInst::FCMP_UGE: return Expression::FCMPUGE; - case FCmpInst::FCMP_ULT: return Expression::FCMPULT; - case FCmpInst::FCMP_ULE: return Expression::FCMPULE; - case FCmpInst::FCMP_UNE: return Expression::FCMPUNE; - } - } -} - -Expression::ExpressionOpcode ValueTable::getOpcode(CastInst* C) { - switch(C->getOpcode()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Cast operator with unknown opcode?"); - case Instruction::Trunc: return Expression::TRUNC; - case Instruction::ZExt: return Expression::ZEXT; - case Instruction::SExt: return Expression::SEXT; - case Instruction::FPToUI: return Expression::FPTOUI; - case Instruction::FPToSI: return Expression::FPTOSI; - case Instruction::UIToFP: return Expression::UITOFP; - case Instruction::SIToFP: return Expression::SITOFP; - case Instruction::FPTrunc: return Expression::FPTRUNC; - case Instruction::FPExt: return Expression::FPEXT; - case Instruction::PtrToInt: return Expression::PTRTOINT; - case Instruction::IntToPtr: return Expression::INTTOPTR; - case Instruction::BitCast: return Expression::BITCAST; - } -} - -Expression ValueTable::create_expression(CallInst* C) { - Expression e; - - e.type = C->getType(); - e.opcode = Expression::CALL; - - e.varargs.push_back(lookup(C->getCalledFunction())); - for (CallInst::op_iterator I = C->op_begin()+1, E = C->op_end(); - I != E; ++I) - e.varargs.push_back(lookup(*I)); - - return e; -} - -Expression ValueTable::create_expression(BinaryOperator* BO) { - Expression e; - e.varargs.push_back(lookup(BO->getOperand(0))); - e.varargs.push_back(lookup(BO->getOperand(1))); - e.type = BO->getType(); - e.opcode = getOpcode(BO); - - return e; -} - -Expression ValueTable::create_expression(CmpInst* C) { - Expression e; - - e.varargs.push_back(lookup(C->getOperand(0))); - e.varargs.push_back(lookup(C->getOperand(1))); - e.type = C->getType(); - e.opcode = getOpcode(C); - - return e; -} - -Expression ValueTable::create_expression(CastInst* C) { - Expression e; - - e.varargs.push_back(lookup(C->getOperand(0))); - e.type = C->getType(); - e.opcode = getOpcode(C); - - return e; -} - -Expression ValueTable::create_expression(ShuffleVectorInst* S) { - Expression e; - - e.varargs.push_back(lookup(S->getOperand(0))); - e.varargs.push_back(lookup(S->getOperand(1))); - e.varargs.push_back(lookup(S->getOperand(2))); - e.type = S->getType(); - e.opcode = Expression::SHUFFLE; - - return e; -} - -Expression ValueTable::create_expression(ExtractElementInst* E) { - Expression e; - - e.varargs.push_back(lookup(E->getOperand(0))); - e.varargs.push_back(lookup(E->getOperand(1))); - e.type = E->getType(); - e.opcode = Expression::EXTRACT; - - return e; -} - -Expression ValueTable::create_expression(InsertElementInst* I) { - Expression e; - - e.varargs.push_back(lookup(I->getOperand(0))); - e.varargs.push_back(lookup(I->getOperand(1))); - e.varargs.push_back(lookup(I->getOperand(2))); - e.type = I->getType(); - e.opcode = Expression::INSERT; - - return e; -} - -Expression ValueTable::create_expression(SelectInst* I) { - Expression e; - - e.varargs.push_back(lookup(I->getCondition())); - e.varargs.push_back(lookup(I->getTrueValue())); - e.varargs.push_back(lookup(I->getFalseValue())); - e.type = 
I->getType(); - e.opcode = Expression::SELECT; - - return e; -} - -Expression ValueTable::create_expression(GetElementPtrInst* G) { - Expression e; - - e.varargs.push_back(lookup(G->getPointerOperand())); - e.type = G->getType(); - e.opcode = Expression::GEP; - - for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end(); - I != E; ++I) - e.varargs.push_back(lookup(*I)); - - return e; -} - -Expression ValueTable::create_expression(ExtractValueInst* E) { - Expression e; - - e.varargs.push_back(lookup(E->getAggregateOperand())); - for (ExtractValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); - II != IE; ++II) - e.varargs.push_back(*II); - e.type = E->getType(); - e.opcode = Expression::EXTRACTVALUE; - - return e; -} - -Expression ValueTable::create_expression(InsertValueInst* E) { - Expression e; - - e.varargs.push_back(lookup(E->getAggregateOperand())); - e.varargs.push_back(lookup(E->getInsertedValueOperand())); - for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); - II != IE; ++II) - e.varargs.push_back(*II); - e.type = E->getType(); - e.opcode = Expression::INSERTVALUE; - - return e; -} - -//===----------------------------------------------------------------------===// -// ValueTable External Functions -//===----------------------------------------------------------------------===// - -/// add - Insert a value into the table with a specified value number. -void ValueTable::add(Value *V, uint32_t num) { - valueNumbering[V] = num; -} - -/// computeNumber - Returns the value number for the specified value, assigning -/// it a new number if it did not have one before. -uint32_t ValueTable::computeNumber(Value *V) { - if (uint32_t v = valueNumbering[V]) - return v; - else if (uint32_t v= constantsNumbering[V]) - return v; - - if (!isa<Instruction>(V)) { - constantsNumbering[V] = nextValueNumber; - return nextValueNumber++; - } - - Instruction* I = cast<Instruction>(V); - Expression exp; - switch (I->getOpcode()) { - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or : - case Instruction::Xor: - exp = create_expression(cast<BinaryOperator>(I)); - break; - case Instruction::ICmp: - case Instruction::FCmp: - exp = create_expression(cast<CmpInst>(I)); - break; - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - exp = create_expression(cast<CastInst>(I)); - break; - case Instruction::Select: - exp = create_expression(cast<SelectInst>(I)); - break; - case Instruction::ExtractElement: - exp = create_expression(cast<ExtractElementInst>(I)); - break; - case Instruction::InsertElement: - exp = create_expression(cast<InsertElementInst>(I)); - break; - case Instruction::ShuffleVector: - exp = create_expression(cast<ShuffleVectorInst>(I)); - break; - case Instruction::ExtractValue: - exp = create_expression(cast<ExtractValueInst>(I)); - break; - case Instruction::InsertValue: - exp = 
create_expression(cast<InsertValueInst>(I)); - break; - case Instruction::GetElementPtr: - exp = create_expression(cast<GetElementPtrInst>(I)); - break; - default: - valueNumbering[V] = nextValueNumber; - return nextValueNumber++; - } - - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - valueNumbering[V] = e; - - return e; -} - -/// lookup - Returns the value number of the specified value. Returns 0 if -/// the value has not yet been numbered. -uint32_t ValueTable::lookup(Value *V) { - if (!isa<Instruction>(V)) { - if (!constantsNumbering.count(V)) - constantsNumbering[V] = nextValueNumber++; - return constantsNumbering[V]; - } - - return valueNumbering[V]; -} - -/// clear - Remove all entries from the ValueTable -void ValueTable::clear() { - valueNumbering.clear(); - expressionNumbering.clear(); - constantsNumbering.clear(); - nextValueNumber = 1; -} - -void ValueTable::clearExpressions() { - expressionNumbering.clear(); - constantsNumbering.clear(); - nextValueNumber = 1; -} - -/// erase - Remove a value from the value numbering -void ValueTable::erase(Value *V) { - valueNumbering.erase(V); -} - -/// verifyRemoved - Verify that the value is removed from all internal data -/// structures. -void ValueTable::verifyRemoved(const Value *V) const { - for (DenseMap<Value*, uint32_t>::const_iterator - I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) { - assert(I->first != V && "Inst still occurs in value numbering map!"); - } -} - -//===----------------------------------------------------------------------===// -// SCCVN Pass -//===----------------------------------------------------------------------===// - -namespace { - - struct ValueNumberScope { - ValueNumberScope* parent; - DenseMap<uint32_t, Value*> table; - SparseBitVector<128> availIn; - SparseBitVector<128> availOut; - - ValueNumberScope(ValueNumberScope* p) : parent(p) { } - }; - - class SCCVN : public FunctionPass { - bool runOnFunction(Function &F); - public: - static char ID; // Pass identification, replacement for typeid - SCCVN() : FunctionPass(&ID) { } - - private: - ValueTable VT; - DenseMap<BasicBlock*, ValueNumberScope*> BBMap; - - // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); - - AU.addPreserved<DominatorTree>(); - AU.setPreservesCFG(); - } - }; - - char SCCVN::ID = 0; -} - -// createSCCVNPass - The public interface to this file... -FunctionPass *llvm::createSCCVNPass() { return new SCCVN(); } - -static RegisterPass<SCCVN> X("sccvn", - "SCC Value Numbering"); - -static Value *lookupNumber(ValueNumberScope *Locals, uint32_t num) { - while (Locals) { - DenseMap<uint32_t, Value*>::iterator I = Locals->table.find(num); - if (I != Locals->table.end()) - return I->second; - Locals = Locals->parent; - } - - return 0; -} - -bool SCCVN::runOnFunction(Function& F) { - // Implement the RPO version of the SCCVN algorithm. Conceptually, - // we optimisitically assume that all instructions with the same opcode have - // the same VN. Then we deepen our comparison by one level, to all - // instructions whose operands have the same opcodes get the same VN. We - // iterate this process until the partitioning stops changing, at which - // point we have computed a full numbering. 
- ReversePostOrderTraversal<Function*> RPOT(&F); - bool done = false; - while (!done) { - done = true; - VT.clearExpressions(); - for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), - E = RPOT.end(); I != E; ++I) { - BasicBlock* BB = *I; - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE; ++BI) { - uint32_t origVN = VT.lookup(BI); - uint32_t newVN = VT.computeNumber(BI); - if (origVN != newVN) - done = false; - } - } - } - - // Now, do a dominator walk, eliminating simple, dominated redundancies as we - // go. Also, build the ValueNumberScope structure that will be used for - // computing full availability. - DominatorTree& DT = getAnalysis<DominatorTree>(); - bool changed = false; - for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()), - DE = df_end(DT.getRootNode()); DI != DE; ++DI) { - BasicBlock* BB = DI->getBlock(); - if (DI->getIDom()) - BBMap[BB] = new ValueNumberScope(BBMap[DI->getIDom()->getBlock()]); - else - BBMap[BB] = new ValueNumberScope(0); - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - uint32_t num = VT.lookup(I); - Value* repl = lookupNumber(BBMap[BB], num); - - if (repl) { - if (isa<PHINode>(I)) - ++NumSCCVNPhi; - else - ++NumSCCVNInstr; - I->replaceAllUsesWith(repl); - Instruction* OldInst = I; - ++I; - BBMap[BB]->table[num] = repl; - OldInst->eraseFromParent(); - changed = true; - } else { - BBMap[BB]->table[num] = I; - BBMap[BB]->availOut.set(num); - - ++I; - } - } - } - - // Perform a forward data-flow to compute availability at all points on - // the CFG. - do { - changed = false; - for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), - E = RPOT.end(); I != E; ++I) { - BasicBlock* BB = *I; - ValueNumberScope *VNS = BBMap[BB]; - - SparseBitVector<128> preds; - bool first = true; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); - PI != PE; ++PI) { - if (first) { - preds = BBMap[*PI]->availOut; - first = false; - } else { - preds &= BBMap[*PI]->availOut; - } - } - - changed |= (VNS->availIn |= preds); - changed |= (VNS->availOut |= preds); - } - } while (changed); - - // Use full availability information to perform non-dominated replacements. 
- SSAUpdater SSU; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - if (!BBMap.count(FI)) continue; - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ) { - uint32_t num = VT.lookup(BI); - if (!BBMap[FI]->availIn.test(num)) { - ++BI; - continue; - } - - SSU.Initialize(BI); - - SmallPtrSet<BasicBlock*, 8> visited; - SmallVector<BasicBlock*, 8> stack; - visited.insert(FI); - for (pred_iterator PI = pred_begin(FI), PE = pred_end(FI); - PI != PE; ++PI) - if (!visited.count(*PI)) - stack.push_back(*PI); - - while (!stack.empty()) { - BasicBlock* CurrBB = stack.pop_back_val(); - visited.insert(CurrBB); - - ValueNumberScope* S = BBMap[CurrBB]; - if (S->table.count(num)) { - SSU.AddAvailableValue(CurrBB, S->table[num]); - } else { - for (pred_iterator PI = pred_begin(CurrBB), PE = pred_end(CurrBB); - PI != PE; ++PI) - if (!visited.count(*PI)) - stack.push_back(*PI); - } - } - - Value* repl = SSU.GetValueInMiddleOfBlock(FI); - BI->replaceAllUsesWith(repl); - Instruction* CurInst = BI; - ++BI; - BBMap[FI]->table[num] = repl; - if (isa<PHINode>(CurInst)) - ++NumSCCVNPhi; - else - ++NumSCCVNInstr; - - CurInst->eraseFromParent(); - } - } - - VT.clear(); - for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator - I = BBMap.begin(), E = BBMap.end(); I != E; ++I) - delete I->second; - BBMap.clear(); - - return changed; -} diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 6211beb70ba2..5ca9ce376f25 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -130,14 +130,7 @@ namespace { void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); - bool CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy, - bool &SawVec, uint64_t Offset, unsigned AllocaSize); - void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); - Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType, - uint64_t Offset, IRBuilder<> &Builder); - Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, - uint64_t Offset, IRBuilder<> &Builder); - static Instruction *isOnlyCopiedFromConstantGlobal(AllocaInst *AI); + static MemTransferInst *isOnlyCopiedFromConstantGlobal(AllocaInst *AI); }; } @@ -150,6 +143,596 @@ FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) { } +//===----------------------------------------------------------------------===// +// Convert To Scalar Optimization. +//===----------------------------------------------------------------------===// + +namespace { +/// ConvertToScalarInfo - This class implements the "Convert To Scalar" +/// optimization, which scans the uses of an alloca and determines if it can +/// rewrite it in terms of a single new alloca that can be mem2reg'd. +class ConvertToScalarInfo { + /// AllocaSize - The size of the alloca being considered. + unsigned AllocaSize; + const TargetData &TD; + + /// IsNotTrivial - This is set to true if there is some access to the object + /// which means that mem2reg can't promote it. + bool IsNotTrivial; + + /// VectorTy - This tracks the type that we should promote the vector to if + /// it is possible to turn it into a vector. This starts out null, and if it + /// isn't possible to turn into a vector type, it gets set to VoidTy. + const Type *VectorTy; + + /// HadAVector - True if there is at least one vector access to the alloca. 
+ /// We don't want to turn random arrays into vectors and use vector element + /// insert/extract, but if there are element accesses to something that is + /// also declared as a vector, we do want to promote to a vector. + bool HadAVector; + +public: + explicit ConvertToScalarInfo(unsigned Size, const TargetData &td) + : AllocaSize(Size), TD(td) { + IsNotTrivial = false; + VectorTy = 0; + HadAVector = false; + } + + AllocaInst *TryConvert(AllocaInst *AI); + +private: + bool CanConvertToScalar(Value *V, uint64_t Offset); + void MergeInType(const Type *In, uint64_t Offset); + void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); + + Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType, + uint64_t Offset, IRBuilder<> &Builder); + Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, + uint64_t Offset, IRBuilder<> &Builder); +}; +} // end anonymous namespace. + +/// TryConvert - Analyze the specified alloca, and if it is safe to do so, +/// rewrite it to be a new alloca which is mem2reg'able. This returns the new +/// alloca if possible or null if not. +AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { + // If we can't convert this scalar, or if mem2reg can trivially do it, bail + // out. + if (!CanConvertToScalar(AI, 0) || !IsNotTrivial) + return 0; + + // If we were able to find a vector type that can handle this with + // insert/extract elements, and if there was at least one use that had + // a vector type, promote this to a vector. We don't want to promote + // random stuff that doesn't use vectors (e.g. <9 x double>) because then + // we just get a lot of insert/extracts. If at least one vector is + // involved, then we probably really do have a union of vector/array. + const Type *NewTy; + if (VectorTy && VectorTy->isVectorTy() && HadAVector) { + DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " + << *VectorTy << '\n'); + NewTy = VectorTy; // Use the vector type. + } else { + DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); + // Create and insert the integer alloca. + NewTy = IntegerType::get(AI->getContext(), AllocaSize*8); + } + AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); + ConvertUsesToScalar(AI, NewAI, 0); + return NewAI; +} + +/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy) +/// so far at the offset specified by Offset (which is specified in bytes). +/// +/// There are two cases we handle here: +/// 1) A union of vector types of the same size and potentially its elements. +/// Here we turn element accesses into insert/extract element operations. +/// This promotes a <4 x float> with a store of float to the third element +/// into a <4 x float> that uses insert element. +/// 2) A fully general blob of memory, which we turn into some (potentially +/// large) integer type with extract and insert operations where the loads +/// and stores would mutate the memory. We mark this by setting VectorTy +/// to VoidTy. +void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) { + // If we already decided to turn this into a blob of integer memory, there is + // nothing to be done. + if (VectorTy && VectorTy->isVoidTy()) + return; + + // If this could be contributing to a vector, analyze it. + + // If the In type is a vector that is the same size as the alloca, see if it + // matches the existing VecTy. + if (const VectorType *VInTy = dyn_cast<VectorType>(In)) { + // Remember if we saw a vector type. 
+ HadAVector = true; + + if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { + // If we're storing/loading a vector of the right size, allow it as a + // vector. If this the first vector we see, remember the type so that + // we know the element size. If this is a subsequent access, ignore it + // even if it is a differing type but the same size. Worst case we can + // bitcast the resultant vectors. + if (VectorTy == 0) + VectorTy = VInTy; + return; + } + } else if (In->isFloatTy() || In->isDoubleTy() || + (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 && + isPowerOf2_32(In->getPrimitiveSizeInBits()))) { + // If we're accessing something that could be an element of a vector, see + // if the implied vector agrees with what we already have and if Offset is + // compatible with it. + unsigned EltSize = In->getPrimitiveSizeInBits()/8; + if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && + (VectorTy == 0 || + cast<VectorType>(VectorTy)->getElementType() + ->getPrimitiveSizeInBits()/8 == EltSize)) { + if (VectorTy == 0) + VectorTy = VectorType::get(In, AllocaSize/EltSize); + return; + } + } + + // Otherwise, we have a case that we can't handle with an optimized vector + // form. We can still turn this into a large integer. + VectorTy = Type::getVoidTy(In->getContext()); +} + +/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all +/// its accesses to a single vector type, return true and set VecTy to +/// the new type. If we could convert the alloca into a single promotable +/// integer, return true but set VecTy to VoidTy. Further, if the use is not a +/// completely trivial use that mem2reg could promote, set IsNotTrivial. Offset +/// is the current offset from the base of the alloca being analyzed. +/// +/// If we see at least one access to the value that is as a vector type, set the +/// SawVec flag. +bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + // Don't break volatile loads. + if (LI->isVolatile()) + return false; + MergeInType(LI->getType(), Offset); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Storing the pointer, not into the value? + if (SI->getOperand(0) == V || SI->isVolatile()) return false; + MergeInType(SI->getOperand(0)->getType(), Offset); + continue; + } + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { + IsNotTrivial = true; // Can't be mem2reg'd. + if (!CanConvertToScalar(BCI, Offset)) + return false; + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + // If this is a GEP with a variable indices, we can't handle it. + if (!GEP->hasAllConstantIndices()) + return false; + + // Compute the offset that this GEP adds to the pointer. + SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + &Indices[0], Indices.size()); + // See if all uses can be converted. + if (!CanConvertToScalar(GEP, Offset+GEPOffset)) + return false; + IsNotTrivial = true; // Can't be mem2reg'd. + continue; + } + + // If this is a constant sized memset of a constant value (e.g. 0) we can + // handle it. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + // Store of constant value and constant size. 
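Editor's note: MergeInType above folds one access type at a time into a running guess, and the alloca either stays describable as a single vector type or degrades to the "one big integer" fallback. The sketch below models that accumulation over simplified (size, offset) access records, reducing the vector guess to an element size; the names and the simplifications are mine, not the pass's.

#include <cstdint>
#include <optional>
#include <vector>

struct Access {
  uint64_t SizeInBytes;
  uint64_t Offset;
  bool WholeAllocaVector;   // a vector load/store covering the whole alloca
};

// Returns the element size in bytes if every access is compatible with one
// vector view of the alloca, or std::nullopt for the big-integer fallback.
std::optional<uint64_t> mergeAccesses(const std::vector<Access> &Accesses,
                                      uint64_t AllocaSize) {
  std::optional<uint64_t> EltSize;   // unset means no constraint yet
  for (const Access &A : Accesses) {
    if (A.WholeAllocaVector && A.Offset == 0 && A.SizeInBytes == AllocaSize)
      continue;                                    // whole-vector access is fine
    // Element-style access: offset and alloca size must be multiples of the
    // element size, and it must agree with any element size seen so far.
    if (A.SizeInBytes == 0 || A.Offset % A.SizeInBytes != 0 ||
        AllocaSize % A.SizeInBytes != 0 ||
        (EltSize && *EltSize != A.SizeInBytes))
      return std::nullopt;                         // fall back to one big integer
    EltSize = A.SizeInBytes;
  }
  return EltSize;
}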
+ if (!isa<ConstantInt>(MSI->getValue()) || + !isa<ConstantInt>(MSI->getLength())) + return false; + IsNotTrivial = true; // Can't be mem2reg'd. + continue; + } + + // If this is a memcpy or memmove into or out of the whole allocation, we + // can handle it like a load or store of the scalar type. + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); + if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) + return false; + + IsNotTrivial = true; // Can't be mem2reg'd. + continue; + } + + // Otherwise, we cannot handle this! + return false; + } + + return true; +} + +/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca +/// directly. This happens when we are converting an "integer union" to a +/// single integer scalar, or when we are converting a "vector union" to a +/// vector with insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. By the end of this, there should be no uses of Ptr. +void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, + uint64_t Offset) { + while (!Ptr->use_empty()) { + Instruction *User = cast<Instruction>(Ptr->use_back()); + + if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { + ConvertUsesToScalar(CI, NewAI, Offset); + CI->eraseFromParent(); + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + // Compute the offset that this GEP adds to the pointer. + SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + &Indices[0], Indices.size()); + ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); + GEP->eraseFromParent(); + continue; + } + + IRBuilder<> Builder(User->getParent(), User); + + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + // The load is a bit extract from NewAI shifted right by Offset bits. + Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); + Value *NewLoadVal + = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); + LI->replaceAllUsesWith(NewLoadVal); + LI->eraseFromParent(); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + assert(SI->getOperand(0) != Ptr && "Consistency error!"); + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); + Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, + Builder); + Builder.CreateStore(New, NewAI); + SI->eraseFromParent(); + + // If the load we just inserted is now dead, then the inserted store + // overwrote the entire thing. + if (Old->use_empty()) + Old->eraseFromParent(); + continue; + } + + // If this is a constant sized memset of a constant value (e.g. 0) we can + // transform it into a store of the expanded constant value. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + assert(MSI->getRawDest() == Ptr && "Consistency error!"); + unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + if (NumBytes != 0) { + unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); + + // Compute the value replicated the right number of times. + APInt APVal(NumBytes*8, Val); + + // Splat the value if non-zero. 
+ if (Val) + for (unsigned i = 1; i != NumBytes; ++i) + APVal |= APVal << 8; + + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); + Value *New = ConvertScalar_InsertValue( + ConstantInt::get(User->getContext(), APVal), + Old, Offset, Builder); + Builder.CreateStore(New, NewAI); + + // If the load we just inserted is now dead, then the memset overwrote + // the entire thing. + if (Old->use_empty()) + Old->eraseFromParent(); + } + MSI->eraseFromParent(); + continue; + } + + // If this is a memcpy or memmove into or out of the whole allocation, we + // can handle it like a load or store of the scalar type. + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + assert(Offset == 0 && "must be store to start of alloca"); + + // If the source and destination are both to the same alloca, then this is + // a noop copy-to-self, just delete it. Otherwise, emit a load and store + // as appropriate. + AllocaInst *OrigAI = cast<AllocaInst>(Ptr->getUnderlyingObject(0)); + + if (MTI->getSource()->getUnderlyingObject(0) != OrigAI) { + // Dest must be OrigAI, change this to be a load from the original + // pointer (bitcasted), then a store to our new alloca. + assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); + Value *SrcPtr = MTI->getSource(); + SrcPtr = Builder.CreateBitCast(SrcPtr, NewAI->getType()); + + LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); + SrcVal->setAlignment(MTI->getAlignment()); + Builder.CreateStore(SrcVal, NewAI); + } else if (MTI->getDest()->getUnderlyingObject(0) != OrigAI) { + // Src must be OrigAI, change this to be a load from NewAI then a store + // through the original dest pointer (bitcasted). + assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); + LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); + + Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), NewAI->getType()); + StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); + NewStore->setAlignment(MTI->getAlignment()); + } else { + // Noop transfer. Src == Dst + } + + MTI->eraseFromParent(); + continue; + } + + llvm_unreachable("Unsupported operation!"); + } +} + +/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer +/// or vector value FromVal, extracting the bits from the offset specified by +/// Offset. This returns the value, which is of type ToType. +/// +/// This happens when we are converting an "integer union" to a single +/// integer scalar, or when we are converting a "vector union" to a vector with +/// insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. +Value *ConvertToScalarInfo:: +ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, + uint64_t Offset, IRBuilder<> &Builder) { + // If the load is of the whole new alloca, no conversion is needed. + if (FromVal->getType() == ToType && Offset == 0) + return FromVal; + + // If the result alloca is a vector type, this is either an element + // access or a bitcast to another vector type of the same size. + if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) { + if (ToType->isVectorTy()) + return Builder.CreateBitCast(FromVal, ToType, "tmp"); + + // Otherwise it must be an element access. + unsigned Elt = 0; + if (Offset) { + unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + Elt = Offset/EltSize; + assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); + } + // Return the element extracted out of it. 
+ Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( + Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); + if (V->getType() != ToType) + V = Builder.CreateBitCast(V, ToType, "tmp"); + return V; + } + + // If ToType is a first class aggregate, extract out each of the pieces and + // use insertvalue's to form the FCA. + if (const StructType *ST = dyn_cast<StructType>(ToType)) { + const StructLayout &Layout = *TD.getStructLayout(ST); + Value *Res = UndefValue::get(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), + Offset+Layout.getElementOffsetInBits(i), + Builder); + Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + } + return Res; + } + + if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + Value *Res = UndefValue::get(AT); + for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { + Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), + Offset+i*EltSize, Builder); + Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + } + return Res; + } + + // Otherwise, this must be a union that was converted to an integer value. + const IntegerType *NTy = cast<IntegerType>(FromVal->getType()); + + // If this is a big-endian system and the load is narrower than the + // full alloca type, we need to do a shift to get the right bits. + int ShAmt = 0; + if (TD.isBigEndian()) { + // On big-endian machines, the lowest bit is stored at the bit offset + // from the pointer given by getTypeStoreSizeInBits. This matters for + // integers with a bitwidth that is not a multiple of 8. + ShAmt = TD.getTypeStoreSizeInBits(NTy) - + TD.getTypeStoreSizeInBits(ToType) - Offset; + } else { + ShAmt = Offset; + } + + // Note: we support negative bitwidths (with shl) which are not defined. + // We do this to support (f.e.) loads off the end of a structure where + // only some bits are used. + if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) + FromVal = Builder.CreateLShr(FromVal, + ConstantInt::get(FromVal->getType(), + ShAmt), "tmp"); + else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) + FromVal = Builder.CreateShl(FromVal, + ConstantInt::get(FromVal->getType(), + -ShAmt), "tmp"); + + // Finally, unconditionally truncate the integer to the right width. + unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); + if (LIBitWidth < NTy->getBitWidth()) + FromVal = + Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), + LIBitWidth), "tmp"); + else if (LIBitWidth > NTy->getBitWidth()) + FromVal = + Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), + LIBitWidth), "tmp"); + + // If the result is an integer, this is a trunc or bitcast. + if (ToType->isIntegerTy()) { + // Should be done. + } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { + // Just do a bitcast, we know the sizes match up. + FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); + } else { + // Otherwise must be a pointer. + FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); + } + assert(FromVal->getType() == ToType && "Didn't convert right?"); + return FromVal; +} + +/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer +/// or vector value "Old" at the offset specified by Offset. 
+/// +/// This happens when we are converting an "integer union" to a +/// single integer scalar, or when we are converting a "vector union" to a +/// vector with insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. +Value *ConvertToScalarInfo:: +ConvertScalar_InsertValue(Value *SV, Value *Old, + uint64_t Offset, IRBuilder<> &Builder) { + // Convert the stored type to the actual type, shift it left to insert + // then 'or' into place. + const Type *AllocaType = Old->getType(); + LLVMContext &Context = Old->getContext(); + + if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { + uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); + uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); + + // Changing the whole vector with memset or with an access of a different + // vector type? + if (ValSize == VecSize) + return Builder.CreateBitCast(SV, AllocaType, "tmp"); + + uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + + // Must be an element insertion. + unsigned Elt = Offset/EltSize; + + if (SV->getType() != VTy->getElementType()) + SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp"); + + SV = Builder.CreateInsertElement(Old, SV, + ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), + "tmp"); + return SV; + } + + // If SV is a first-class aggregate value, insert each value recursively. + if (const StructType *ST = dyn_cast<StructType>(SV->getType())) { + const StructLayout &Layout = *TD.getStructLayout(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Old = ConvertScalar_InsertValue(Elt, Old, + Offset+Layout.getElementOffsetInBits(i), + Builder); + } + return Old; + } + + if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { + Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); + } + return Old; + } + + // If SV is a float, convert it to the appropriate integer type. + // If it is a pointer, do the same. + unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType()); + unsigned DestWidth = TD.getTypeSizeInBits(AllocaType); + unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); + unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); + if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) + SV = Builder.CreateBitCast(SV, + IntegerType::get(SV->getContext(),SrcWidth), "tmp"); + else if (SV->getType()->isPointerTy()) + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp"); + + // Zero extend or truncate the value if needed. + if (SV->getType() != AllocaType) { + if (SV->getType()->getPrimitiveSizeInBits() < + AllocaType->getPrimitiveSizeInBits()) + SV = Builder.CreateZExt(SV, AllocaType, "tmp"); + else { + // Truncation may be needed if storing more than the alloca can hold + // (undefined behavior). + SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); + SrcWidth = DestWidth; + SrcStoreWidth = DestStoreWidth; + } + } + + // If this is a big-endian system and the store is narrower than the + // full alloca type, we need to do a shift to get the right bits. 
+ int ShAmt = 0; + if (TD.isBigEndian()) { + // On big-endian machines, the lowest bit is stored at the bit offset + // from the pointer given by getTypeStoreSizeInBits. This matters for + // integers with a bitwidth that is not a multiple of 8. + ShAmt = DestStoreWidth - SrcStoreWidth - Offset; + } else { + ShAmt = Offset; + } + + // Note: we support negative bitwidths (with shr) which are not defined. + // We do this to support (f.e.) stores off the end of a structure where + // only some bits in the structure are set. + APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); + if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { + SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), + ShAmt), "tmp"); + Mask <<= ShAmt; + } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { + SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), + -ShAmt), "tmp"); + Mask = Mask.lshr(-ShAmt); + } + + // Mask out the bits we are about to insert from the old value, and or + // in the new bits. + if (SrcWidth != DestWidth) { + assert(DestWidth > SrcWidth); + Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); + SV = Builder.CreateOr(Old, SV, "ins"); + } + return SV; +} + + +//===----------------------------------------------------------------------===// +// SRoA Driver +//===----------------------------------------------------------------------===// + + bool SROA::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); @@ -202,6 +785,7 @@ bool SROA::performPromotion(Function &F) { return Changed; } + /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. static bool ShouldAttemptScalarRepl(AllocaInst *AI) { @@ -216,6 +800,7 @@ static bool ShouldAttemptScalarRepl(AllocaInst *AI) { return false; } + // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the malloc/alloca instructions in the function, removing // them if they are only used by getelementptr instructions. @@ -223,7 +808,7 @@ static bool ShouldAttemptScalarRepl(AllocaInst *AI) { bool SROA::performScalarRepl(Function &F) { std::vector<AllocaInst*> WorkList; - // Scan the entry basic block, adding any alloca's and mallocs to the worklist + // Scan the entry basic block, adding allocas to the worklist. BasicBlock &BB = F.getEntryBlock(); for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) if (AllocaInst *A = dyn_cast<AllocaInst>(I)) @@ -239,6 +824,7 @@ bool SROA::performScalarRepl(Function &F) { // with unused elements. if (AI->use_empty()) { AI->eraseFromParent(); + Changed = true; continue; } @@ -251,10 +837,10 @@ bool SROA::performScalarRepl(Function &F) { // the constant global instead. This is commonly produced by the CFE by // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' // is only subsequently read. - if (Instruction *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) { + if (MemTransferInst *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) { DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); DEBUG(dbgs() << " memcpy = " << *TheCopy << '\n'); - Constant *TheSrc = cast<Constant>(TheCopy->getOperand(2)); + Constant *TheSrc = cast<Constant>(TheCopy->getSource()); AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); TheCopy->eraseFromParent(); // Don't mutate the global. AI->eraseFromParent(); @@ -271,7 +857,10 @@ bool SROA::performScalarRepl(Function &F) { // Do not promote [0 x %struct]. 
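Editor's note: once an alloca has been rewritten as one big integer, the ConvertScalar_ExtractValue and ConvertScalar_InsertValue helpers above read and write narrow pieces of it purely with shifts and masks. The following is a little-endian worked example on a plain uint64_t standing in for the alloca; the offsets, widths and field layout are chosen for illustration only, and the real code additionally handles big-endian layouts, vectors and FP/pointer bitcasts.

#include <cassert>
#include <cstdint>

// Read Width bits starting at bit Offset out of the packed value
// (the lshr + trunc pattern of the extract path).
uint64_t extractBits(uint64_t Packed, unsigned Offset, unsigned Width) {
  uint64_t Mask = (Width == 64) ? ~0ull : ((1ull << Width) - 1);
  return (Packed >> Offset) & Mask;
}

// Overwrite Width bits starting at bit Offset with Val: mask out the old
// bits, shift the new value into place, then OR it in, mirroring the
// and/shl/or sequence the insert path emits.
uint64_t insertBits(uint64_t Packed, uint64_t Val, unsigned Offset,
                    unsigned Width) {
  uint64_t Mask = ((Width == 64) ? ~0ull : ((1ull << Width) - 1)) << Offset;
  return (Packed & ~Mask) | ((Val << Offset) & Mask);
}

int main() {
  // A hypothetical struct { i16 a; i16 b; i32 c; } flattened into one i64.
  uint64_t Blob = 0;
  Blob = insertBits(Blob, 0x1234, 0, 16);       // a
  Blob = insertBits(Blob, 0xBEEF, 16, 16);      // b
  Blob = insertBits(Blob, 0xCAFEF00D, 32, 32);  // c
  assert(extractBits(Blob, 16, 16) == 0xBEEF);
  assert(extractBits(Blob, 32, 32) == 0xCAFEF00D);
  return 0;
}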
if (AllocaSize == 0) continue; - + + // Do not promote any struct whose size is too big. + if (AllocaSize > SRThreshold) continue; + // If the alloca looks like a good candidate for scalar replacement, and if // all its users can be transformed, then split up the aggregate into its // separate elements. @@ -281,48 +870,20 @@ bool SROA::performScalarRepl(Function &F) { continue; } - // Do not promote any struct whose size is too big. - if (AllocaSize > SRThreshold) continue; - // If we can turn this aggregate value (potentially with casts) into a // simple scalar value that can be mem2reg'd into a register value. // IsNotTrivial tracks whether this is something that mem2reg could have // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. - bool IsNotTrivial = false; - const Type *VectorTy = 0; - bool HadAVector = false; - if (CanConvertToScalar(AI, IsNotTrivial, VectorTy, HadAVector, - 0, unsigned(AllocaSize)) && IsNotTrivial) { - AllocaInst *NewAI; - // If we were able to find a vector type that can handle this with - // insert/extract elements, and if there was at least one use that had - // a vector type, promote this to a vector. We don't want to promote - // random stuff that doesn't use vectors (e.g. <9 x double>) because then - // we just get a lot of insert/extracts. If at least one vector is - // involved, then we probably really do have a union of vector/array. - if (VectorTy && VectorTy->isVectorTy() && HadAVector) { - DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " - << *VectorTy << '\n'); - - // Create and insert the vector alloca. - NewAI = new AllocaInst(VectorTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0); - } else { - DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); - - // Create and insert the integer alloca. - const Type *NewTy = IntegerType::get(AI->getContext(), AllocaSize*8); - NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0); - } + if (AllocaInst *NewAI = + ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; Changed = true; continue; - } + } // Otherwise, couldn't process this alloca. } @@ -698,7 +1259,6 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // that doesn't have anything to do with the alloca that we are promoting. For // memset, this Value* stays null. Value *OtherPtr = 0; - LLVMContext &Context = MI->getContext(); unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy if (Inst == MTI->getRawDest()) @@ -756,7 +1316,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } // Process each element of the aggregate. 
- Value *TheFn = MI->getOperand(0); + Value *TheFn = MI->getCalledValue(); const Type *BytePtrTy = MI->getRawDest()->getType(); bool SROADest = MI->getRawDest() == Inst; @@ -775,12 +1335,11 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, MI); uint64_t EltOffset; const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); - if (const StructType *ST = - dyn_cast<StructType>(OtherPtrTy->getElementType())) { + const Type *OtherTy = OtherPtrTy->getElementType(); + if (const StructType *ST = dyn_cast<StructType>(OtherTy)) { EltOffset = TD->getStructLayout(ST)->getElementOffset(i); } else { - const Type *EltTy = - cast<SequentialType>(OtherPtr->getType())->getElementType(); + const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); EltOffset = TD->getTypeAllocSize(EltTy)*i; } @@ -832,7 +1391,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } // Convert the integer value to the appropriate type. - StoreVal = ConstantInt::get(Context, TotalVal); + StoreVal = ConstantInt::get(CI->getContext(), TotalVal); if (ValTy->isPointerTy()) StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy); else if (ValTy->isFloatingPointTy()) @@ -1174,509 +1733,6 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return true; } -/// MergeInType - Add the 'In' type to the accumulated type (Accum) so far at -/// the offset specified by Offset (which is specified in bytes). -/// -/// There are two cases we handle here: -/// 1) A union of vector types of the same size and potentially its elements. -/// Here we turn element accesses into insert/extract element operations. -/// This promotes a <4 x float> with a store of float to the third element -/// into a <4 x float> that uses insert element. -/// 2) A fully general blob of memory, which we turn into some (potentially -/// large) integer type with extract and insert operations where the loads -/// and stores would mutate the memory. -static void MergeInType(const Type *In, uint64_t Offset, const Type *&VecTy, - unsigned AllocaSize, const TargetData &TD, - LLVMContext &Context) { - // If this could be contributing to a vector, analyze it. - if (VecTy != Type::getVoidTy(Context)) { // either null or a vector type. - - // If the In type is a vector that is the same size as the alloca, see if it - // matches the existing VecTy. - if (const VectorType *VInTy = dyn_cast<VectorType>(In)) { - if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { - // If we're storing/loading a vector of the right size, allow it as a - // vector. If this the first vector we see, remember the type so that - // we know the element size. - if (VecTy == 0) - VecTy = VInTy; - return; - } - } else if (In->isFloatTy() || In->isDoubleTy() || - (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 && - isPowerOf2_32(In->getPrimitiveSizeInBits()))) { - // If we're accessing something that could be an element of a vector, see - // if the implied vector agrees with what we already have and if Offset is - // compatible with it. - unsigned EltSize = In->getPrimitiveSizeInBits()/8; - if (Offset % EltSize == 0 && - AllocaSize % EltSize == 0 && - (VecTy == 0 || - cast<VectorType>(VecTy)->getElementType() - ->getPrimitiveSizeInBits()/8 == EltSize)) { - if (VecTy == 0) - VecTy = VectorType::get(In, AllocaSize/EltSize); - return; - } - } - } - - // Otherwise, we have a case that we can't handle with an optimized vector - // form. We can still turn this into a large integer. 
- VecTy = Type::getVoidTy(Context); -} - -/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all -/// its accesses to a single vector type, return true and set VecTy to -/// the new type. If we could convert the alloca into a single promotable -/// integer, return true but set VecTy to VoidTy. Further, if the use is not a -/// completely trivial use that mem2reg could promote, set IsNotTrivial. Offset -/// is the current offset from the base of the alloca being analyzed. -/// -/// If we see at least one access to the value that is as a vector type, set the -/// SawVec flag. -bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy, - bool &SawVec, uint64_t Offset, - unsigned AllocaSize) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - // Don't break volatile loads. - if (LI->isVolatile()) - return false; - MergeInType(LI->getType(), Offset, VecTy, - AllocaSize, *TD, V->getContext()); - SawVec |= LI->getType()->isVectorTy(); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - // Storing the pointer, not into the value? - if (SI->getOperand(0) == V || SI->isVolatile()) return 0; - MergeInType(SI->getOperand(0)->getType(), Offset, - VecTy, AllocaSize, *TD, V->getContext()); - SawVec |= SI->getOperand(0)->getType()->isVectorTy(); - continue; - } - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { - if (!CanConvertToScalar(BCI, IsNotTrivial, VecTy, SawVec, Offset, - AllocaSize)) - return false; - IsNotTrivial = true; - continue; - } - - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { - // If this is a GEP with a variable indices, we can't handle it. - if (!GEP->hasAllConstantIndices()) - return false; - - // Compute the offset that this GEP adds to the pointer. - SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - uint64_t GEPOffset = TD->getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); - // See if all uses can be converted. - if (!CanConvertToScalar(GEP, IsNotTrivial, VecTy, SawVec,Offset+GEPOffset, - AllocaSize)) - return false; - IsNotTrivial = true; - continue; - } - - // If this is a constant sized memset of a constant value (e.g. 0) we can - // handle it. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { - // Store of constant value and constant size. - if (isa<ConstantInt>(MSI->getValue()) && - isa<ConstantInt>(MSI->getLength())) { - IsNotTrivial = true; - continue; - } - } - - // If this is a memcpy or memmove into or out of the whole allocation, we - // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { - if (ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength())) - if (Len->getZExtValue() == AllocaSize && Offset == 0) { - IsNotTrivial = true; - continue; - } - } - - // Otherwise, we cannot handle this! - return false; - } - - return true; -} - -/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca -/// directly. This happens when we are converting an "integer union" to a -/// single integer scalar, or when we are converting a "vector union" to a -/// vector with insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. By the end of this, there should be no uses of Ptr. 
-void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) { - while (!Ptr->use_empty()) { - Instruction *User = cast<Instruction>(Ptr->use_back()); - - if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { - ConvertUsesToScalar(CI, NewAI, Offset); - CI->eraseFromParent(); - continue; - } - - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { - // Compute the offset that this GEP adds to the pointer. - SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - uint64_t GEPOffset = TD->getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); - ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); - GEP->eraseFromParent(); - continue; - } - - IRBuilder<> Builder(User->getParent(), User); - - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - // The load is a bit extract from NewAI shifted right by Offset bits. - Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); - Value *NewLoadVal - = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); - LI->replaceAllUsesWith(NewLoadVal); - LI->eraseFromParent(); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - assert(SI->getOperand(0) != Ptr && "Consistency error!"); - Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); - Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, - Builder); - Builder.CreateStore(New, NewAI); - SI->eraseFromParent(); - - // If the load we just inserted is now dead, then the inserted store - // overwrote the entire thing. - if (Old->use_empty()) - Old->eraseFromParent(); - continue; - } - - // If this is a constant sized memset of a constant value (e.g. 0) we can - // transform it into a store of the expanded constant value. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { - assert(MSI->getRawDest() == Ptr && "Consistency error!"); - unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); - if (NumBytes != 0) { - unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); - - // Compute the value replicated the right number of times. - APInt APVal(NumBytes*8, Val); - - // Splat the value if non-zero. - if (Val) - for (unsigned i = 1; i != NumBytes; ++i) - APVal |= APVal << 8; - - Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); - Value *New = ConvertScalar_InsertValue( - ConstantInt::get(User->getContext(), APVal), - Old, Offset, Builder); - Builder.CreateStore(New, NewAI); - - // If the load we just inserted is now dead, then the memset overwrote - // the entire thing. - if (Old->use_empty()) - Old->eraseFromParent(); - } - MSI->eraseFromParent(); - continue; - } - - // If this is a memcpy or memmove into or out of the whole allocation, we - // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { - assert(Offset == 0 && "must be store to start of alloca"); - - // If the source and destination are both to the same alloca, then this is - // a noop copy-to-self, just delete it. Otherwise, emit a load and store - // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(Ptr->getUnderlyingObject(0)); - - if (MTI->getSource()->getUnderlyingObject(0) != OrigAI) { - // Dest must be OrigAI, change this to be a load from the original - // pointer (bitcasted), then a store to our new alloca. 
- assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); - Value *SrcPtr = MTI->getSource(); - SrcPtr = Builder.CreateBitCast(SrcPtr, NewAI->getType()); - - LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); - SrcVal->setAlignment(MTI->getAlignment()); - Builder.CreateStore(SrcVal, NewAI); - } else if (MTI->getDest()->getUnderlyingObject(0) != OrigAI) { - // Src must be OrigAI, change this to be a load from NewAI then a store - // through the original dest pointer (bitcasted). - assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); - LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); - - Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), NewAI->getType()); - StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); - NewStore->setAlignment(MTI->getAlignment()); - } else { - // Noop transfer. Src == Dst - } - - MTI->eraseFromParent(); - continue; - } - - llvm_unreachable("Unsupported operation!"); - } -} - -/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer -/// or vector value FromVal, extracting the bits from the offset specified by -/// Offset. This returns the value, which is of type ToType. -/// -/// This happens when we are converting an "integer union" to a single -/// integer scalar, or when we are converting a "vector union" to a vector with -/// insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. -Value *SROA::ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, - uint64_t Offset, IRBuilder<> &Builder) { - // If the load is of the whole new alloca, no conversion is needed. - if (FromVal->getType() == ToType && Offset == 0) - return FromVal; - - // If the result alloca is a vector type, this is either an element - // access or a bitcast to another vector type of the same size. - if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) { - if (ToType->isVectorTy()) - return Builder.CreateBitCast(FromVal, ToType, "tmp"); - - // Otherwise it must be an element access. - unsigned Elt = 0; - if (Offset) { - unsigned EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType()); - Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - } - // Return the element extracted out of it. - Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( - Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); - if (V->getType() != ToType) - V = Builder.CreateBitCast(V, ToType, "tmp"); - return V; - } - - // If ToType is a first class aggregate, extract out each of the pieces and - // use insertvalue's to form the FCA. 
- if (const StructType *ST = dyn_cast<StructType>(ToType)) { - const StructLayout &Layout = *TD->getStructLayout(ST); - Value *Res = UndefValue::get(ST); - for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), - Offset+Layout.getElementOffsetInBits(i), - Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); - } - return Res; - } - - if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) { - uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType()); - Value *Res = UndefValue::get(AT); - for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); - } - return Res; - } - - // Otherwise, this must be a union that was converted to an integer value. - const IntegerType *NTy = cast<IntegerType>(FromVal->getType()); - - // If this is a big-endian system and the load is narrower than the - // full alloca type, we need to do a shift to get the right bits. - int ShAmt = 0; - if (TD->isBigEndian()) { - // On big-endian machines, the lowest bit is stored at the bit offset - // from the pointer given by getTypeStoreSizeInBits. This matters for - // integers with a bitwidth that is not a multiple of 8. - ShAmt = TD->getTypeStoreSizeInBits(NTy) - - TD->getTypeStoreSizeInBits(ToType) - Offset; - } else { - ShAmt = Offset; - } - - // Note: we support negative bitwidths (with shl) which are not defined. - // We do this to support (f.e.) loads off the end of a structure where - // only some bits are used. - if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) - FromVal = Builder.CreateLShr(FromVal, - ConstantInt::get(FromVal->getType(), - ShAmt), "tmp"); - else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) - FromVal = Builder.CreateShl(FromVal, - ConstantInt::get(FromVal->getType(), - -ShAmt), "tmp"); - - // Finally, unconditionally truncate the integer to the right width. - unsigned LIBitWidth = TD->getTypeSizeInBits(ToType); - if (LIBitWidth < NTy->getBitWidth()) - FromVal = - Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); - else if (LIBitWidth > NTy->getBitWidth()) - FromVal = - Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); - - // If the result is an integer, this is a trunc or bitcast. - if (ToType->isIntegerTy()) { - // Should be done. - } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { - // Just do a bitcast, we know the sizes match up. - FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); - } else { - // Otherwise must be a pointer. - FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); - } - assert(FromVal->getType() == ToType && "Didn't convert right?"); - return FromVal; -} - -/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer -/// or vector value "Old" at the offset specified by Offset. -/// -/// This happens when we are converting an "integer union" to a -/// single integer scalar, or when we are converting a "vector union" to a -/// vector with insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. 
-Value *SROA::ConvertScalar_InsertValue(Value *SV, Value *Old, - uint64_t Offset, IRBuilder<> &Builder) { - - // Convert the stored type to the actual type, shift it left to insert - // then 'or' into place. - const Type *AllocaType = Old->getType(); - LLVMContext &Context = Old->getContext(); - - if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { - uint64_t VecSize = TD->getTypeAllocSizeInBits(VTy); - uint64_t ValSize = TD->getTypeAllocSizeInBits(SV->getType()); - - // Changing the whole vector with memset or with an access of a different - // vector type? - if (ValSize == VecSize) - return Builder.CreateBitCast(SV, AllocaType, "tmp"); - - uint64_t EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType()); - - // Must be an element insertion. - unsigned Elt = Offset/EltSize; - - if (SV->getType() != VTy->getElementType()) - SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp"); - - SV = Builder.CreateInsertElement(Old, SV, - ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), - "tmp"); - return SV; - } - - // If SV is a first-class aggregate value, insert each value recursively. - if (const StructType *ST = dyn_cast<StructType>(SV->getType())) { - const StructLayout &Layout = *TD->getStructLayout(ST); - for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); - Old = ConvertScalar_InsertValue(Elt, Old, - Offset+Layout.getElementOffsetInBits(i), - Builder); - } - return Old; - } - - if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { - uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType()); - for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); - } - return Old; - } - - // If SV is a float, convert it to the appropriate integer type. - // If it is a pointer, do the same. - unsigned SrcWidth = TD->getTypeSizeInBits(SV->getType()); - unsigned DestWidth = TD->getTypeSizeInBits(AllocaType); - unsigned SrcStoreWidth = TD->getTypeStoreSizeInBits(SV->getType()); - unsigned DestStoreWidth = TD->getTypeStoreSizeInBits(AllocaType); - if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) - SV = Builder.CreateBitCast(SV, - IntegerType::get(SV->getContext(),SrcWidth), "tmp"); - else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD->getIntPtrType(SV->getContext()), "tmp"); - - // Zero extend or truncate the value if needed. - if (SV->getType() != AllocaType) { - if (SV->getType()->getPrimitiveSizeInBits() < - AllocaType->getPrimitiveSizeInBits()) - SV = Builder.CreateZExt(SV, AllocaType, "tmp"); - else { - // Truncation may be needed if storing more than the alloca can hold - // (undefined behavior). - SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); - SrcWidth = DestWidth; - SrcStoreWidth = DestStoreWidth; - } - } - - // If this is a big-endian system and the store is narrower than the - // full alloca type, we need to do a shift to get the right bits. - int ShAmt = 0; - if (TD->isBigEndian()) { - // On big-endian machines, the lowest bit is stored at the bit offset - // from the pointer given by getTypeStoreSizeInBits. This matters for - // integers with a bitwidth that is not a multiple of 8. - ShAmt = DestStoreWidth - SrcStoreWidth - Offset; - } else { - ShAmt = Offset; - } - - // Note: we support negative bitwidths (with shr) which are not defined. - // We do this to support (f.e.) 
stores off the end of a structure where - // only some bits in the structure are set. - APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); - if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { - SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), - ShAmt), "tmp"); - Mask <<= ShAmt; - } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { - SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), - -ShAmt), "tmp"); - Mask = Mask.lshr(-ShAmt); - } - - // Mask out the bits we are about to insert from the old value, and or - // in the new bits. - if (SrcWidth != DestWidth) { - assert(DestWidth > SrcWidth); - Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); - SV = Builder.CreateOr(Old, SV, "ins"); - } - return SV; -} - /// PointsToConstantGlobal - Return true if V (possibly indirectly) points to @@ -1699,21 +1755,23 @@ static bool PointsToConstantGlobal(Value *V) { /// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to /// the alloca, and if the source pointer is a pointer to a constant global, we /// can optimize this. -static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, +static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, bool isOffset) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) + User *U = cast<Instruction>(*UI); + + if (LoadInst *LI = dyn_cast<LoadInst>(U)) // Ignore non-volatile loads, they are always ok. if (!LI->isVolatile()) continue; - if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI)) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { // If uses of the bitcast are ok, we are ok. if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset)) return false; continue; } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) { + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { // If the GEP has all zero indices, it doesn't offset the pointer. If it // doesn't, it does. if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy, @@ -1724,7 +1782,8 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, // If this is isn't our memcpy/memmove, reject it as something we can't // handle. - if (!isa<MemTransferInst>(*UI)) + MemTransferInst *MI = dyn_cast<MemTransferInst>(U); + if (MI == 0) return false; // If we already have seen a copy, reject the second one. @@ -1737,10 +1796,8 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, // If the memintrinsic isn't using the alloca as the dest, reject it. if (UI.getOperandNo() != 1) return false; - MemIntrinsic *MI = cast<MemIntrinsic>(*UI); - // If the source of the memcpy/move is not a constant global, reject it. - if (!PointsToConstantGlobal(MI->getOperand(2))) + if (!PointsToConstantGlobal(MI->getSource())) return false; // Otherwise, the transform is safe. Remember the copy instruction. @@ -1752,8 +1809,8 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, /// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only /// modified by a copy from a constant global. If we can prove this, we can /// replace any uses of the alloca with uses of the global directly. 
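Editor's note: the predicate documented above walks every use of the alloca, looking through bitcasts and zero-index GEPs, and only succeeds if the single write is one memcpy/memmove whose source is a constant global; the caller can then replace the alloca with (a bitcast of) the global and delete the copy. The sketch below is a toy model of that use scan; the ToyUse/UseKind records are invented for illustration and stand in for the real IR walk.

#include <vector>

enum class UseKind {
  Load,                      // plain read, always allowed
  PassThrough,               // bitcast or zero-index GEP: recurse into its users
  CopyFromConstantGlobal,    // the one memcpy/memmove we are willing to accept
  Other                      // anything that might write or escape the pointer
};

struct ToyUse {
  UseKind Kind;
  std::vector<ToyUse> Users; // users of the pass-through value
};

// Returns true if the object is only read, apart from exactly one copy whose
// source is a constant global.  SeenCopy tracks that single allowed write.
bool onlyCopiedFromConstantGlobal(const std::vector<ToyUse> &Uses,
                                  bool &SeenCopy) {
  for (const ToyUse &U : Uses) {
    switch (U.Kind) {
    case UseKind::Load:
      continue;
    case UseKind::PassThrough:
      if (!onlyCopiedFromConstantGlobal(U.Users, SeenCopy))
        return false;
      continue;
    case UseKind::CopyFromConstantGlobal:
      if (SeenCopy)
        return false;        // a second copy: reject
      SeenCopy = true;
      continue;
    case UseKind::Other:
      return false;
    }
  }
  return true;
}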
-Instruction *SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI) { - Instruction *TheCopy = 0; +MemTransferInst *SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI) { + MemTransferInst *TheCopy = 0; if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false)) return TheCopy; return 0; diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp index 4464961a0799..c3408e77807f 100644 --- a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp @@ -93,7 +93,8 @@ InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs, // Inline the call, taking care of what code ends up where. NewBlock = SplitBlock(NextInst->getParent(), NextInst, this); - bool B = InlineFunction(Call, 0, TD); + InlineFunctionInfo IFI(0, TD); + bool B = InlineFunction(Call, IFI); assert(B && "half_powr didn't inline?"); B=B; BasicBlock *NewBody = NewBlock->getSinglePredecessor(); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 162d902cfa4c..5ad5de2ce6a5 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -59,6 +59,8 @@ #include "llvm/Instructions.h" #include "llvm/Pass.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -328,15 +330,6 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, if (&BB->front() == Ret) // Make sure there is something before the ret... return false; - // If the return is in the entry block, then making this transformation would - // turn infinite recursion into an infinite loop. This transformation is ok - // in theory, but breaks some code like: - // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call - // disable this xform in this case, because the code generator will lower the - // call to fabs into inline code. - if (BB == &F->getEntryBlock()) - return false; - // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI; @@ -356,6 +349,25 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) return false; + // As a special case, detect code like this: + // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call + // and disable this xform in this case, because the code generator will + // lower the call to fabs into inline code. + if (BB == &F->getEntryBlock() && + &BB->front() == CI && &*++BB->begin() == Ret && + callIsSmall(F)) { + // A single-block function with just a call and a return. Check that + // the arguments match. + CallSite::arg_iterator I = CallSite(CI).arg_begin(), + E = CallSite(CI).arg_end(); + Function::arg_iterator FI = F->arg_begin(), + FE = F->arg_end(); + for (; I != E && FI != FE; ++I, ++FI) + if (*I != &*FI) break; + if (I == E && FI == FE) + return false; + } + // If we are introducing accumulator recursion to eliminate associative // operations after the call instruction, this variable contains the initial // value for the accumulator. 
If this value is set, we actually perform diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp index c70bab5492ef..ea9d1c1b1468 100644 --- a/lib/Transforms/Utils/AddrModeMatcher.cpp +++ b/lib/Transforms/Utils/AddrModeMatcher.cpp @@ -434,19 +434,21 @@ static bool FindAllMemoryUses(Instruction *I, // Loop over all the uses, recursively processing them. for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; ++UI) { - if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) { + User *U = *UI; + + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); continue; } - if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) { + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { unsigned opNo = UI.getOperandNo(); if (opNo == 0) return true; // Storing addr, not into addr. MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } - if (CallInst *CI = dyn_cast<CallInst>(*UI)) { + if (CallInst *CI = dyn_cast<CallInst>(U)) { InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); if (IA == 0) return true; @@ -456,7 +458,7 @@ static bool FindAllMemoryUses(Instruction *I, continue; } - if (FindAllMemoryUses(cast<Instruction>(*UI), MemoryUses, ConsideredInsts, + if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, TLI)) return true; } diff --git a/lib/Transforms/Utils/BasicInliner.cpp b/lib/Transforms/Utils/BasicInliner.cpp index c580b8fed98c..f0e31efa30c4 100644 --- a/lib/Transforms/Utils/BasicInliner.cpp +++ b/lib/Transforms/Utils/BasicInliner.cpp @@ -129,7 +129,8 @@ void BasicInlinerImpl::inlineFunctions() { } // Inline - if (InlineFunction(CS, NULL, TD)) { + InlineFunctionInfo IFI(0, TD); + if (InlineFunction(CS, IFI)) { if (Callee->use_empty() && (Callee->hasLocalLinkage() || Callee->hasAvailableExternallyLinkage())) DeadFunctions.insert(Callee); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index fff817928f34..767fa3a0a6b3 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -90,15 +90,15 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, /// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the /// specified pointer arguments. Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, - IRBuilder<> &B, const TargetData *TD) { + IRBuilder<> &B, const TargetData *TD, StringRef Name) { Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); const Type *I8Ptr = B.getInt8PtrTy(); - Value *StrNCpy = M->getOrInsertFunction("strncpy", AttrListPtr::get(AWI, 2), - I8Ptr, I8Ptr, I8Ptr, - Len->getType(), NULL); + Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), + I8Ptr, I8Ptr, I8Ptr, + Len->getType(), NULL); CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Len, "strncpy"); if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts())) @@ -373,15 +373,29 @@ void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { } bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { + // We really need TargetData for later. 
+ if (!TD) return false; + this->CI = CI; - StringRef Name = CI->getCalledFunction()->getName(); + Function *Callee = CI->getCalledFunction(); + StringRef Name = Callee->getName(); + const FunctionType *FT = Callee->getFunctionType(); BasicBlock *BB = CI->getParent(); - IRBuilder<> B(CI->getParent()->getContext()); + LLVMContext &Context = CI->getParent()->getContext(); + IRBuilder<> B(Context); // Set the builder to the instruction after the call. B.SetInsertPoint(BB, CI); if (Name == "__memcpy_chk") { + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + if (isFoldable(4, 3, false)) { EmitMemCpy(CI->getOperand(1), CI->getOperand(2), CI->getOperand(3), 1, false, B, TD); @@ -397,6 +411,14 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { } if (Name == "__memmove_chk") { + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + if (isFoldable(4, 3, false)) { EmitMemMove(CI->getOperand(1), CI->getOperand(2), CI->getOperand(3), 1, false, B, TD); @@ -407,6 +429,14 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { } if (Name == "__memset_chk") { + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + if (isFoldable(4, 3, false)) { Value *Val = B.CreateIntCast(CI->getOperand(2), B.getInt8Ty(), false); @@ -418,6 +448,15 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { } if (Name == "__strcpy_chk" || Name == "__stpcpy_chk") { + // Check if this has the right signature. + if (FT->getNumParams() != 3 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != Type::getInt8PtrTy(Context) || + FT->getParamType(2) != TD->getIntPtrType(Context)) + return 0; + + // If a) we don't have any length information, or b) we know this will // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our // st[rp]cpy_chk call which may fail at runtime if the size is too long. @@ -432,10 +471,18 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { return false; } - if (Name == "__strncpy_chk") { + if (Name == "__strncpy_chk" || Name == "__stpncpy_chk") { + // Check if this has the right signature. 
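For reference, the fortified entry points being folded here follow the usual glibc object-size-checking prototypes; the checks above, and the one that follows for __strncpy_chk/__stpncpy_chk, simply verify that the module's declarations really have these shapes before rewriting them (summary only, not copied from the patch; __stpcpy_chk and __stpncpy_chk share the strcpy/strncpy shapes):

    void *__memcpy_chk (void *dst, const void *src, size_t len, size_t dstlen);
    void *__memmove_chk(void *dst, const void *src, size_t len, size_t dstlen);
    void *__memset_chk (void *dst, int c,           size_t len, size_t dstlen);
    char *__strcpy_chk (char *dst, const char *src,             size_t dstlen);
    char *__strncpy_chk(char *dst, const char *src, size_t len, size_t dstlen);

Rejecting mismatched declarations keeps the fold from rewriting a call whose operands do not line up with what EmitMemCpy/EmitStrNCpy expect.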
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != Type::getInt8PtrTy(Context) || + !FT->getParamType(2)->isIntegerTy() || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + if (isFoldable(4, 3, false)) { Value *Ret = EmitStrNCpy(CI->getOperand(1), CI->getOperand(2), - CI->getOperand(3), B, TD); + CI->getOperand(3), B, TD, Name.substr(2, 7)); replaceCall(Ret); return true; } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 62fc2ec10b14..8ad66dd01afc 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -23,7 +23,7 @@ #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Support/CFG.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index a163f89cc37f..b87c082793e0 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -17,7 +17,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/TypeSymbolTable.h" #include "llvm/Constant.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "ValueMapper.h" using namespace llvm; /// CloneModule - Return an exact copy of the specified module. This is not as diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index b20849485306..b51f751e1317 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -751,7 +751,7 @@ ExtractCodeRegion(const std::vector<BasicBlock*> &code) { // verifyFunction(*oldFunction); DEBUG(if (verifyFunction(*newFunction)) - llvm_report_error("verifyFunction failed!")); + report_fatal_error("verifyFunction failed!")); return newFunction; } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 75c9ccdd7a93..91390bc7beca 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -15,7 +15,6 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" -#include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" @@ -29,13 +28,11 @@ #include "llvm/Support/CallSite.h" using namespace llvm; -bool llvm::InlineFunction(CallInst *CI, CallGraph *CG, const TargetData *TD, - SmallVectorImpl<AllocaInst*> *StaticAllocas) { - return InlineFunction(CallSite(CI), CG, TD, StaticAllocas); +bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI) { + return InlineFunction(CallSite(CI), IFI); } -bool llvm::InlineFunction(InvokeInst *II, CallGraph *CG, const TargetData *TD, - SmallVectorImpl<AllocaInst*> *StaticAllocas) { - return InlineFunction(CallSite(II), CG, TD, StaticAllocas); +bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) { + return InlineFunction(CallSite(II), IFI); } @@ -75,7 +72,7 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, II->setAttributes(CI->getAttributes()); // Make sure that anything using the call now uses the invoke! This also - // updates the CallGraph if present. + // updates the CallGraph if present, because it uses a WeakVH. 
CI->replaceAllUsesWith(II); // Delete the unconditional branch inserted by splitBasicBlock @@ -173,7 +170,8 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, static void UpdateCallGraphAfterInlining(CallSite CS, Function::iterator FirstNewBlock, DenseMap<const Value*, Value*> &ValueMap, - CallGraph &CG) { + InlineFunctionInfo &IFI) { + CallGraph &CG = *IFI.CG; const Function *Caller = CS.getInstruction()->getParent()->getParent(); const Function *Callee = CS.getCalledFunction(); CallGraphNode *CalleeNode = CG[Callee]; @@ -201,8 +199,27 @@ static void UpdateCallGraphAfterInlining(CallSite CS, // If the call was inlined, but then constant folded, there is no edge to // add. Check for this case. - if (Instruction *NewCall = dyn_cast<Instruction>(VMI->second)) - CallerNode->addCalledFunction(CallSite::get(NewCall), I->second); + Instruction *NewCall = dyn_cast<Instruction>(VMI->second); + if (NewCall == 0) continue; + + // Remember that this call site got inlined for the client of + // InlineFunction. + IFI.InlinedCalls.push_back(NewCall); + + // It's possible that inlining the callsite will cause it to go from an + // indirect to a direct call by resolving a function pointer. If this + // happens, set the callee of the new call site to a more precise + // destination. This can also happen if the call graph node of the caller + // was just unnecessarily imprecise. + if (I->second->getFunction() == 0) + if (Function *F = CallSite(NewCall).getCalledFunction()) { + // Indirect call site resolved to direct call. + CallerNode->addCalledFunction(CallSite::get(NewCall), CG[F]); + + continue; + } + + CallerNode->addCalledFunction(CallSite::get(NewCall), I->second); } // Update the call graph by deleting the edge from Callee to Caller. We must @@ -219,13 +236,15 @@ static void UpdateCallGraphAfterInlining(CallSite CS, // exists in the instruction stream. Similiarly this will inline a recursive // function by one level. // -bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, - SmallVectorImpl<AllocaInst*> *StaticAllocas) { +bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { Instruction *TheCall = CS.getInstruction(); LLVMContext &Context = TheCall->getContext(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); + // If IFI has any state in it, zap it before we fill it in. + IFI.reset(); + const Function *CalledFunc = CS.getCalledFunction(); if (CalledFunc == 0 || // Can't inline external function or indirect CalledFunc->isDeclaration() || // call, or call to a vararg function! @@ -292,7 +311,7 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, // Create the alloca. If we have TargetData, use nice alignment. unsigned Align = 1; - if (TD) Align = TD->getPrefTypeAlignment(AggTy); + if (IFI.TD) Align = IFI.TD->getPrefTypeAlignment(AggTy); Value *NewAlloca = new AllocaInst(AggTy, 0, Align, I->getName(), &*Caller->begin()->begin()); @@ -305,11 +324,11 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, Value *SrcCast = new BitCastInst(*AI, VoidPtrTy, "tmp", TheCall); Value *Size; - if (TD == 0) + if (IFI.TD == 0) Size = ConstantExpr::getSizeOf(AggTy); else Size = ConstantInt::get(Type::getInt64Ty(Context), - TD->getTypeStoreSize(AggTy)); + IFI.TD->getTypeStoreSize(AggTy)); // Always generate a memcpy of alignment 1 here because we don't know // the alignment of the src pointer. 
Other optimizations can infer @@ -323,7 +342,7 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, CallInst::Create(MemCpyFn, CallArgs, CallArgs+5, "", TheCall); // If we have a call graph, update it. - if (CG) { + if (CallGraph *CG = IFI.CG) { CallGraphNode *MemCpyCGN = CG->getOrInsertFunction(MemCpyFn); CallGraphNode *CallerNode = (*CG)[Caller]; CallerNode->addCalledFunction(TheMemCpy, MemCpyCGN); @@ -342,14 +361,14 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, // (which can happen, e.g., because an argument was constant), but we'll be // happy with whatever the cloner can do. CloneAndPruneFunctionInto(Caller, CalledFunc, ValueMap, Returns, ".i", - &InlinedFunctionInfo, TD, TheCall); + &InlinedFunctionInfo, IFI.TD, TheCall); // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; // Update the callgraph if requested. - if (CG) - UpdateCallGraphAfterInlining(CS, FirstNewBlock, ValueMap, *CG); + if (IFI.CG) + UpdateCallGraphAfterInlining(CS, FirstNewBlock, ValueMap, IFI); } // If there are any alloca instructions in the block that used to be the entry @@ -376,13 +395,13 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, // Keep track of the static allocas that we inline into the caller if the // StaticAllocas pointer is non-null. - if (StaticAllocas) StaticAllocas->push_back(AI); + IFI.StaticAllocas.push_back(AI); // Scan for the block of allocas that we can move over, and move them // all at once. while (isa<AllocaInst>(I) && isa<Constant>(cast<AllocaInst>(I)->getArraySize())) { - if (StaticAllocas) StaticAllocas->push_back(cast<AllocaInst>(I)); + IFI.StaticAllocas.push_back(cast<AllocaInst>(I)); ++I; } @@ -406,7 +425,7 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, // If we are preserving the callgraph, add edges to the stacksave/restore // functions for the calls we insert. CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0; - if (CG) { + if (CallGraph *CG = IFI.CG) { StackSaveCGN = CG->getOrInsertFunction(StackSave); StackRestoreCGN = CG->getOrInsertFunction(StackRestore); CallerNode = (*CG)[Caller]; @@ -415,13 +434,13 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, // Insert the llvm.stacksave. CallInst *SavedPtr = CallInst::Create(StackSave, "savedstack", FirstNewBlock->begin()); - if (CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN); + if (IFI.CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN); // Insert a call to llvm.stackrestore before any return instructions in the // inlined function. for (unsigned i = 0, e = Returns.size(); i != e; ++i) { CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", Returns[i]); - if (CG) CallerNode->addCalledFunction(CI, StackRestoreCGN); + if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN); } // Count the number of StackRestore calls we insert. 
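The hunks above, together with the BasicInliner and SimplifyHalfPowrLibCalls changes earlier in this patch, move callers from the old InlineFunction(CS, CG, TD, StaticAllocas) form to the InlineFunctionInfo bundle. A minimal caller under the new interface might look like the sketch below; the helper name and the NumAllocas counter are illustrative, and the two-argument constructor is assumed to take the call graph first, matching the IFI(0, TD) uses shown above:

    #include "llvm/Analysis/CallGraph.h"
    #include "llvm/Support/CallSite.h"
    #include "llvm/Target/TargetData.h"
    #include "llvm/Transforms/Utils/Cloning.h"
    using namespace llvm;

    static bool inlineAndCount(CallSite CS, CallGraph *CG, const TargetData *TD,
                               unsigned &NumAllocas) {
      InlineFunctionInfo IFI(CG, TD);           // either pointer may be null
      if (!InlineFunction(CS, IFI))
        return false;
      // Results now come back through IFI instead of extra out-parameters:
      // StaticAllocas holds the allocas hoisted into the caller's entry block,
      // InlinedCalls holds the call instructions cloned into the caller.
      NumAllocas += IFI.StaticAllocas.size();
      return true;
    }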
@@ -434,7 +453,7 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD, BB != E; ++BB) if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", UI); - if (CG) CallerNode->addCalledFunction(CI, StackRestoreCGN); + if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN); ++NumStackRestores; } } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index ac59b4d7b3e8..84fd1eb175df 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -183,8 +183,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) // For the first iteration of the loop, we should use the precloned values for // PHI nodes. Insert associations now. - typedef DenseMap<const Value*, Value*> ValueMapTy; - ValueMapTy LastValueMap; + typedef DenseMap<const Value*, Value*> ValueToValueMapTy; + ValueToValueMapTy LastValueMap; std::vector<PHINode*> OrigPHINode; for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { PHINode *PN = cast<PHINode>(I); @@ -205,7 +205,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(), E = LoopBlocks.end(); BB != E; ++BB) { - ValueMapTy ValueMap; + ValueToValueMapTy ValueMap; BasicBlock *New = CloneBasicBlock(*BB, ValueMap, "." + Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); @@ -224,7 +224,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) // Update our running map of newest clones LastValueMap[*BB] = New; - for (ValueMapTy::iterator VI = ValueMap.begin(), VE = ValueMap.end(); + for (ValueToValueMapTy::iterator VI = ValueMap.begin(), VE = ValueMap.end(); VI != VE; ++VI) LastValueMap[VI->first] = VI->second; diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index bbbcc1adae74..0ed8c7227b07 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -70,15 +70,18 @@ namespace { // Used for expensive EH support. const Type *JBLinkTy; GlobalVariable *JBListHead; - Constant *SetJmpFn, *LongJmpFn; + Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn; + bool useExpensiveEHSupport; // We peek in TLI to grab the target's jmp_buf size and alignment const TargetLowering *TLI; public: static char ID; // Pass identification, replacement for typeid - explicit LowerInvoke(const TargetLowering *tli = NULL) - : FunctionPass(&ID), TLI(tli) { } + explicit LowerInvoke(const TargetLowering *tli = NULL, + bool useExpensiveEHSupport = ExpensiveEHSupport) + : FunctionPass(&ID), useExpensiveEHSupport(useExpensiveEHSupport), + TLI(tli) { } bool doInitialization(Module &M); bool runOnFunction(Function &F); @@ -94,7 +97,8 @@ namespace { bool insertCheapEHSupport(Function &F); void splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes); void rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, - AllocaInst *InvokeNum, SwitchInst *CatchSwitch); + AllocaInst *InvokeNum, AllocaInst *StackPtr, + SwitchInst *CatchSwitch); bool insertExpensiveEHSupport(Function &F); }; } @@ -107,7 +111,11 @@ const PassInfo *const llvm::LowerInvokePassID = &X; // Public Interface To the LowerInvoke pass. 
FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) { - return new LowerInvoke(TLI); + return new LowerInvoke(TLI, ExpensiveEHSupport); +} +FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI, + bool useExpensiveEHSupport) { + return new LowerInvoke(TLI, useExpensiveEHSupport); } // doInitialization - Make sure that there is a prototype for abort in the @@ -116,7 +124,7 @@ bool LowerInvoke::doInitialization(Module &M) { const Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); AbortMessage = 0; - if (ExpensiveEHSupport) { + if (useExpensiveEHSupport) { // Insert a type for the linked list of jump buffers. unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0; JBSize = JBSize ? JBSize : 200; @@ -160,6 +168,8 @@ bool LowerInvoke::doInitialization(Module &M) { #endif LongJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::longjmp); + StackSaveFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); + StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); } // We need the 'write' and 'abort' functions for both models. @@ -175,7 +185,7 @@ bool LowerInvoke::doInitialization(Module &M) { } void LowerInvoke::createAbortMessage(Module *M) { - if (ExpensiveEHSupport) { + if (useExpensiveEHSupport) { // The abort message for expensive EH support tells the user that the // program 'unwound' without an 'invoke' instruction. Constant *Msg = @@ -229,7 +239,8 @@ bool LowerInvoke::insertCheapEHSupport(Function &F) { std::vector<Value*> CallArgs(II->op_begin(), II->op_end() - 3); // Insert a normal call instruction... CallInst *NewCall = CallInst::Create(II->getCalledValue(), - CallArgs.begin(), CallArgs.end(), "",II); + CallArgs.begin(), CallArgs.end(), + "",II); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); @@ -270,6 +281,7 @@ bool LowerInvoke::insertCheapEHSupport(Function &F) { /// specified invoke instruction with a call. void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, AllocaInst *InvokeNum, + AllocaInst *StackPtr, SwitchInst *CatchSwitch) { ConstantInt *InvokeNoC = ConstantInt::get(Type::getInt32Ty(II->getContext()), InvokeNo); @@ -288,12 +300,22 @@ void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, // Insert a store of the invoke num before the invoke and store zero into the // location afterward. new StoreInst(InvokeNoC, InvokeNum, true, II); // volatile + + // Insert a store of the stack ptr before the invoke, so we can restore it + // later in the exception case. + CallInst* StackSaveRet = CallInst::Create(StackSaveFn, "ssret", II); + new StoreInst(StackSaveRet, StackPtr, true, II); // volatile BasicBlock::iterator NI = II->getNormalDest()->getFirstNonPHI(); // nonvolatile. new StoreInst(Constant::getNullValue(Type::getInt32Ty(II->getContext())), InvokeNum, false, NI); + Instruction* StackPtrLoad = new LoadInst(StackPtr, "stackptr.restore", true, + II->getUnwindDest()->getFirstNonPHI() + ); + CallInst::Create(StackRestoreFn, StackPtrLoad, "")->insertAfter(StackPtrLoad); + // Add a switch case to our unwind block. CatchSwitch->addCase(InvokeNoC, II->getUnwindDest()); @@ -500,6 +522,12 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { BasicBlock *CatchBB = BasicBlock::Create(F.getContext(), "setjmp.catch", &F); + // Create an alloca which keeps track of the stack pointer before every + // invoke, this allows us to properly restore the stack pointer after + // long jumping. 
+ AllocaInst *StackPtr = new AllocaInst(Type::getInt8PtrTy(F.getContext()), 0, + "stackptr", EntryBB->begin()); + // Create an alloca which keeps track of which invoke is currently // executing. For normal calls it contains zero. AllocaInst *InvokeNum = new AllocaInst(Type::getInt32Ty(F.getContext()), 0, @@ -546,7 +574,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { // At this point, we are all set up, rewrite each invoke instruction. for (unsigned i = 0, e = Invokes.size(); i != e; ++i) - rewriteExpensiveInvoke(Invokes[i], i+1, InvokeNum, CatchSwitch); + rewriteExpensiveInvoke(Invokes[i], i+1, InvokeNum, StackPtr, CatchSwitch); } // We know that there is at least one unwind. @@ -622,7 +650,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { } bool LowerInvoke::runOnFunction(Function &F) { - if (ExpensiveEHSupport) + if (useExpensiveEHSupport) return insertExpensiveEHSupport(F); else return insertCheapEHSupport(F); diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index a31235a1f5cb..25d50dbf2a8b 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -11,34 +11,52 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "ssaupdater" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Instructions.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/AlignOf.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -typedef DenseMap<BasicBlock*, TrackingVH<Value> > AvailableValsTy; -typedef std::vector<std::pair<BasicBlock*, TrackingVH<Value> > > - IncomingPredInfoTy; - +/// BBInfo - Per-basic block information used internally by SSAUpdater. +/// The predecessors of each block are cached here since pred_iterator is +/// slow and we need to iterate over the blocks at least a few times. +class SSAUpdater::BBInfo { +public: + BasicBlock *BB; // Back-pointer to the corresponding block. + Value *AvailableVal; // Value to use in this block. + BBInfo *DefBB; // Block that defines the available value. + int BlkNum; // Postorder number. + BBInfo *IDom; // Immediate dominator. + unsigned NumPreds; // Number of predecessor blocks. + BBInfo **Preds; // Array[NumPreds] of predecessor blocks. + PHINode *PHITag; // Marker for existing PHIs that match. + + BBInfo(BasicBlock *ThisBB, Value *V) + : BB(ThisBB), AvailableVal(V), DefBB(V ? 
this : 0), BlkNum(0), IDom(0), + NumPreds(0), Preds(0), PHITag(0) { } +}; + +typedef DenseMap<BasicBlock*, SSAUpdater::BBInfo*> BBMapTy; + +typedef DenseMap<BasicBlock*, Value*> AvailableValsTy; static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast<AvailableValsTy*>(AV); } -static IncomingPredInfoTy &getIncomingPredInfo(void *IPI) { - return *static_cast<IncomingPredInfoTy*>(IPI); +static BBMapTy *getBBMap(void *BM) { + return static_cast<BBMapTy*>(BM); } - SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI) - : AV(0), PrototypeValue(0), IPI(0), InsertedPHIs(NewPHI) {} + : AV(0), PrototypeValue(0), BM(0), InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete &getAvailableVals(AV); - delete &getIncomingPredInfo(IPI); } /// Initialize - Reset this object to get ready for a new set of SSA @@ -48,11 +66,6 @@ void SSAUpdater::Initialize(Value *ProtoValue) { AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); - - if (IPI == 0) - IPI = new IncomingPredInfoTy(); - else - getIncomingPredInfo(IPI).clear(); PrototypeValue = ProtoValue; } @@ -73,7 +86,7 @@ void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { /// IsEquivalentPHI - Check if PHI has the same incoming value as specified /// in ValueMapping for each predecessor block. -static bool IsEquivalentPHI(PHINode *PHI, +static bool IsEquivalentPHI(PHINode *PHI, DenseMap<BasicBlock*, Value*> &ValueMapping) { unsigned PHINumValues = PHI->getNumIncomingValues(); if (PHINumValues != ValueMapping.size()) @@ -89,38 +102,12 @@ static bool IsEquivalentPHI(PHINode *PHI, return true; } -/// GetExistingPHI - Check if BB already contains a phi node that is equivalent -/// to the specified mapping from predecessor blocks to incoming values. -static Value *GetExistingPHI(BasicBlock *BB, - DenseMap<BasicBlock*, Value*> &ValueMapping) { - PHINode *SomePHI; - for (BasicBlock::iterator It = BB->begin(); - (SomePHI = dyn_cast<PHINode>(It)); ++It) { - if (IsEquivalentPHI(SomePHI, ValueMapping)) - return SomePHI; - } - return 0; -} - -/// GetExistingPHI - Check if BB already contains an equivalent phi node. -/// The InputIt type must be an iterator over std::pair<BasicBlock*, Value*> -/// objects that specify the mapping from predecessor blocks to incoming values. -template<typename InputIt> -static Value *GetExistingPHI(BasicBlock *BB, const InputIt &I, - const InputIt &E) { - // Avoid create the mapping if BB has no phi nodes at all. - if (!isa<PHINode>(BB->begin())) - return 0; - DenseMap<BasicBlock*, Value*> ValueMapping(I, E); - return GetExistingPHI(BB, ValueMapping); -} - /// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is /// live at the end of the specified block. Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) { - assert(getIncomingPredInfo(IPI).empty() && "Unexpected Internal State"); + assert(BM == 0 && "Unexpected Internal State"); Value *Res = GetValueAtEndOfBlockInternal(BB); - assert(getIncomingPredInfo(IPI).empty() && "Unexpected Internal State"); + assert(BM == 0 && "Unexpected Internal State"); return Res; } @@ -146,7 +133,7 @@ Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) { Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // If there is no definition of the renamed variable in this block, just use // GetValueAtEndOfBlock to do our work. - if (!getAvailableVals(AV).count(BB)) + if (!HasValueForBlock(BB)) return GetValueAtEndOfBlock(BB); // Otherwise, we have the hard case. 
Get the live-in values for each @@ -193,10 +180,18 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { if (SingularValue != 0) return SingularValue; - // Otherwise, we do need a PHI. - if (Value *ExistingPHI = GetExistingPHI(BB, PredValues.begin(), - PredValues.end())) - return ExistingPHI; + // Otherwise, we do need a PHI: check to see if we already have one available + // in this block that produces the right value. + if (isa<PHINode>(BB->begin())) { + DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(), + PredValues.end()); + PHINode *SomePHI; + for (BasicBlock::iterator It = BB->begin(); + (SomePHI = dyn_cast<PHINode>(It)); ++It) { + if (IsEquivalentPHI(SomePHI, ValueMapping)) + return SomePHI; + } + } // Ok, we have no way out, insert a new one now. PHINode *InsertedPHI = PHINode::Create(PrototypeValue->getType(), @@ -226,7 +221,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { /// which use their value in the corresponding predecessor. void SSAUpdater::RewriteUse(Use &U) { Instruction *User = cast<Instruction>(U.getUser()); - + Value *V; if (PHINode *UserPN = dyn_cast<PHINode>(User)) V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U)); @@ -236,161 +231,427 @@ void SSAUpdater::RewriteUse(Use &U) { U.set(V); } - /// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry /// for the specified BB and if so, return it. If not, construct SSA form by -/// walking predecessors inserting PHI nodes as needed until we get to a block -/// where the value is available. -/// +/// first calculating the required placement of PHIs and then inserting new +/// PHIs where needed. Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { AvailableValsTy &AvailableVals = getAvailableVals(AV); + if (Value *V = AvailableVals[BB]) + return V; + + // Pool allocation used internally by GetValueAtEndOfBlock. + BumpPtrAllocator Allocator; + BBMapTy BBMapObj; + BM = &BBMapObj; + + SmallVector<BBInfo*, 100> BlockList; + BuildBlockList(BB, &BlockList, &Allocator); - // Query AvailableVals by doing an insertion of null. - std::pair<AvailableValsTy::iterator, bool> InsertRes = - AvailableVals.insert(std::make_pair(BB, TrackingVH<Value>())); - - // Handle the case when the insertion fails because we have already seen BB. - if (!InsertRes.second) { - // If the insertion failed, there are two cases. The first case is that the - // value is already available for the specified block. If we get this, just - // return the value. - if (InsertRes.first->second != 0) - return InsertRes.first->second; - - // Otherwise, if the value we find is null, then this is the value is not - // known but it is being computed elsewhere in our recursion. This means - // that we have a cycle. Handle this by inserting a PHI node and returning - // it. When we get back to the first instance of the recursion we will fill - // in the PHI node. - return InsertRes.first->second = - PHINode::Create(PrototypeValue->getType(), PrototypeValue->getName(), - &BB->front()); + // Special case: bail out if BB is unreachable. + if (BlockList.size() == 0) { + BM = 0; + return UndefValue::get(PrototypeValue->getType()); } - // Okay, the value isn't in the map and we just inserted a null in the entry - // to indicate that we're processing the block. Since we have no idea what - // value is in this block, we have to recurse through our predecessors. - // - // While we're walking our predecessors, we keep track of them in a vector, - // then insert a PHI node in the end if we actually need one. 
We could use a - // smallvector here, but that would take a lot of stack space for every level - // of the recursion, just use IncomingPredInfo as an explicit stack. - IncomingPredInfoTy &IncomingPredInfo = getIncomingPredInfo(IPI); - unsigned FirstPredInfoEntry = IncomingPredInfo.size(); - - // As we're walking the predecessors, keep track of whether they are all - // producing the same value. If so, this value will capture it, if not, it - // will get reset to null. We distinguish the no-predecessor case explicitly - // below. - TrackingVH<Value> ExistingValue; + FindDominators(&BlockList); + FindPHIPlacement(&BlockList); + FindAvailableVals(&BlockList); - // We can get our predecessor info by walking the pred_iterator list, but it - // is relatively slow. If we already have PHI nodes in this block, walk one - // of them to get the predecessor list instead. + BM = 0; + return BBMapObj[BB]->DefBB->AvailableVal; +} + +/// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds +/// vector, set Info->NumPreds, and allocate space in Info->Preds. +static void FindPredecessorBlocks(SSAUpdater::BBInfo *Info, + SmallVectorImpl<BasicBlock*> *Preds, + BumpPtrAllocator *Allocator) { + // We can get our predecessor info by walking the pred_iterator list, + // but it is relatively slow. If we already have PHI nodes in this + // block, walk one of them to get the predecessor list instead. + BasicBlock *BB = Info->BB; if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) { - for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) { - BasicBlock *PredBB = SomePhi->getIncomingBlock(i); - Value *PredVal = GetValueAtEndOfBlockInternal(PredBB); - IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal)); + for (unsigned PI = 0, E = SomePhi->getNumIncomingValues(); PI != E; ++PI) + Preds->push_back(SomePhi->getIncomingBlock(PI)); + } else { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + Preds->push_back(*PI); + } - // Set ExistingValue to singular value from all predecessors so far. - if (i == 0) - ExistingValue = PredVal; - else if (PredVal != ExistingValue) - ExistingValue = 0; + Info->NumPreds = Preds->size(); + Info->Preds = static_cast<SSAUpdater::BBInfo**> + (Allocator->Allocate(Info->NumPreds * sizeof(SSAUpdater::BBInfo*), + AlignOf<SSAUpdater::BBInfo*>::Alignment)); +} + +/// BuildBlockList - Starting from the specified basic block, traverse back +/// through its predecessors until reaching blocks with known values. Create +/// BBInfo structures for the blocks and append them to the block list. +void SSAUpdater::BuildBlockList(BasicBlock *BB, BlockListTy *BlockList, + BumpPtrAllocator *Allocator) { + AvailableValsTy &AvailableVals = getAvailableVals(AV); + BBMapTy *BBMap = getBBMap(BM); + SmallVector<BBInfo*, 10> RootList; + SmallVector<BBInfo*, 64> WorkList; + + BBInfo *Info = new (*Allocator) BBInfo(BB, 0); + (*BBMap)[BB] = Info; + WorkList.push_back(Info); + + // Search backward from BB, creating BBInfos along the way and stopping when + // reaching blocks that define the value. Record those defining blocks on + // the RootList. + SmallVector<BasicBlock*, 10> Preds; + while (!WorkList.empty()) { + Info = WorkList.pop_back_val(); + Preds.clear(); + FindPredecessorBlocks(Info, &Preds, Allocator); + + // Treat an unreachable predecessor as a definition with 'undef'. 
+ if (Info->NumPreds == 0) { + Info->AvailableVal = UndefValue::get(PrototypeValue->getType()); + Info->DefBB = Info; + RootList.push_back(Info); + continue; } - } else { - bool isFirstPred = true; - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *PredBB = *PI; - Value *PredVal = GetValueAtEndOfBlockInternal(PredBB); - IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal)); - // Set ExistingValue to singular value from all predecessors so far. - if (isFirstPred) { - ExistingValue = PredVal; - isFirstPred = false; - } else if (PredVal != ExistingValue) - ExistingValue = 0; + for (unsigned p = 0; p != Info->NumPreds; ++p) { + BasicBlock *Pred = Preds[p]; + // Check if BBMap already has a BBInfo for the predecessor block. + BBMapTy::value_type &BBMapBucket = BBMap->FindAndConstruct(Pred); + if (BBMapBucket.second) { + Info->Preds[p] = BBMapBucket.second; + continue; + } + + // Create a new BBInfo for the predecessor. + Value *PredVal = AvailableVals.lookup(Pred); + BBInfo *PredInfo = new (*Allocator) BBInfo(Pred, PredVal); + BBMapBucket.second = PredInfo; + Info->Preds[p] = PredInfo; + + if (PredInfo->AvailableVal) { + RootList.push_back(PredInfo); + continue; + } + WorkList.push_back(PredInfo); + } + } + + // Now that we know what blocks are backwards-reachable from the starting + // block, do a forward depth-first traversal to assign postorder numbers + // to those blocks. + BBInfo *PseudoEntry = new (*Allocator) BBInfo(0, 0); + unsigned BlkNum = 1; + + // Initialize the worklist with the roots from the backward traversal. + while (!RootList.empty()) { + Info = RootList.pop_back_val(); + Info->IDom = PseudoEntry; + Info->BlkNum = -1; + WorkList.push_back(Info); + } + + while (!WorkList.empty()) { + Info = WorkList.back(); + + if (Info->BlkNum == -2) { + // All the successors have been handled; assign the postorder number. + Info->BlkNum = BlkNum++; + // If not a root, put it on the BlockList. + if (!Info->AvailableVal) + BlockList->push_back(Info); + WorkList.pop_back(); + continue; + } + + // Leave this entry on the worklist, but set its BlkNum to mark that its + // successors have been put on the worklist. When it returns to the top + // the list, after handling its successors, it will be assigned a number. + Info->BlkNum = -2; + + // Add unvisited successors to the work list. + for (succ_iterator SI = succ_begin(Info->BB), E = succ_end(Info->BB); + SI != E; ++SI) { + BBInfo *SuccInfo = (*BBMap)[*SI]; + if (!SuccInfo || SuccInfo->BlkNum) + continue; + SuccInfo->BlkNum = -1; + WorkList.push_back(SuccInfo); } } + PseudoEntry->BlkNum = BlkNum; +} - // If there are no predecessors, then we must have found an unreachable block - // just return 'undef'. Since there are no predecessors, InsertRes must not - // be invalidated. - if (IncomingPredInfo.size() == FirstPredInfoEntry) - return InsertRes.first->second = UndefValue::get(PrototypeValue->getType()); - - /// Look up BB's entry in AvailableVals. 'InsertRes' may be invalidated. If - /// this block is involved in a loop, a no-entry PHI node will have been - /// inserted as InsertedVal. Otherwise, we'll still have the null we inserted - /// above. - TrackingVH<Value> &InsertedVal = AvailableVals[BB]; - - // If the predecessor values are not all the same, then check to see if there - // is an existing PHI that can be used. 
- if (!ExistingValue) - ExistingValue = GetExistingPHI(BB, - IncomingPredInfo.begin()+FirstPredInfoEntry, - IncomingPredInfo.end()); - - // If there is an existing value we can use, then we don't need to insert a - // PHI. This is the simple and common case. - if (ExistingValue) { - // If a PHI node got inserted, replace it with the existing value and delete - // it. - if (InsertedVal) { - PHINode *OldVal = cast<PHINode>(InsertedVal); - // Be careful about dead loops. These RAUW's also update InsertedVal. - if (InsertedVal != ExistingValue) - OldVal->replaceAllUsesWith(ExistingValue); - else - OldVal->replaceAllUsesWith(UndefValue::get(InsertedVal->getType())); - OldVal->eraseFromParent(); - } else { - InsertedVal = ExistingValue; +/// IntersectDominators - This is the dataflow lattice "meet" operation for +/// finding dominators. Given two basic blocks, it walks up the dominator +/// tree until it finds a common dominator of both. It uses the postorder +/// number of the blocks to determine how to do that. +static SSAUpdater::BBInfo *IntersectDominators(SSAUpdater::BBInfo *Blk1, + SSAUpdater::BBInfo *Blk2) { + while (Blk1 != Blk2) { + while (Blk1->BlkNum < Blk2->BlkNum) { + Blk1 = Blk1->IDom; + if (!Blk1) + return Blk2; } + while (Blk2->BlkNum < Blk1->BlkNum) { + Blk2 = Blk2->IDom; + if (!Blk2) + return Blk1; + } + } + return Blk1; +} - // Either path through the 'if' should have set InsertedVal -> ExistingVal. - assert((InsertedVal == ExistingValue || isa<UndefValue>(InsertedVal)) && - "RAUW didn't change InsertedVal to be ExistingValue"); +/// FindDominators - Calculate the dominator tree for the subset of the CFG +/// corresponding to the basic blocks on the BlockList. This uses the +/// algorithm from: "A Simple, Fast Dominance Algorithm" by Cooper, Harvey and +/// Kennedy, published in Software--Practice and Experience, 2001, 4:1-10. +/// Because the CFG subset does not include any edges leading into blocks that +/// define the value, the results are not the usual dominator tree. The CFG +/// subset has a single pseudo-entry node with edges to a set of root nodes +/// for blocks that define the value. The dominators for this subset CFG are +/// not the standard dominators but they are adequate for placing PHIs within +/// the subset CFG. +void SSAUpdater::FindDominators(BlockListTy *BlockList) { + bool Changed; + do { + Changed = false; + // Iterate over the list in reverse order, i.e., forward on CFG edges. + for (BlockListTy::reverse_iterator I = BlockList->rbegin(), + E = BlockList->rend(); I != E; ++I) { + BBInfo *Info = *I; + + // Start with the first predecessor. + assert(Info->NumPreds > 0 && "unreachable block"); + BBInfo *NewIDom = Info->Preds[0]; + + // Iterate through the block's other predecessors. + for (unsigned p = 1; p != Info->NumPreds; ++p) { + BBInfo *Pred = Info->Preds[p]; + NewIDom = IntersectDominators(NewIDom, Pred); + } + + // Check if the IDom value has changed. + if (NewIDom != Info->IDom) { + Info->IDom = NewIDom; + Changed = true; + } + } + } while (Changed); +} - // Drop the entries we added in IncomingPredInfo to restore the stack. - IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry, - IncomingPredInfo.end()); - return ExistingValue; +/// IsDefInDomFrontier - Search up the dominator tree from Pred to IDom for +/// any blocks containing definitions of the value. If one is found, then the +/// successor of Pred is in the dominance frontier for the definition, and +/// this function returns true. 
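The IntersectDominators meet above is the standard two-finger walk from the Cooper-Harvey-Kennedy dominance paper, driven by the postorder numbers that BuildBlockList assigned. A stripped-down sketch of the same operation, with illustrative types that are not part of the patch:

    struct Node {
      int   PostNum;  // postorder number; higher means closer to the entry
      Node *IDom;     // current immediate-dominator guess, 0 if not yet known
    };

    static Node *Intersect(Node *A, Node *B) {
      while (A != B) {
        while (A->PostNum < B->PostNum) {   // climb the lower-numbered side
          A = A->IDom;
          if (!A) return B;                 // no guess yet: fall back to B
        }
        while (B->PostNum < A->PostNum) {
          B = B->IDom;
          if (!B) return A;
        }
      }
      return A;                             // nearest common dominator
    }

Because the traversal is rooted at a pseudo-entry whose successors are the blocks that already define the value, FindDominators produces dominators only for that CFG subset, which is all FindPHIPlacement needs to compute the iterated dominance frontier.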
+static bool IsDefInDomFrontier(const SSAUpdater::BBInfo *Pred, + const SSAUpdater::BBInfo *IDom) { + for (; Pred != IDom; Pred = Pred->IDom) { + if (Pred->DefBB == Pred) + return true; } + return false; +} - // Otherwise, we do need a PHI: insert one now if we don't already have one. - if (InsertedVal == 0) - InsertedVal = PHINode::Create(PrototypeValue->getType(), - PrototypeValue->getName(), &BB->front()); +/// FindPHIPlacement - PHIs are needed in the iterated dominance frontiers of +/// the known definitions. Iteratively add PHIs in the dom frontiers until +/// nothing changes. Along the way, keep track of the nearest dominating +/// definitions for non-PHI blocks. +void SSAUpdater::FindPHIPlacement(BlockListTy *BlockList) { + bool Changed; + do { + Changed = false; + // Iterate over the list in reverse order, i.e., forward on CFG edges. + for (BlockListTy::reverse_iterator I = BlockList->rbegin(), + E = BlockList->rend(); I != E; ++I) { + BBInfo *Info = *I; + + // If this block already needs a PHI, there is nothing to do here. + if (Info->DefBB == Info) + continue; + + // Default to use the same def as the immediate dominator. + BBInfo *NewDefBB = Info->IDom->DefBB; + for (unsigned p = 0; p != Info->NumPreds; ++p) { + if (IsDefInDomFrontier(Info->Preds[p], Info->IDom)) { + // Need a PHI here. + NewDefBB = Info; + break; + } + } + + // Check if anything changed. + if (NewDefBB != Info->DefBB) { + Info->DefBB = NewDefBB; + Changed = true; + } + } + } while (Changed); +} - PHINode *InsertedPHI = cast<PHINode>(InsertedVal); - InsertedPHI->reserveOperandSpace(IncomingPredInfo.size()-FirstPredInfoEntry); +/// FindAvailableVal - If this block requires a PHI, first check if an existing +/// PHI matches the PHI placement and reaching definitions computed earlier, +/// and if not, create a new PHI. Visit all the block's predecessors to +/// calculate the available value for each one and fill in the incoming values +/// for a new PHI. +void SSAUpdater::FindAvailableVals(BlockListTy *BlockList) { + AvailableValsTy &AvailableVals = getAvailableVals(AV); - // Fill in all the predecessors of the PHI. - for (IncomingPredInfoTy::iterator I = - IncomingPredInfo.begin()+FirstPredInfoEntry, - E = IncomingPredInfo.end(); I != E; ++I) - InsertedPHI->addIncoming(I->second, I->first); + // Go through the worklist in forward order (i.e., backward through the CFG) + // and check if existing PHIs can be used. If not, create empty PHIs where + // they are needed. + for (BlockListTy::iterator I = BlockList->begin(), E = BlockList->end(); + I != E; ++I) { + BBInfo *Info = *I; + // Check if there needs to be a PHI in BB. + if (Info->DefBB != Info) + continue; + + // Look for an existing PHI. + FindExistingPHI(Info->BB, BlockList); + if (Info->AvailableVal) + continue; + + PHINode *PHI = PHINode::Create(PrototypeValue->getType(), + PrototypeValue->getName(), + &Info->BB->front()); + PHI->reserveOperandSpace(Info->NumPreds); + Info->AvailableVal = PHI; + AvailableVals[Info->BB] = PHI; + } - // Drop the entries we added in IncomingPredInfo to restore the stack. - IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry, - IncomingPredInfo.end()); + // Now go back through the worklist in reverse order to fill in the arguments + // for any new PHIs added in the forward traversal. 
+ for (BlockListTy::reverse_iterator I = BlockList->rbegin(), + E = BlockList->rend(); I != E; ++I) { + BBInfo *Info = *I; + + if (Info->DefBB != Info) { + // Record the available value at join nodes to speed up subsequent + // uses of this SSAUpdater for the same value. + if (Info->NumPreds > 1) + AvailableVals[Info->BB] = Info->DefBB->AvailableVal; + continue; + } - // See if the PHI node can be merged to a single value. This can happen in - // loop cases when we get a PHI of itself and one other value. - if (Value *ConstVal = InsertedPHI->hasConstantValue()) { - InsertedPHI->replaceAllUsesWith(ConstVal); - InsertedPHI->eraseFromParent(); - InsertedVal = ConstVal; - } else { - DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n"); + // Check if this block contains a newly added PHI. + PHINode *PHI = dyn_cast<PHINode>(Info->AvailableVal); + if (!PHI || PHI->getNumIncomingValues() == Info->NumPreds) + continue; + + // Iterate through the block's predecessors. + for (unsigned p = 0; p != Info->NumPreds; ++p) { + BBInfo *PredInfo = Info->Preds[p]; + BasicBlock *Pred = PredInfo->BB; + // Skip to the nearest preceding definition. + if (PredInfo->DefBB != PredInfo) + PredInfo = PredInfo->DefBB; + PHI->addIncoming(PredInfo->AvailableVal, Pred); + } + + DEBUG(dbgs() << " Inserted PHI: " << *PHI << "\n"); // If the client wants to know about all new instructions, tell it. - if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI); + if (InsertedPHIs) InsertedPHIs->push_back(PHI); + } +} + +/// FindExistingPHI - Look through the PHI nodes in a block to see if any of +/// them match what is needed. +void SSAUpdater::FindExistingPHI(BasicBlock *BB, BlockListTy *BlockList) { + PHINode *SomePHI; + for (BasicBlock::iterator It = BB->begin(); + (SomePHI = dyn_cast<PHINode>(It)); ++It) { + if (CheckIfPHIMatches(SomePHI)) { + RecordMatchingPHI(SomePHI); + break; + } + // Match failed: clear all the PHITag values. + for (BlockListTy::iterator I = BlockList->begin(), E = BlockList->end(); + I != E; ++I) + (*I)->PHITag = 0; + } +} + +/// CheckIfPHIMatches - Check if a PHI node matches the placement and values +/// in the BBMap. +bool SSAUpdater::CheckIfPHIMatches(PHINode *PHI) { + BBMapTy *BBMap = getBBMap(BM); + SmallVector<PHINode*, 20> WorkList; + WorkList.push_back(PHI); + + // Mark that the block containing this PHI has been visited. + (*BBMap)[PHI->getParent()]->PHITag = PHI; + + while (!WorkList.empty()) { + PHI = WorkList.pop_back_val(); + + // Iterate through the PHI's incoming values. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + Value *IncomingVal = PHI->getIncomingValue(i); + BBInfo *PredInfo = (*BBMap)[PHI->getIncomingBlock(i)]; + // Skip to the nearest preceding definition. + if (PredInfo->DefBB != PredInfo) + PredInfo = PredInfo->DefBB; + + // Check if it matches the expected value. + if (PredInfo->AvailableVal) { + if (IncomingVal == PredInfo->AvailableVal) + continue; + return false; + } + + // Check if the value is a PHI in the correct block. + PHINode *IncomingPHIVal = dyn_cast<PHINode>(IncomingVal); + if (!IncomingPHIVal || IncomingPHIVal->getParent() != PredInfo->BB) + return false; + + // If this block has already been visited, check if this PHI matches. 
+ if (PredInfo->PHITag) { + if (IncomingPHIVal == PredInfo->PHITag) + continue; + return false; + } + PredInfo->PHITag = IncomingPHIVal; + + WorkList.push_back(IncomingPHIVal); + } } + return true; +} - return InsertedVal; +/// RecordMatchingPHI - For a PHI node that matches, record it and its input +/// PHIs in both the BBMap and the AvailableVals mapping. +void SSAUpdater::RecordMatchingPHI(PHINode *PHI) { + BBMapTy *BBMap = getBBMap(BM); + AvailableValsTy &AvailableVals = getAvailableVals(AV); + SmallVector<PHINode*, 20> WorkList; + WorkList.push_back(PHI); + + // Record this PHI. + BasicBlock *BB = PHI->getParent(); + AvailableVals[BB] = PHI; + (*BBMap)[BB]->AvailableVal = PHI; + + while (!WorkList.empty()) { + PHI = WorkList.pop_back_val(); + + // Iterate through the PHI's incoming values. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + PHINode *IncomingPHIVal = dyn_cast<PHINode>(PHI->getIncomingValue(i)); + if (!IncomingPHIVal) continue; + BB = IncomingPHIVal->getParent(); + BBInfo *Info = (*BBMap)[BB]; + if (!Info || Info->AvailableVal) + continue; + + // Record the PHI and add it to the worklist. + AvailableVals[BB] = IncomingPHIVal; + Info->AvailableVal = IncomingPHIVal; + WorkList.push_back(IncomingPHIVal); + } + } } diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 60450487cd78..87ce631ca626 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "ValueMapper.h" #include "llvm/Type.h" #include "llvm/Constants.h" #include "llvm/Function.h" @@ -20,7 +20,7 @@ #include "llvm/ADT/SmallVector.h" using namespace llvm; -Value *llvm::MapValue(const Value *V, ValueMapTy &VM) { +Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) { Value *&VMSlot = VM[V]; if (VMSlot) return VMSlot; // Does it exist in the map yet? @@ -127,7 +127,7 @@ Value *llvm::MapValue(const Value *V, ValueMapTy &VM) { /// RemapInstruction - Convert the instruction operands from referencing the /// current values into those specified by ValueMap. /// -void llvm::RemapInstruction(Instruction *I, ValueMapTy &ValueMap) { +void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &ValueMap) { for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { Value *V = MapValue(*op, ValueMap); assert(V && "Referenced value not in value map!"); diff --git a/lib/Transforms/Utils/ValueMapper.h b/lib/Transforms/Utils/ValueMapper.h new file mode 100644 index 000000000000..d61c24c6a685 --- /dev/null +++ b/lib/Transforms/Utils/ValueMapper.h @@ -0,0 +1,29 @@ +//===- ValueMapper.h - Interface shared by lib/Transforms/Utils -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the MapValue interface which is used by various parts of +// the Transforms/Utils library to implement cloning and linking facilities. 
+// +//===----------------------------------------------------------------------===// + +#ifndef VALUEMAPPER_H +#define VALUEMAPPER_H + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + class Value; + class Instruction; + typedef DenseMap<const Value *, Value *> ValueToValueMapTy; + + Value *MapValue(const Value *V, ValueToValueMapTy &VM); + void RemapInstruction(Instruction *I, ValueToValueMapTy &VM); +} // End llvm namespace + +#endif
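To close out the header move: MapValue and RemapInstruction are now private to lib/Transforms/Utils, and the cloning code includes them with a plain "ValueMapper.h". A minimal usage sketch against the interface above; the helper name is illustrative, not from the patch:

    #include "ValueMapper.h"
    #include "llvm/Instruction.h"
    using namespace llvm;

    // Clone one instruction and rewrite its operands through an old->new map.
    static Instruction *cloneAndRemap(Instruction *OldI, ValueToValueMapTy &VM) {
      Instruction *NewI = OldI->clone();
      VM[OldI] = NewI;               // later clones may refer back to OldI
      RemapInstruction(NewI, VM);    // asserts if a non-constant operand is unmapped
      return NewI;
    }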