author    | Dimitry Andric <dim@FreeBSD.org> | 2017-05-17 20:22:39 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-05-17 20:22:39 +0000
commit    | 7af96fb3afd6725a2824a0a5ca5dad34e5e0b056 (patch)
tree      | 6661ffbabf869009597684462f5a3df3beccc952 /lib
parent    | 6b3f41ed88e8e440e11a4fbf20b6600529f80049 (diff)
Vendor import of llvm trunk r303291 (tag: vendor/llvm/llvm-trunk-r303291)
Notes:
svn path=/vendor/llvm/dist/; revision=318414
svn path=/vendor/llvm/llvm-trunk-r303291/; revision=318415; tag=vendor/llvm/llvm-trunk-r303291
Diffstat (limited to 'lib')
66 files changed, 927 insertions, 912 deletions
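Most of the mechanical churn in the diff below replaces manual `find_first()`/`find_next()` loops over `BitVector`/`SmallBitVector` with the range-based `set_bits()` iterator introduced upstream. A minimal standalone sketch of the two equivalent idioms follows; it assumes an LLVM tree of this vintage and is an editorial illustration, not part of the imported diff:

```cpp
// Sketch: iterating the set bits of an llvm::BitVector, old vs. new style.
#include "llvm/ADT/BitVector.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void dumpSetBits(const BitVector &BV) {
  // Old idiom: signed index, explicit find_first()/find_next() loop.
  for (int i = BV.find_first(); i >= 0; i = BV.find_next(i))
    outs() << i << ' ';
  outs() << '\n';
  // New idiom: range-based for over the unsigned set-bit indices;
  // same visitation order, no sentinel comparison against -1.
  for (unsigned i : BV.set_bits())
    outs() << i << ' ';
  outs() << '\n';
}
```

Beyond avoiding the `int`/`unsigned` sentinel dance, the range form removes the repeated-`find_next` pattern that several of the rewritten call sites had duplicated in their loop headers.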
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp index a4672efeedd6..e4d58bf1b4eb 100644 --- a/lib/Analysis/DependenceAnalysis.cpp +++ b/lib/Analysis/DependenceAnalysis.cpp @@ -2984,7 +2984,7 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst, SmallVectorImpl<Constraint> &Constraints, bool &Consistent) { bool Result = false; - for (int LI = Loops.find_first(); LI >= 0; LI = Loops.find_next(LI)) { + for (unsigned LI : Loops.set_bits()) { DEBUG(dbgs() << "\t Constraint[" << LI << "] is"); DEBUG(Constraints[LI].dump(dbgs())); if (Constraints[LI].isDistance()) @@ -3266,7 +3266,7 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst, // For debugging purposes, dump a small bit vector to dbgs(). static void dumpSmallBitVector(SmallBitVector &BV) { dbgs() << "{"; - for (int VI = BV.find_first(); VI >= 0; VI = BV.find_next(VI)) { + for (unsigned VI : BV.set_bits()) { dbgs() << VI; if (BV.find_next(VI) >= 0) dbgs() << ' '; @@ -3506,7 +3506,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, NewConstraint.setAny(SE); // test separable subscripts - for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) { + for (unsigned SI : Separable.set_bits()) { DEBUG(dbgs() << "testing subscript " << SI); switch (Pair[SI].Classification) { case Subscript::ZIV: @@ -3545,14 +3545,14 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, SmallVector<Constraint, 4> Constraints(MaxLevels + 1); for (unsigned II = 0; II <= MaxLevels; ++II) Constraints[II].setAny(SE); - for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) { + for (unsigned SI : Coupled.set_bits()) { DEBUG(dbgs() << "testing subscript group " << SI << " { "); SmallBitVector Group(Pair[SI].Group); SmallBitVector Sivs(Pairs); SmallBitVector Mivs(Pairs); SmallBitVector ConstrainedLevels(MaxLevels + 1); SmallVector<Subscript *, 4> PairsInGroup; - for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) { + for (unsigned SJ : Group.set_bits()) { DEBUG(dbgs() << SJ << " "); if (Pair[SJ].Classification == Subscript::SIV) Sivs.set(SJ); @@ -3564,7 +3564,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, DEBUG(dbgs() << "}\n"); while (Sivs.any()) { bool Changed = false; - for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) { + for (unsigned SJ : Sivs.set_bits()) { DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n"); // SJ is an SIV subscript that's part of the current coupled group unsigned Level; @@ -3588,7 +3588,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, DEBUG(dbgs() << " propagating\n"); DEBUG(dbgs() << "\tMivs = "); DEBUG(dumpSmallBitVector(Mivs)); - for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) { + for (unsigned SJ : Mivs.set_bits()) { // SJ is an MIV subscript that's part of the current coupled group DEBUG(dbgs() << "\tSJ = " << SJ << "\n"); if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, @@ -3622,7 +3622,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, } // test & propagate remaining RDIVs - for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) { + for (unsigned SJ : Mivs.set_bits()) { if (Pair[SJ].Classification == Subscript::RDIV) { DEBUG(dbgs() << "RDIV test\n"); if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result)) @@ -3635,7 +3635,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, // test remaining MIVs // This code is temporary. 
// Better to somehow test all remaining subscripts simultaneously. - for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) { + for (unsigned SJ : Mivs.set_bits()) { if (Pair[SJ].Classification == Subscript::MIV) { DEBUG(dbgs() << "MIV test\n"); if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result)) @@ -3647,9 +3647,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, // update Result.DV from constraint vector DEBUG(dbgs() << " updating\n"); - for (int SJ = ConstrainedLevels.find_first(); SJ >= 0; - SJ = ConstrainedLevels.find_next(SJ)) { - if (SJ > (int)CommonLevels) + for (unsigned SJ : ConstrainedLevels.set_bits()) { + if (SJ > CommonLevels) break; updateDirection(Result.DV[SJ - 1], Constraints[SJ]); if (Result.DV[SJ - 1].Direction == Dependence::DVEntry::NONE) @@ -3859,7 +3858,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, NewConstraint.setAny(SE); // test separable subscripts - for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) { + for (unsigned SI : Separable.set_bits()) { switch (Pair[SI].Classification) { case Subscript::SIV: { unsigned Level; @@ -3886,12 +3885,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, SmallVector<Constraint, 4> Constraints(MaxLevels + 1); for (unsigned II = 0; II <= MaxLevels; ++II) Constraints[II].setAny(SE); - for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) { + for (unsigned SI : Coupled.set_bits()) { SmallBitVector Group(Pair[SI].Group); SmallBitVector Sivs(Pairs); SmallBitVector Mivs(Pairs); SmallBitVector ConstrainedLevels(MaxLevels + 1); - for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) { + for (unsigned SJ : Group.set_bits()) { if (Pair[SJ].Classification == Subscript::SIV) Sivs.set(SJ); else @@ -3899,7 +3898,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, } while (Sivs.any()) { bool Changed = false; - for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) { + for (unsigned SJ : Sivs.set_bits()) { // SJ is an SIV subscript that's part of the current coupled group unsigned Level; const SCEV *SplitIter = nullptr; @@ -3914,7 +3913,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, } if (Changed) { // propagate, possibly creating new SIVs and ZIVs - for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) { + for (unsigned SJ : Mivs.set_bits()) { // SJ is an MIV subscript that's part of the current coupled group if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Constraints, Result.Consistent)) { diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 44c14cb17c22..4702569126c6 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -669,21 +669,33 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { Threshold = MaxIfValid(Threshold, Params.HintThreshold); if (PSI) { BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr; - if (PSI->isHotCallSite(CS, CallerBFI)) { - DEBUG(dbgs() << "Hot callsite.\n"); - Threshold = Params.HotCallSiteThreshold.getValue(); - } else if (PSI->isFunctionEntryHot(&Callee)) { - DEBUG(dbgs() << "Hot callee.\n"); - // If callsite hotness can not be determined, we may still know - // that the callee is hot and treat it as a weaker hint for threshold - // increase. 
- Threshold = MaxIfValid(Threshold, Params.HintThreshold); - } else if (PSI->isColdCallSite(CS, CallerBFI)) { - DEBUG(dbgs() << "Cold callsite.\n"); - Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); - } else if (PSI->isFunctionEntryCold(&Callee)) { - DEBUG(dbgs() << "Cold callee.\n"); - Threshold = MinIfValid(Threshold, Params.ColdThreshold); + // FIXME: After switching to the new passmanager, simplify the logic below + // by checking only the callsite hotness/coldness. The check for CallerBFI + // exists only because we do not have BFI available with the old PM. + // + // Use callee's hotness information only if we have no way of determining + // callsite's hotness information. Callsite hotness can be determined if + // sample profile is used (which adds hotness metadata to calls) or if + // caller's BlockFrequencyInfo is available. + if (CallerBFI || PSI->hasSampleProfile()) { + if (PSI->isHotCallSite(CS, CallerBFI)) { + DEBUG(dbgs() << "Hot callsite.\n"); + Threshold = Params.HotCallSiteThreshold.getValue(); + } else if (PSI->isColdCallSite(CS, CallerBFI)) { + DEBUG(dbgs() << "Cold callsite.\n"); + Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); + } + } else { + if (PSI->isFunctionEntryHot(&Callee)) { + DEBUG(dbgs() << "Hot callee.\n"); + // If callsite hotness can not be determined, we may still know + // that the callee is hot and treat it as a weaker hint for threshold + // increase. + Threshold = MaxIfValid(Threshold, Params.HintThreshold); + } else if (PSI->isFunctionEntryCold(&Callee)) { + DEBUG(dbgs() << "Cold callee.\n"); + Threshold = MinIfValid(Threshold, Params.ColdThreshold); + } } } } diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 5728887cc1e9..5652248a60ce 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -1752,6 +1752,24 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, (A == Op0 || B == Op0)) return Op0; + // A mask that only clears known zeros of a shifted value is a no-op. + Value *X; + const APInt *Mask; + const APInt *ShAmt; + if (match(Op1, m_APInt(Mask))) { + // If all bits in the inverted and shifted mask are clear: + // and (shl X, ShAmt), Mask --> shl X, ShAmt + if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) && + (~(*Mask)).lshr(*ShAmt).isNullValue()) + return Op0; + + // If all bits in the inverted and shifted mask are clear: + // and (lshr X, ShAmt), Mask --> lshr X, ShAmt + if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) && + (~(*Mask)).shl(*ShAmt).isNullValue()) + return Op0; + } + // A & (-A) = A if A is a power of two or zero. if (match(Op0, m_Neg(m_Specific(Op1))) || match(Op1, m_Neg(m_Specific(Op0)))) { diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp index 502f4205b689..12b86daa602b 100644 --- a/lib/Analysis/ProfileSummaryInfo.cpp +++ b/lib/Analysis/ProfileSummaryInfo.cpp @@ -75,7 +75,7 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst, return None; assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) && "We can only get profile count for call/invoke instruction."); - if (computeSummary() && Summary->getKind() == ProfileSummary::PSK_Sample) { + if (hasSampleProfile()) { // In sample PGO mode, check if there is a profile metadata on the // instruction. If it is present, determine hotness solely based on that, // since the sampled entry count may not be accurate. 
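One of the less mechanical changes above is the new `SimplifyAndInst` fold: an `and` whose mask only clears bits that a preceding shift already zeroed is a no-op. The guard in the patch is `(~Mask).lshr(ShAmt).isNullValue()` for the `shl` case, i.e. every bit the mask clears lies below the shift amount. A small self-contained check of that condition, using hypothetical constants purely for illustration:

```cpp
// Sketch: why "and (shl X, ShAmt), Mask --> shl X, ShAmt" is sound when
// (~Mask) >> ShAmt == 0. Not LLVM code; plain C++ with made-up values.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const unsigned ShAmt = 4;
  const uint32_t Mask = 0xFFFFFFF0; // clears only the low 4 bits
  // Guard from the patch: all bits the mask clears sit below ShAmt,
  // and "shl X, 4" already forces those low bits to zero.
  assert((~Mask >> ShAmt) == 0);
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(((X << ShAmt) & Mask) == (X << ShAmt)); // the AND changes nothing
  return 0;
}
```

The `lshr` case in the patch is the mirror image: `(~Mask).shl(ShAmt)` must be zero, so the mask may only clear bits above the range the right shift already vacated.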
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 800354d2f5b4..a746ddfd7a63 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -629,19 +629,19 @@ static int CompareSCEVComplexity( const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS); const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS); - // If there is a dominance relationship between the loops, sort by the - // dominance. Otherwise, sort by depth. We require such order in getAddExpr. + // There is always a dominance between two recs that are used by one SCEV, + // so we can safely sort recs by loop header dominance. We require such + // order in getAddExpr. const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); if (LLoop != RLoop) { const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader(); assert(LHead != RHead && "Two loops share the same header?"); if (DT.dominates(LHead, RHead)) return 1; - else if (DT.dominates(RHead, LHead)) - return -1; - unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth(); - if (LDepth != RDepth) - return (int)LDepth - (int)RDepth; + else + assert(DT.dominates(RHead, LHead) && + "No dominance between recurrences used by one SCEV?"); + return -1; } // Addrec complexity grows with operand count. @@ -2512,22 +2512,23 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(), AddRec->op_end()); for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); - ++OtherIdx) - if (const auto *OtherAddRec = dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) - if (OtherAddRec->getLoop() == AddRecLoop) { - for (unsigned i = 0, e = OtherAddRec->getNumOperands(); - i != e; ++i) { - if (i >= AddRecOps.size()) { - AddRecOps.append(OtherAddRec->op_begin()+i, - OtherAddRec->op_end()); - break; - } - SmallVector<const SCEV *, 2> TwoOps = { - AddRecOps[i], OtherAddRec->getOperand(i)}; - AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1); + ++OtherIdx) { + const auto *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]); + if (OtherAddRec->getLoop() == AddRecLoop) { + for (unsigned i = 0, e = OtherAddRec->getNumOperands(); + i != e; ++i) { + if (i >= AddRecOps.size()) { + AddRecOps.append(OtherAddRec->op_begin()+i, + OtherAddRec->op_end()); + break; } - Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; + SmallVector<const SCEV *, 2> TwoOps = { + AddRecOps[i], OtherAddRec->getOperand(i)}; + AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1); } + Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; + } + } // Step size has changed, so we cannot guarantee no self-wraparound. 
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap); return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 3a57772cc7f5..43b245c66400 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -128,8 +128,7 @@ AggressiveAntiDepBreaker::AggressiveAntiDepBreaker( } DEBUG(dbgs() << "AntiDep Critical-Path Registers:"); - DEBUG(for (int r = CriticalPathSet.find_first(); r != -1; - r = CriticalPathSet.find_next(r)) + DEBUG(for (unsigned r : CriticalPathSet.set_bits()) dbgs() << " " << TRI->getName(r)); DEBUG(dbgs() << '\n'); } @@ -571,7 +570,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( DEBUG({ dbgs() << " ::"; - for (int r = BV.find_first(); r != -1; r = BV.find_next(r)) + for (unsigned r : BV.set_bits()) dbgs() << " " << TRI->getName(r); dbgs() << "\n"; }); diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 98163bffb60b..7d945690e9c3 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -501,7 +501,7 @@ void CodeViewDebug::emitTypeInformation() { Error E = Reader.readArray(Types, Reader.getLength()); if (!E) { TypeVisitorCallbacks C; - E = CVTypeVisitor(C).visitTypeStream(Types); + E = codeview::visitTypeStream(Types, C); } if (E) { logAllUnhandledErrors(std::move(E), errs(), "error: "); diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp index 22fd7bb46056..20e1467b30c3 100644 --- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp +++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp @@ -209,8 +209,7 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF, } else if (MO.isRegMask()) { // If this is a register mask operand, clobber all debug values in // non-CSRs. - for (int I = ChangingRegs.find_first(); I != -1; - I = ChangingRegs.find_next(I)) { + for (unsigned I : ChangingRegs.set_bits()) { // Don't consider SP to be clobbered by register masks. if (unsigned(I) != SP && TRI->isPhysicalRegister(I) && MO.clobbersPhysReg(I)) { diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 811858f136eb..77dfb13ac1f2 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1129,6 +1129,11 @@ void IRTranslator::finalizeFunction() { ValToVReg.clear(); FrameIndices.clear(); MachinePreds.clear(); + // MachineIRBuilder::DebugLoc can outlive the DILocation it holds. 
Clear it + // to avoid accessing free’d memory (in runOnMachineFunction) and to avoid + // destroying it twice (in ~IRTranslator() and ~LLVMContext()) + EntryBuilder = MachineIRBuilder(); + CurBuilder = MachineIRBuilder(); } bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index ab433273b189..b53b002f55a6 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -760,7 +760,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { const MachineFrameInfo &MFI = MF->getFrameInfo(); BitVector PR = MFI.getPristineRegs(*MF); - for (int I = PR.find_first(); I>0; I = PR.find_next(I)) { + for (unsigned I : PR.set_bits()) { for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) regsLive.insert(*SubRegs); diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 06500289c971..47d726f6da7a 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -285,8 +285,7 @@ class RAGreedy : public MachineFunctionPass, // Set B[i] = C for every live bundle where B[i] was NoCand. unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) { unsigned Count = 0; - for (int i = LiveBundles.find_first(); i >= 0; - i = LiveBundles.find_next(i)) + for (unsigned i : LiveBundles.set_bits()) if (B[i] == NoCand) { B[i] = C; Count++; @@ -1162,9 +1161,8 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) { } DEBUG({ - for (int i = Cand.LiveBundles.find_first(); i>=0; - i = Cand.LiveBundles.find_next(i)) - dbgs() << " EB#" << i; + for (int i : Cand.LiveBundles.set_bits()) + dbgs() << " EB#" << i; dbgs() << ".\n"; }); return true; @@ -1482,8 +1480,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, DEBUG({ dbgs() << ", total = "; MBFI->printBlockFreq(dbgs(), Cost) << " with bundles"; - for (int i = Cand.LiveBundles.find_first(); i>=0; - i = Cand.LiveBundles.find_next(i)) + for (int i : Cand.LiveBundles.set_bits()) dbgs() << " EB#" << i; dbgs() << ".\n"; }); diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index caf5cb497a71..0ccee175abfb 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13087,14 +13087,28 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } } - // If this is a store followed by a store with the same value to the same - // location, then the store is dead/noop. if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) { - if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() && - ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() && - ST1->isUnindexed() && !ST1->isVolatile()) { - // The store is dead, remove it. - return Chain; + if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && + !ST1->isVolatile() && ST1->getBasePtr() == Ptr && + ST->getMemoryVT() == ST1->getMemoryVT()) { + // If this is a store followed by a store with the same value to the same + // location, then the store is dead/noop. + if (ST1->getValue() == Value) { + // The store is dead, remove it. + return Chain; + } + + // If this is a store who's preceeding store to the same location + // and no one other node is chained to that store we can effectively + // drop the store. Do not remove stores to undef as they may be used as + // data sinks. 
+ if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() && + !ST1->getBasePtr().isUndef()) { + // ST1 is fully overwritten and can be elided. Combine with it's chain + // value. + CombineTo(ST1, ST1->getChain()); + return SDValue(); + } } } diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index f10c98ef4e50..43cbf4add0f8 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -310,7 +310,7 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) { bool SpillPlacement::scanActiveBundles() { RecentPositive.clear(); - for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) { + for (unsigned n : ActiveNodes->set_bits()) { update(n); // A node that must spill, or a node without any links is not going to // change its value ever again, so exclude it from iterations. @@ -365,7 +365,7 @@ SpillPlacement::finish() { // Write preferences back to ActiveNodes. bool Perfect = true; - for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) + for (unsigned n : ActiveNodes->set_bits()) if (!nodes[n].preferReg()) { ActiveNodes->reset(n); Perfect = false; diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index f51d959a089a..86a16187fcb6 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -703,12 +703,10 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) { // Create the interval of the blocks that we previously found to be 'alive'. BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB]; - for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1; - pos = MBBLiveness.LiveIn.find_next(pos)) { + for (unsigned pos : MBBLiveness.LiveIn.set_bits()) { Starts[pos] = Indexes->getMBBStartIdx(&MBB); } - for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1; - pos = MBBLiveness.LiveOut.find_next(pos)) { + for (unsigned pos : MBBLiveness.LiveOut.set_bits()) { Finishes[pos] = Indexes->getMBBEndIdx(&MBB); } diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 39aa946fa840..5f63fd4320bb 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -1312,7 +1312,7 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI, // Find the first legal register class with the largest spill size. const TargetRegisterClass *BestRC = RC; - for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { + for (unsigned i : SuperRegRC.set_bits()) { const TargetRegisterClass *SuperRC = TRI->getRegClass(i); // We want the largest possible spill size. if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC)) diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index e6c5d8753b83..9724cb074584 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -564,6 +564,14 @@ void TargetPassConfig::addISelPrepare() { addPass(createVerifierPass()); } +/// -regalloc=... command line option. +static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } +static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, + RegisterPassParser<RegisterRegAlloc> > +RegAlloc("regalloc", + cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use")); + /// Add the complete set of target-independent postISel code generator passes. /// /// This can be read as the standard order of major LLVM CodeGen stages. 
Stages @@ -625,8 +633,12 @@ void TargetPassConfig::addMachinePasses() { // including phi elimination and scheduling. if (getOptimizeRegAlloc()) addOptimizedRegAlloc(createRegAllocPass(true)); - else + else { + if (RegAlloc != &useDefaultRegisterAllocator && + RegAlloc != &createFastRegisterAllocator) + report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc."); addFastRegAlloc(createRegAllocPass(false)); + } // Run post-ra passes. addPostRegAlloc(); @@ -759,19 +771,12 @@ MachinePassRegistry RegisterRegAlloc::Registry; /// A dummy default pass factory indicates whether the register allocator is /// overridden on the command line. static llvm::once_flag InitializeDefaultRegisterAllocatorFlag; -static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } + static RegisterRegAlloc defaultRegAlloc("default", "pick register allocator based on -O option", useDefaultRegisterAllocator); -/// -regalloc=... command line option. -static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, - RegisterPassParser<RegisterRegAlloc> > -RegAlloc("regalloc", - cl::init(&useDefaultRegisterAllocator), - cl::desc("Register allocator to use")); - static void initializeDefaultRegisterAllocatorOnce() { RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault(); @@ -781,7 +786,6 @@ static void initializeDefaultRegisterAllocatorOnce() { } } - /// Instantiate the default register allocator pass for this target for either /// the optimized or unoptimized allocation path. This will be added to the pass /// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index f6e4c17d514c..41ec082a24cf 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -50,8 +50,7 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, ArrayRef<MCPhysReg> Exceptions) const { // Check that all super registers of reserved regs are reserved as well. 
BitVector Checked(getNumRegs()); - for (int Reg = RegisterSet.find_first(); Reg>=0; - Reg = RegisterSet.find_next(Reg)) { + for (unsigned Reg : RegisterSet.set_bits()) { if (Checked[Reg]) continue; for (MCSuperRegIterator SR(Reg, this); SR.isValid(); ++SR) { diff --git a/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/lib/DebugInfo/CodeView/CVTypeDumper.cpp index bcc8218d9446..02e1682f76e7 100644 --- a/lib/DebugInfo/CodeView/CVTypeDumper.cpp +++ b/lib/DebugInfo/CodeView/CVTypeDumper.cpp @@ -11,7 +11,6 @@ #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDatabase.h" #include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" -#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" #include "llvm/Support/BinaryByteStream.h" @@ -21,38 +20,23 @@ using namespace llvm::codeview; Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) { TypeDatabaseVisitor DBV(TypeDB); - TypeDeserializer Deserializer; TypeVisitorCallbackPipeline Pipeline; - Pipeline.addCallbackToPipeline(Deserializer); Pipeline.addCallbackToPipeline(DBV); Pipeline.addCallbackToPipeline(Dumper); - CVTypeVisitor Visitor(Pipeline); - if (Handler) - Visitor.addTypeServerHandler(*Handler); - CVType RecordCopy = Record; - if (auto EC = Visitor.visitTypeRecord(RecordCopy)) - return EC; - return Error::success(); + return codeview::visitTypeRecord(RecordCopy, Pipeline, VDS_BytesPresent, + Handler); } Error CVTypeDumper::dump(const CVTypeArray &Types, TypeVisitorCallbacks &Dumper) { TypeDatabaseVisitor DBV(TypeDB); - TypeDeserializer Deserializer; TypeVisitorCallbackPipeline Pipeline; - Pipeline.addCallbackToPipeline(Deserializer); Pipeline.addCallbackToPipeline(DBV); Pipeline.addCallbackToPipeline(Dumper); - CVTypeVisitor Visitor(Pipeline); - if (Handler) - Visitor.addTypeServerHandler(*Handler); - - if (auto EC = Visitor.visitTypeStream(Types)) - return EC; - return Error::success(); + return codeview::visitTypeStream(Types, Pipeline, Handler); } Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) { diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index b6ed0453d9c4..0f7f5f667790 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -59,13 +59,8 @@ static Expected<TypeServer2Record> deserializeTypeServerRecord(CVType &Record) { }; TypeServer2Record R(TypeRecordKind::TypeServer2); - TypeDeserializer Deserializer; StealTypeServerVisitor Thief(R); - TypeVisitorCallbackPipeline Pipeline; - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(Thief); - CVTypeVisitor Visitor(Pipeline); - if (auto EC = Visitor.visitTypeRecord(Record)) + if (auto EC = visitTypeRecord(Record, Thief)) return std::move(EC); return R; @@ -178,7 +173,7 @@ static Error visitMemberRecord(CVMemberRecord &Record, return Error::success(); } -Error CVTypeVisitor::visitMemberRecord(CVMemberRecord &Record) { +Error CVTypeVisitor::visitMemberRecord(CVMemberRecord Record) { return ::visitMemberRecord(Record, Callbacks); } @@ -224,3 +219,93 @@ Error CVTypeVisitor::visitFieldListMemberStream(ArrayRef<uint8_t> Data) { BinaryStreamReader SR(S); return visitFieldListMemberStream(SR); } + +namespace { +struct FieldListVisitHelper { + FieldListVisitHelper(TypeVisitorCallbacks &Callbacks, ArrayRef<uint8_t> Data, + VisitorDataSource Source) + : Stream(Data, 
llvm::support::little), Reader(Stream), + Deserializer(Reader), + Visitor((Source == VDS_BytesPresent) ? Pipeline : Callbacks) { + if (Source == VDS_BytesPresent) { + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(Callbacks); + } + } + + BinaryByteStream Stream; + BinaryStreamReader Reader; + FieldListDeserializer Deserializer; + TypeVisitorCallbackPipeline Pipeline; + CVTypeVisitor Visitor; +}; + +struct VisitHelper { + VisitHelper(TypeVisitorCallbacks &Callbacks, VisitorDataSource Source, + TypeServerHandler *TS) + : Visitor((Source == VDS_BytesPresent) ? Pipeline : Callbacks) { + if (TS) + Visitor.addTypeServerHandler(*TS); + if (Source == VDS_BytesPresent) { + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(Callbacks); + } + } + + TypeDeserializer Deserializer; + TypeVisitorCallbackPipeline Pipeline; + CVTypeVisitor Visitor; +}; +} + +Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index, + TypeVisitorCallbacks &Callbacks, + VisitorDataSource Source, + TypeServerHandler *TS) { + VisitHelper Helper(Callbacks, Source, TS); + return Helper.Visitor.visitTypeRecord(Record, Index); +} + +Error llvm::codeview::visitTypeRecord(CVType &Record, + TypeVisitorCallbacks &Callbacks, + VisitorDataSource Source, + TypeServerHandler *TS) { + VisitHelper Helper(Callbacks, Source, TS); + return Helper.Visitor.visitTypeRecord(Record); +} + +Error llvm::codeview::visitMemberRecordStream(ArrayRef<uint8_t> FieldList, + TypeVisitorCallbacks &Callbacks) { + CVTypeVisitor Visitor(Callbacks); + return Visitor.visitFieldListMemberStream(FieldList); +} + +Error llvm::codeview::visitMemberRecord(CVMemberRecord Record, + TypeVisitorCallbacks &Callbacks, + VisitorDataSource Source) { + FieldListVisitHelper Helper(Callbacks, Record.Data, Source); + return Helper.Visitor.visitMemberRecord(Record); +} + +Error llvm::codeview::visitMemberRecord(TypeLeafKind Kind, + ArrayRef<uint8_t> Record, + TypeVisitorCallbacks &Callbacks) { + CVMemberRecord R; + R.Data = Record; + R.Kind = Kind; + return visitMemberRecord(R, Callbacks, VDS_BytesPresent); +} + +Error llvm::codeview::visitTypeStream(const CVTypeArray &Types, + TypeVisitorCallbacks &Callbacks, + TypeServerHandler *TS) { + VisitHelper Helper(Callbacks, VDS_BytesPresent, TS); + return Helper.Visitor.visitTypeStream(Types); +} + +Error llvm::codeview::visitTypeStream(CVTypeRange Types, + TypeVisitorCallbacks &Callbacks, + TypeServerHandler *TS) { + VisitHelper Helper(Callbacks, VDS_BytesPresent, TS); + return Helper.Visitor.visitTypeStream(Types); +} diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp index 4cb9acbe07d9..704d1131108a 100644 --- a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDatabase.h" #include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" @@ -20,9 +21,7 @@ RandomAccessTypeVisitor::RandomAccessTypeVisitor( const CVTypeArray &Types, uint32_t NumRecords, PartialOffsetArray PartialOffsets) : Database(NumRecords), Types(Types), DatabaseVisitor(Database), - InternalVisitor(Pipeline), PartialOffsets(PartialOffsets) { - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(DatabaseVisitor); + 
PartialOffsets(PartialOffsets) { KnownOffsets.resize(Database.capacity()); } @@ -38,8 +37,7 @@ Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI, assert(Database.contains(TI)); auto &Record = Database.getTypeRecord(TI); - CVTypeVisitor V(Callbacks); - return V.visitTypeRecord(Record, TI); + return codeview::visitTypeRecord(Record, TI, Callbacks); } Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) { @@ -78,7 +76,7 @@ Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset, while (Begin != End) { assert(!Database.contains(Begin)); - if (auto EC = InternalVisitor.visitTypeRecord(*RI, Begin)) + if (auto EC = codeview::visitTypeRecord(*RI, Begin, DatabaseVisitor)) return EC; KnownOffsets[Begin.toArrayIndex()] = BeginOffset; diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 27a6e0987886..9485c9cfedff 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -216,8 +216,7 @@ Error TypeDumpVisitor::visitMemberEnd(CVMemberRecord &Record) { Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FieldListRecord &FieldList) { - CVTypeVisitor Visitor(*this); - if (auto EC = Visitor.visitFieldListMemberStream(FieldList.Data)) + if (auto EC = codeview::visitMemberRecordStream(FieldList.Data, *this)) return EC; return Error::success(); diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index aad20ae6dda1..51f24fa3f135 100644 --- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -361,8 +361,7 @@ Error TypeStreamMerger::visitKnownRecord(CVType &, FieldListRecord &R) { // Visit the members inside the field list. HadUntranslatedMember = false; FieldListBuilder.begin(); - CVTypeVisitor Visitor(*this); - if (auto EC = Visitor.visitFieldListMemberStream(R.Data)) + if (auto EC = codeview::visitMemberRecordStream(R.Data, *this)) return EC; // Write the record if we translated all field list members. 
@@ -440,18 +439,9 @@ Error TypeStreamMerger::visitUnknownType(CVType &Rec) { Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { assert(IndexMap.empty()); - TypeVisitorCallbackPipeline Pipeline; LastError = Error::success(); - TypeDeserializer Deserializer; - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(*this); - - CVTypeVisitor Visitor(Pipeline); - if (Handler) - Visitor.addTypeServerHandler(*Handler); - - if (auto EC = Visitor.visitTypeStream(Types)) + if (auto EC = codeview::visitTypeStream(Types, *this, Handler)) return EC; // If we found bad indices but no other errors, try doing another pass and see @@ -466,7 +456,8 @@ Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { IsSecondPass = true; NumBadIndices = 0; CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex); - if (auto EC = Visitor.visitTypeStream(Types)) + + if (auto EC = codeview::visitTypeStream(Types, *this, Handler)) return EC; assert(NumBadIndices <= BadIndicesRemaining && diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 59a060d143ff..61e75a2b56ab 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1086,49 +1086,32 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, continue; } + if (Section.relocation_begin() == Section.relocation_end()) + continue; + std::map<SymbolRef, uint64_t> AddrCache; - if (Section.relocation_begin() != Section.relocation_end()) { - uint64_t SectionSize = RelocatedSection->getSize(); - for (const RelocationRef &Reloc : Section.relocations()) { - // FIXME: it's not clear how to correctly handle scattered - // relocations. - if (isRelocScattered(Obj, Reloc)) - continue; + for (const RelocationRef &Reloc : Section.relocations()) { + // FIXME: it's not clear how to correctly handle scattered + // relocations. 
+ if (isRelocScattered(Obj, Reloc)) + continue; - Expected<uint64_t> SymAddrOrErr = - getSymbolAddress(Obj, Reloc, L, AddrCache); - if (!SymAddrOrErr) { - errs() << toString(SymAddrOrErr.takeError()) << '\n'; - continue; - } + Expected<uint64_t> SymAddrOrErr = + getSymbolAddress(Obj, Reloc, L, AddrCache); + if (!SymAddrOrErr) { + errs() << toString(SymAddrOrErr.takeError()) << '\n'; + continue; + } - object::RelocVisitor V(Obj); - object::RelocToApply R(V.visit(Reloc.getType(), Reloc, *SymAddrOrErr)); - if (V.error()) { - SmallString<32> Name; - Reloc.getTypeName(Name); - errs() << "error: failed to compute relocation: " - << Name << "\n"; - continue; - } - uint64_t Address = Reloc.getOffset(); - if (Address + R.Width > SectionSize) { - errs() << "error: " << R.Width << "-byte relocation starting " - << Address << " bytes into section " << name << " which is " - << SectionSize << " bytes long.\n"; - continue; - } - if (R.Width > 8) { - errs() << "error: can't handle a relocation of more than 8 bytes at " - "a time.\n"; - continue; - } - DEBUG(dbgs() << "Writing " << format("%p", R.Value) - << " at " << format("%p", Address) - << " with width " << format("%d", R.Width) - << "\n"); - Map->insert({Address, {(uint8_t)R.Width, R.Value}}); + object::RelocVisitor V(Obj); + object::RelocToApply R(V.visit(Reloc.getType(), Reloc, *SymAddrOrErr)); + if (V.error()) { + SmallString<32> Name; + Reloc.getTypeName(Name); + errs() << "error: failed to compute relocation: " << Name << "\n"; + continue; } + Map->insert({Reloc.getOffset(), {R.Value}}); } } } diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp index 629f3e80b0ed..cb783cf4fea7 100644 --- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp +++ b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp @@ -55,9 +55,8 @@ PDBTypeServerHandler::handleInternal(PDBFile &File, auto ExpectedTpi = File.getPDBTpiStream(); if (!ExpectedTpi) return ExpectedTpi.takeError(); - CVTypeVisitor Visitor(Callbacks); - if (auto EC = Visitor.visitTypeStream(ExpectedTpi->types(nullptr))) + if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks)) return std::move(EC); return true; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index e9a4b71c903d..ab86e5d6a0fd 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -705,7 +705,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj, unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL; unsigned PaddingSize = 0; unsigned StubBufSize = 0; - bool IsRequired = isRequiredForExecution(Section) || ProcessAllSections; + bool IsRequired = isRequiredForExecution(Section); bool IsVirtual = Section.isVirtual(); bool IsZeroInit = isZeroInit(Section); bool IsReadOnly = isReadOnlyData(Section); @@ -745,8 +745,8 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj, Alignment = std::max(Alignment, getStubAlignment()); // Some sections, such as debug info, don't need to be loaded for execution. - // Leave those where they are. - if (IsRequired) { + // Process those only if explicitly requested. 
+ if (IsRequired || ProcessAllSections) { Allocate = DataSize + PaddingSize + StubBufSize; if (!Allocate) Allocate = 1; @@ -790,6 +790,10 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj, Sections.push_back( SectionEntry(Name, Addr, DataSize, Allocate, (uintptr_t)pData)); + // Debug info sections are linked as if their load address was zero + if (!IsRequired) + Sections.back().setLoadAddress(0); + if (Checker) Checker->registerSection(Obj.getFileName(), SectionID); diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index 98865f5e065e..bd38dd88201f 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -78,6 +78,9 @@ static bool gCrashRecoveryEnabled = false; static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContext>> tlIsRecoveringFromCrash; +static void installExceptionOrSignalHandlers(); +static void uninstallExceptionOrSignalHandlers(); + CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {} CrashRecoveryContext::~CrashRecoveryContext() { @@ -113,6 +116,23 @@ CrashRecoveryContext *CrashRecoveryContext::GetCurrent() { return CRCI->CRC; } +void CrashRecoveryContext::Enable() { + sys::ScopedLock L(*gCrashRecoveryContextMutex); + // FIXME: Shouldn't this be a refcount or something? + if (gCrashRecoveryEnabled) + return; + gCrashRecoveryEnabled = true; + installExceptionOrSignalHandlers(); +} + +void CrashRecoveryContext::Disable() { + sys::ScopedLock L(*gCrashRecoveryContextMutex); + if (!gCrashRecoveryEnabled) + return; + gCrashRecoveryEnabled = false; + uninstallExceptionOrSignalHandlers(); +} + void CrashRecoveryContext::registerCleanup(CrashRecoveryContextCleanup *cleanup) { if (!cleanup) @@ -140,30 +160,70 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) { delete cleanup; } -#ifdef LLVM_ON_WIN32 +#if defined(_MSC_VER) +// If _MSC_VER is defined, we must have SEH. Use it if it's available. It's way +// better than VEH. Vectored exception handling catches all exceptions happening +// on the thread with installed exception handlers, so it can interfere with +// internal exception handling of other libraries on that thread. SEH works +// exactly as you would expect normal exception handling to work: it only +// catches exceptions if they would bubble out from the stack frame with __try / +// __except. -#include "Windows/WindowsSupport.h" +static void installExceptionOrSignalHandlers() {} +static void uninstallExceptionOrSignalHandlers() {} -// On Windows, we can make use of vectored exception handling to -// catch most crashing situations. Note that this does mean -// we will be alerted of exceptions *before* structured exception -// handling has the opportunity to catch it. But that isn't likely -// to cause problems because nowhere in the project is SEH being -// used. +bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) { + if (!gCrashRecoveryEnabled) { + Fn(); + return true; + } + + bool Result = true; + __try { + Fn(); + } __except (1) { // Catch any exception. + Result = false; + } + return Result; +} + +#else // !_MSC_VER + +#if defined(LLVM_ON_WIN32) +// This is a non-MSVC compiler, probably mingw gcc or clang without +// -fms-extensions. Use vectored exception handling (VEH). +// +// On Windows, we can make use of vectored exception handling to catch most +// crashing situations. Note that this does mean we will be alerted of +// exceptions *before* structured exception handling has the opportunity to +// catch it. 
Unfortunately, this causes problems in practice with other code +// running on threads with LLVM crash recovery contexts, so we would like to +// eventually move away from VEH. // -// Vectored exception handling is built on top of SEH, and so it -// works on a per-thread basis. +// Vectored works on a per-thread basis, which is an advantage over +// SetUnhandledExceptionFilter. SetUnhandledExceptionFilter also doesn't have +// any native support for chaining exception handlers, but VEH allows more than +// one. // // The vectored exception handler functionality was added in Windows // XP, so if support for older versions of Windows is required, // it will have to be added. -// -// If we want to support as far back as Win2k, we could use the -// SetUnhandledExceptionFilter API, but there's a risk of that -// being entirely overwritten (it's not a chain). + +#include "Windows/WindowsSupport.h" static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) { + // DBG_PRINTEXCEPTION_WIDE_C is not properly defined on all supported + // compilers and platforms, so we define it manually. + constexpr ULONG DbgPrintExceptionWideC = 0x4001000AL; + switch (ExceptionInfo->ExceptionRecord->ExceptionCode) + { + case DBG_PRINTEXCEPTION_C: + case DbgPrintExceptionWideC: + case 0x406D1388: // set debugger thread name + return EXCEPTION_CONTINUE_EXECUTION; + } + // Lookup the current thread local recovery object. const CrashRecoveryContextImpl *CRCI = CurrentContext->get(); @@ -192,14 +252,7 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) // non-NULL, valid VEH handles, or NULL. static sys::ThreadLocal<const void> sCurrentExceptionHandle; -void CrashRecoveryContext::Enable() { - sys::ScopedLock L(*gCrashRecoveryContextMutex); - - if (gCrashRecoveryEnabled) - return; - - gCrashRecoveryEnabled = true; - +static void installExceptionOrSignalHandlers() { // We can set up vectored exception handling now. We will install our // handler as the front of the list, though there's no assurances that // it will remain at the front (another call could install itself before @@ -208,14 +261,7 @@ void CrashRecoveryContext::Enable() { sCurrentExceptionHandle.set(handle); } -void CrashRecoveryContext::Disable() { - sys::ScopedLock L(*gCrashRecoveryContextMutex); - - if (!gCrashRecoveryEnabled) - return; - - gCrashRecoveryEnabled = false; - +static void uninstallExceptionOrSignalHandlers() { PVOID currentHandle = const_cast<PVOID>(sCurrentExceptionHandle.get()); if (currentHandle) { // Now we can remove the vectored exception handler from the chain @@ -226,7 +272,7 @@ void CrashRecoveryContext::Disable() { } } -#else +#else // !LLVM_ON_WIN32 // Generic POSIX implementation. // @@ -278,14 +324,7 @@ static void CrashRecoverySignalHandler(int Signal) { const_cast<CrashRecoveryContextImpl*>(CRCI)->HandleCrash(); } -void CrashRecoveryContext::Enable() { - sys::ScopedLock L(*gCrashRecoveryContextMutex); - - if (gCrashRecoveryEnabled) - return; - - gCrashRecoveryEnabled = true; - +static void installExceptionOrSignalHandlers() { // Setup the signal handler. struct sigaction Handler; Handler.sa_handler = CrashRecoverySignalHandler; @@ -297,20 +336,13 @@ void CrashRecoveryContext::Enable() { } } -void CrashRecoveryContext::Disable() { - sys::ScopedLock L(*gCrashRecoveryContextMutex); - - if (!gCrashRecoveryEnabled) - return; - - gCrashRecoveryEnabled = false; - +static void uninstallExceptionOrSignalHandlers() { // Restore the previous signal handlers. 
for (unsigned i = 0; i != NumSignals; ++i) sigaction(Signals[i], &PrevActions[i], nullptr); } -#endif +#endif // !LLVM_ON_WIN32 bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) { // If crash recovery is disabled, do nothing. @@ -328,6 +360,8 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) { return true; } +#endif // !_MSC_VER + void CrashRecoveryContext::HandleCrash() { CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl; assert(CRCI && "Crash recovery context never initialized!"); diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index cdea09be41e0..fa28ba1b6ab6 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -103,16 +103,13 @@ #define STATVFS_F_FLAG(vfs) (vfs).f_flags #endif -#if defined(__FreeBSD__) || defined(__NetBSD__) -#include <sys/sysctl.h> -#endif - using namespace llvm; namespace llvm { namespace sys { namespace fs { -#if defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \ +#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \ + defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \ defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) || \ defined(_AIX) static int @@ -167,7 +164,7 @@ getprogpath(char ret[PATH_MAX], const char *bin) free(pv); return nullptr; } -#endif // Bitrig || OpenBSD || minix || linux || CYGWIN || DragonFly || AIX +#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__ /// GetMainExecutable - Return the path to the main executable, given the /// value of argv[0] from program startup. @@ -183,24 +180,9 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { if (realpath(exe_path, link_path)) return link_path; } -#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) - int mib[4]; - mib[0] = CTL_KERN; -#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) - mib[1] = KERN_PROC; - mib[2] = KERN_PROC_PATHNAME; - mib[3] = -1; -#else - mib[1] = KERN_PROC_ARGS; - mib[2] = -1; - mib[3] = KERN_PROC_PATHNAME; -#endif - char exe_path[PATH_MAX]; - size_t cb = sizeof(exe_path); - if (sysctl(mib, 4, exe_path, &cb, NULL, 0) == 0) - return exe_path; -#elif defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \ - defined(__DragonFly__) || defined(_AIX) +#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \ + defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \ + defined(__FreeBSD_kernel__) || defined(_AIX) char exe_path[PATH_MAX]; if (getprogpath(exe_path, argv0) != NULL) diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index dc916c034661..1aec602a2a36 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1158,8 +1158,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, } DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; - for (int Reg = SavedRegs.find_first(); Reg != -1; - Reg = SavedRegs.find_next(Reg)) + for (unsigned Reg : SavedRegs.set_bits()) dbgs() << ' ' << PrintReg(Reg, RegInfo); dbgs() << "\n";); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4f7c2e122390..1af36086ad90 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -553,7 +553,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); 
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -659,6 +658,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // Vector reductions + for (MVT VT : MVT::integer_valuetypes()) { + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + } + for (MVT VT : MVT::fp_valuetypes()) { + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + } + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled @@ -2606,6 +2618,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerMUL(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + return LowerVECREDUCE(Op, DAG); } } @@ -7128,6 +7148,47 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return Cmp; } +static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, + SelectionDAG &DAG) { + SDValue VecOp = ScalarOp.getOperand(0); + auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, + DAG.getConstant(0, DL, MVT::i64)); +} + +SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + switch (Op.getOpcode()) { + case ISD::VECREDUCE_ADD: + return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); + case ISD::VECREDUCE_SMAX: + return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); + case ISD::VECREDUCE_SMIN: + return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); + case ISD::VECREDUCE_UMAX: + return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); + case ISD::VECREDUCE_UMIN: + return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); + case ISD::VECREDUCE_FMAX: { + assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), + DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), + Op.getOperand(0)); + } + case ISD::VECREDUCE_FMIN: { + assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), + DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), + Op.getOperand(0)); + } + default: + llvm_unreachable("Unhandled reduction"); + } +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. @@ -9490,266 +9551,6 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } -/// This function handles the log2-shuffle pattern produced by the -/// LoopVectorizer for the across vector reduction. 
It consists of -/// log2(NumVectorElements) steps and, in each step, 2^(s) elements -/// are reduced, where s is an induction variable from 0 to -/// log2(NumVectorElements). -static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, - unsigned Op, - SelectionDAG &DAG) { - EVT VTy = OpV->getOperand(0).getValueType(); - if (!VTy.isVector()) - return SDValue(); - - int NumVecElts = VTy.getVectorNumElements(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (NumVecElts != 4) - return SDValue(); - } else { - if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) - return SDValue(); - } - - int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); - SDValue PreOp = OpV; - // Iterate over each step of the across vector reduction. - for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { - SDValue CurOp = PreOp.getOperand(0); - SDValue Shuffle = PreOp.getOperand(1); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { - // Try to swap the 1st and 2nd operand as add and min/max instructions - // are commutative. - CurOp = PreOp.getOperand(1); - Shuffle = PreOp.getOperand(0); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); - } - - // Check if the input vector is fed by the operator we want to handle, - // except the last step; the very first input vector is not necessarily - // the same operator we are handling. - if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) - return SDValue(); - - // Check if it forms one step of the across vector reduction. - // E.g., - // %cur = add %1, %0 - // %shuffle = vector_shuffle %cur, <2, 3, u, u> - // %pre = add %cur, %shuffle - if (Shuffle.getOperand(0) != CurOp) - return SDValue(); - - int NumMaskElts = 1 << CurStep; - ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); - // Check mask values in each step. - // We expect the shuffle mask in each step follows a specific pattern - // denoted here by the <M, U> form, where M is a sequence of integers - // starting from NumMaskElts, increasing by 1, and the number integers - // in M should be NumMaskElts. U is a sequence of UNDEFs and the number - // of undef in U should be NumVecElts - NumMaskElts. - // E.g., for <8 x i16>, mask values in each step should be : - // step 0 : <1,u,u,u,u,u,u,u> - // step 1 : <2,3,u,u,u,u,u,u> - // step 2 : <4,5,6,7,u,u,u,u> - for (int i = 0; i < NumVecElts; ++i) - if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || - (i >= NumMaskElts && !(Mask[i] < 0))) - return SDValue(); - - PreOp = CurOp; - } - unsigned Opcode; - bool IsIntrinsic = false; - - switch (Op) { - default: - llvm_unreachable("Unexpected operator for across vector reduction"); - case ISD::ADD: - Opcode = AArch64ISD::UADDV; - break; - case ISD::SMAX: - Opcode = AArch64ISD::SMAXV; - break; - case ISD::UMAX: - Opcode = AArch64ISD::UMAXV; - break; - case ISD::SMIN: - Opcode = AArch64ISD::SMINV; - break; - case ISD::UMIN: - Opcode = AArch64ISD::UMINV; - break; - case ISD::FMAXNUM: - Opcode = Intrinsic::aarch64_neon_fmaxnmv; - IsIntrinsic = true; - break; - case ISD::FMINNUM: - Opcode = Intrinsic::aarch64_neon_fminnmv; - IsIntrinsic = true; - break; - } - SDLoc DL(N); - - return IsIntrinsic - ? 
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), - DAG.getConstant(Opcode, DL, MVT::i32), PreOp) - : DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), - DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), - DAG.getConstant(0, DL, MVT::i64)); -} - -/// Target-specific DAG combine for the across vector min/max reductions. -/// This function specifically handles the final clean-up step of the vector -/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle -/// pattern, which narrows down and finds the final min/max value from all -/// elements of the vector. -/// For example, for a <16 x i8> vector : -/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> -/// %smax0 = smax %arr, svn0 -/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax1 = smax %smax0, %svn1 -/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax2 = smax %smax1, svn2 -/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %sc = setcc %smax2, %svn3, gt -/// %n0 = extract_vector_elt %sc, #0 -/// %n1 = extract_vector_elt %smax2, #0 -/// %n2 = extract_vector_elt $smax2, #1 -/// %result = select %n0, %n1, n2 -/// becomes : -/// %1 = smaxv %0 -/// %result = extract_vector_elt %1, 0 -static SDValue -performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - - // Check if the SELECT merges up the final result of the min/max - // from a vector. - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - // Expect N0 is fed by SETCC. - SDValue SetCC = N0.getOperand(0); - EVT SetCCVT = SetCC.getValueType(); - if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || - SetCCVT.getVectorElementType() != MVT::i1) - return SDValue(); - - SDValue VectorOp = SetCC.getOperand(0); - unsigned Op = VectorOp->getOpcode(); - // Check if the input vector is fed by the operator we want to handle. - if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && - Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) - return SDValue(); - - EVT VTy = VectorOp.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (EltTy != MVT::f32) - return SDValue(); - } else { - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - } - - // Check if extracting from the same vector. - // For example, - // %sc = setcc %vector, %svn1, gt - // %n0 = extract_vector_elt %sc, #0 - // %n1 = extract_vector_elt %vector, #0 - // %n2 = extract_vector_elt $vector, #1 - if (!(VectorOp == IfTrue->getOperand(0) && - VectorOp == IfFalse->getOperand(0))) - return SDValue(); - - // Check if the condition code is matched with the operator type. 
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); - if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || - (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || - (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || - (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || - (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && - CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && - CC != ISD::SETGE) || - (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && - CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && - CC != ISD::SETLE)) - return SDValue(); - - // Expect to check only lane 0 from the vector SETCC. - if (!isNullConstant(N0.getOperand(1))) - return SDValue(); - - // Expect to extract the true value from lane 0. - if (!isNullConstant(IfTrue.getOperand(1))) - return SDValue(); - - // Expect to extract the false value from lane 1. - if (!isOneConstant(IfFalse.getOperand(1))) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); -} - -/// Target-specific DAG combine for the across vector add reduction. -/// This function specifically handles the final clean-up step of the vector -/// add reduction produced by the LoopVectorizer. It is the log2-shuffle -/// pattern, which adds all elements of a vector together. -/// For example, for a <4 x i32> vector : -/// %1 = vector_shuffle %0, <2,3,u,u> -/// %2 = add %0, %1 -/// %3 = vector_shuffle %2, <1,u,u,u> -/// %4 = add %2, %3 -/// %result = extract_vector_elt %4, 0 -/// becomes : -/// %0 = uaddv %0 -/// %result = extract_vector_elt %0, 0 -static SDValue -performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // Check if the input vector is fed by the ADD. - if (N0->getOpcode() != ISD::ADD) - return SDValue(); - - // The vector extract idx must constant zero because we only expect the final - // result of the reduction is placed in lane 0. - if (!isNullConstant(N1)) - return SDValue(); - - EVT VTy = N0.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); -} /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
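The three combines deleted above (tryMatchAcrossLaneShuffleForReduction and its two performAcrossLane*ReductionCombine callers) pattern-matched the LoopVectorizer's log2-shuffle epilogue by hand; the ReplaceNodeResults hunk below routes the same reductions through the generic ISD::VECREDUCE_* nodes instead. As a hand-written scalar sketch (not compiler code), the ladder the removed matcher looked for is:

    #include <cstdio>

    // log2(N) steps; step k combines lane I with lane I + Half and halves the
    // active width, leaving the result in lane 0 for the final
    // extract_vector_elt.
    static int reduceAdd(int *V, int N) { // N is assumed to be a power of two
      for (int Half = N / 2; Half >= 1; Half /= 2)
        for (int I = 0; I < Half; ++I)
          V[I] += V[I + Half]; // vector form: add(V, vector_shuffle(V, <Half,...>))
      return V[0];
    }

    int main() {
      int V[4] = {1, 2, 3, 4};
      printf("%d\n", reduceAdd(V, 4)); // prints 10, matching uaddv + extract lane 0
    }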
@@ -10428,12 +10229,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: { - SDValue RV = performSelectCombine(N, DCI); - if (!RV.getNode()) - RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); - return RV; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::LOAD: @@ -10455,8 +10252,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); - case ISD::EXTRACT_VECTOR_ELT: - return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -10676,6 +10471,14 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); + return; + case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 89db566c219c..ecc2517fb288 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -568,6 +568,7 @@ private: SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 7c6f55c06bce..43569af04347 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -769,3 +769,28 @@ unsigned AArch64TTIImpl::getMinPrefetchStride() { unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { return ST->getMaxPrefetchIterationsAhead(); } + +bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); + unsigned ScalarBits = Ty->getScalarSizeInBits(); + switch (Opcode) { + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Mul: + return false; + case Instruction::Add: + return ScalarBits * Ty->getVectorNumElements() >= 128; + case Instruction::ICmp: + return (ScalarBits < 64) && + (ScalarBits * Ty->getVectorNumElements() >= 128); + case Instruction::FCmp: + return Flags.NoNaN; + default: + llvm_unreachable("Unhandled reduction opcode"); + } + return false; +} diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 280d97f3c502..d0299149c38c 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -145,6 +145,9 @@ public: bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } + + bool 
useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; /// @} }; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 7c99752b881f..c3ac796a0a44 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1707,10 +1707,38 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, // FIXME: Look for on separate components if (Src.getOpcode() == ISD::FNEG) { - Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI); + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); Src = Src.getOperand(0); } + if (Src.getOpcode() == ISD::BUILD_VECTOR) { + unsigned VecMods = Mods; + + SDValue Lo = Src.getOperand(0); + SDValue Hi = Src.getOperand(1); + + if (Lo.getOpcode() == ISD::FNEG) { + Lo = Lo.getOperand(0); + Mods ^= SISrcMods::NEG; + } + + if (Hi.getOpcode() == ISD::FNEG) { + Hi = Hi.getOperand(0); + Mods ^= SISrcMods::NEG_HI; + } + + if (Lo == Hi && !isInlineImmediate(Lo.getNode())) { + // Really a scalar input. Just select from the low half of the register to + // avoid packing. + + Src = Lo; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + Mods = VecMods; + } + // Packed instructions do not have abs modifiers. // FIXME: Handle abs/neg of individual components. diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index bed7d326b3dd..e543cae07ada 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -289,6 +289,10 @@ public: return getGeneration() >= GFX9; } + bool hasMin3Max3_16() const { + return getGeneration() >= GFX9; + } + bool hasCARRY() const { return (getGeneration() >= EVERGREEN); } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 48a14e4dbea2..286be355bc14 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4491,7 +4491,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - VT != MVT::f64) { + VT != MVT::f64 && + ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 065fd09eb356..38a16b525a75 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -765,7 +765,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); // Add the scratch resource registers as implicit uses because we may end up // needing them, and need to ensure that the reserved registers are // correctly handled. 
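In the SelectVOP3PMods hunk above, the neg modifier bits are now toggled with ^= rather than accumulated with |=, so folding an fneg through each BUILD_VECTOR component cancels correctly when negations nest. A tiny illustration (the flag values here are stand-ins, not the real SISrcMods encodings):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for SISrcMods::NEG / SISrcMods::NEG_HI.
    enum : uint32_t { NEG = 1u << 0, NEG_HI = 1u << 1 };

    int main() {
      uint32_t Mods = 0;
      Mods ^= (NEG | NEG_HI); // fold fneg of the whole vector
      Mods ^= NEG;            // fold a nested fneg of the low half: -(-x) == x
      printf("%#x\n", Mods);  // 0x2 -- with |= the low bit would stay stuck at 1
    }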
@@ -796,7 +796,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addReg(MFI->getFrameOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); } @@ -869,7 +869,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. @@ -892,10 +892,10 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset - .addImm(0) // offset + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getFrameOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 8820e294562b..06cfc95be96a 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -654,11 +654,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()) + .addReg(MFI->getFrameOffsetReg()) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()); + .addReg(MFI->getFrameOffsetReg()); } BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) @@ -715,11 +715,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srrsrc - .addReg(MFI->getScratchWaveOffsetReg()) // soffset - .addImm(i * 4) // offset + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srrsrc + .addReg(MFI->getFrameOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); } } @@ -806,11 +806,11 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()) + .addReg(MFI->getFrameOffsetReg()) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()); + .addReg(MFI->getFrameOffsetReg()); } auto MIB = @@ -853,10 +853,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srsrc - .addReg(MFI->getScratchWaveOffsetReg()) // soffset - 
.addImm(i * 4) // offset + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srsrc + .addReg(MFI->getFrameOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); auto MIB = diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index ffa6c60d6b1f..c0b5069948fb 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -300,10 +300,19 @@ def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>; def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>; def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>; -} + +def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmin3>; +def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmin3>; +def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumin3>; + +def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmax3>; +def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmax3>; +def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumax3>; +} // End SubtargetPredicate = isGFX9 //===----------------------------------------------------------------------===// @@ -509,6 +518,15 @@ defm V_OR3_B32 : VOP3_Real_vi <0x202>; defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>; defm V_XAD_U32 : VOP3_Real_vi <0x1f3>; + +defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>; +defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>; +defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>; + +defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>; +defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>; +defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>; + defm V_MED3_F16 : VOP3_Real_vi <0x1fa>; defm V_MED3_I16 : VOP3_Real_vi <0x1fb>; defm V_MED3_U16 : VOP3_Real_vi <0x1fc>; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 8c680cdf9b47..b1f059835ff5 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -345,25 +345,10 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(COPY)); return selectCopy(I, TII, MRI, TRI, RBI); } - case G_ADD: case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; - case G_SUB: - I.setDesc(TII.get(ARM::SUBrr)); - MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); - break; - case G_MUL: - if (TII.getSubtarget().hasV6Ops()) { - I.setDesc(TII.get(ARM::MUL)); - } else { - assert(TII.getSubtarget().useMulOps() && "Unsupported target"); - I.setDesc(TII.get(ARM::MULv5)); - MIB->getOperand(0).setIsEarlyClobber(true); - } - MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); - break; case G_FRAME_INDEX: // Add 0 to the given frame index and hope it will eventually be folded into // the user(s). 
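The new V_MIN3/V_MAX3 f16/i16 definitions above, together with the hasMin3Max3_16() predicate and the relaxed performMinMaxCombine condition, let GFX9 fold two dependent 16-bit min/max operations into one three-source VALU instruction. A scalar sketch of the combine being enabled:

    #include <algorithm>
    #include <cstdio>

    // min(min(a,b),c) -> min3(a,b,c): one three-source op replaces a chain of
    // two dependent two-source ops (and symmetrically for max3).
    static int min3(int A, int B, int C) { return std::min(std::min(A, B), C); }

    int main() { printf("%d\n", min3(7, 2, 5)); } // prints 2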
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index d0fd366ab9ed..1a17d4e33e4f 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -571,8 +571,7 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, GPRsNoLRSP.reset(ARM::LR); GPRsNoLRSP.reset(ARM::SP); GPRsNoLRSP.reset(ARM::PC); - for (int Register = GPRsNoLRSP.find_first(); Register != -1; - Register = GPRsNoLRSP.find_next(Register)) { + for (unsigned Register : GPRsNoLRSP.set_bits()) { if (!UsedRegs.contains(Register)) { // Remember the first pop-friendly register and exit. if (PopFriendly.test(Register)) { diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index ae58c26e145a..1597057ad63f 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -386,7 +386,7 @@ void RegDefsUses::setCallerSaved(const MachineInstr &MI) { void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) { BitVector AllocSet = TRI.getAllocatableSet(MF); - for (int R = AllocSet.find_first(); R != -1; R = AllocSet.find_next(R)) + for (unsigned R : AllocSet.set_bits()) for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI) AllocSet.set(*AI); diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 40bfe3a449f7..57a1d373c88c 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1765,31 +1765,36 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, // Check whether the frame pointer register is allocated. If so, make sure it // is spilled to the correct offset. if (needsFP(MF)) { - HasGPSaveArea = true; - int FI = PFI->getFramePointerSaveIndex(); assert(FI && "No Frame Pointer Save Slot!"); - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + // FP is R31/X31, so no need to update MinGPR/MinG8R. + HasGPSaveArea = true; } if (PFI->usesPICBase()) { - HasGPSaveArea = true; - int FI = PFI->getPICBasePointerSaveIndex(); assert(FI && "No PIC Base Pointer Save Slot!"); - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + + MinGPR = std::min<unsigned>(MinGPR, PPC::R30); + HasGPSaveArea = true; } const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->hasBasePointer(MF)) { - HasGPSaveArea = true; - int FI = PFI->getBasePointerSaveIndex(); assert(FI && "No Base Pointer Save Slot!"); - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + + unsigned BP = RegInfo->getBaseRegister(MF); + if (PPC::G8RCRegClass.contains(BP)) { + MinG8R = std::min<unsigned>(MinG8R, BP); + HasG8SaveArea = true; + } else if (PPC::GPRCRegClass.contains(BP)) { + MinGPR = std::min<unsigned>(MinGPR, BP); + HasGPSaveArea = true; + } } // General register save area starts right below the Floating-point diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 17bdd595da10..144aea850833 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -410,6 +410,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // To handle counter-based loop conditions. 
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + // Comparisons that require checking two conditions. setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); @@ -8184,6 +8189,26 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return Flags; } +SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to + // the beginning of the argument list. + int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; + SDLoc DL(Op); + switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { + case Intrinsic::ppc_cfence: { + assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); + return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, + Op.getOperand(ArgStart + 1))), + 0); + } + default: + break; + } + return SDValue(); +} + SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8649,6 +8674,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { // Frame & Return address. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); } } @@ -8753,12 +8781,19 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { - if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) + if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { + // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and + // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html + // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. + if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) + return Builder.CreateCall( + Intrinsic::getDeclaration( + Builder.GetInsertBlock()->getParent()->getParent(), + Intrinsic::ppc_cfence, {Inst->getType()}), + {Inst}); + // FIXME: Can use isync for rmw operation. return callIntrinsic(Builder, Intrinsic::ppc_lwsync); - // FIXME: this is too conservative, a dependent branch + isync is enough. - // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and - // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html - // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. 
+ } return nullptr; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 4fc744257262..acb77943b118 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -905,6 +905,7 @@ namespace llvm { SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index a8433919f0f3..a3f894c81a01 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -983,6 +983,10 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), [(set i64:$rD, (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, isPPC64; + +let isBarrier = 1, isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in +def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; + def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), (ADD8TLS $in, tglobaltlsaddr:$g)>; def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 790a8902b3d2..3afcec1248d5 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1873,6 +1873,8 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { } bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + auto &MBB = *MI.getParent(); + auto DL = MI.getDebugLoc(); switch (MI.getOpcode()) { case TargetOpcode::LOAD_STACK_GUARD: { assert(Subtarget.isTargetLinux() && @@ -1920,6 +1922,17 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(Opcode)); return true; } + case PPC::CFENCE8: { + auto Val = MI.getOperand(0).getReg(); + BuildMI(MBB, MI, DL, get(PPC::CMPW), PPC::CR7).addReg(Val).addReg(Val); + BuildMI(MBB, MI, DL, get(PPC::CTRL_DEP)) + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR7) + .addImm(1); + MI.setDesc(get(PPC::ISYNC)); + MI.RemoveOperand(0); + return true; + } } return false; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 1af5e7f28342..0766cfe4a987 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -1223,9 +1223,15 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { // FIXME: should be able to write a pattern for PPCcondbranch, but can't use // a two-value operand where a dag node expects two operands. :( let isCodeGenOnly = 1 in { - def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), - "b${cond:cc}${cond:pm} ${cond:reg}, $dst" - /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + class BCC_class : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), + "b${cond:cc}${cond:pm} ${cond:reg}, $dst" + /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + def BCC : BCC_class; + + // The same as BCC, except that it's not a terminator. Used for introducing + // control flow dependency without creating new blocks. 
+ let isTerminator = 0 in def CTRL_DEP : BCC_class; + def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst), "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index f56b238f91e6..6a3dc6799c43 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -325,6 +325,30 @@ int SystemZTTIImpl::getArithmeticInstrCost( unsigned ScalarBits = Ty->getScalarSizeInBits(); + // Div with a constant which is a power of 2 will be converted by + // DAGCombiner to use shifts. With vector shift-element instructions, a + // vector sdiv costs about as much as a scalar one. + const unsigned SDivCostEstimate = 4; + bool SDivPow2 = false; + bool UDivPow2 = false; + if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) && + Args.size() == 2) { + const ConstantInt *CI = nullptr; + if (const Constant *C = dyn_cast<Constant>(Args[1])) { + if (C->getType()->isVectorTy()) + CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue()); + else + CI = dyn_cast<const ConstantInt>(C); + } + if (CI != nullptr && + (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) { + if (Opcode == Instruction::SDiv) + SDivPow2 = true; + else + UDivPow2 = true; + } + } + if (Ty->isVectorTy()) { assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type."); unsigned VF = Ty->getVectorNumElements(); @@ -333,10 +357,13 @@ int SystemZTTIImpl::getArithmeticInstrCost( // These vector operations are custom handled, but are still supported // with one instruction per vector, regardless of element size. if (Opcode == Instruction::Shl || Opcode == Instruction::LShr || - Opcode == Instruction::AShr) { + Opcode == Instruction::AShr || UDivPow2) { return NumVectors; } + if (SDivPow2) + return (NumVectors * SDivCostEstimate); + // These FP operations are supported with a single vector instruction for // double (base implementation assumes float generally costs 2). For // FP128, the scalar cost is 1, and there is no overhead since the values @@ -395,6 +422,11 @@ int SystemZTTIImpl::getArithmeticInstrCost( // 2 * ipm sequences ; xor ; shift ; compare return 7; + if (UDivPow2) + return 1; + if (SDivPow2) + return SDivCostEstimate; + // An extra extension for narrow types is needed. if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem)) // sext of op(s) for narrow types diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp index 5fd4a8d1949e..ba39b6cdb568 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -140,8 +140,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { // Check if it's possible to reuse any of the used colors. if (!MRI->isLiveIn(Old)) - for (int C(UsedColors.find_first()); C != -1; - C = UsedColors.find_next(C)) { + for (unsigned C : UsedColors.set_bits()) { if (MRI->getRegClass(SortedIntervals[C]->reg) != RC) continue; for (LiveInterval *OtherLI : Assignments[C]) diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt index 8e8e5fd1eff1..54619589c341 100644 --- a/lib/Target/WebAssembly/known_gcc_test_failures.txt +++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -33,9 +33,6 @@ built-in-setjmp.c pr60003.c # Error in the program / unsupported by Clang. 
-scal-to-vec1.c -scal-to-vec2.c -scal-to-vec3.c 20000822-1.c 20010209-1.c 20010605-1.c diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3a421fe77392..784c3a6557ff 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -235,8 +235,6 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; -def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", - "LEA instruction with 3 ops or certain registers is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; def FeatureSoftFloat @@ -482,7 +480,6 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, - FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate ]>; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 9f649dad8bc0..2cd4c1a3e7b3 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -27,26 +27,20 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -namespace llvm { -void initializeFixupLEAPassPass(PassRegistry &); -} - -#define FIXUPLEA_DESC "X86 LEA Fixup" -#define FIXUPLEA_NAME "x86-fixup-LEAs" - -#define DEBUG_TYPE FIXUPLEA_NAME +#define DEBUG_TYPE "x86-fixup-LEAs" STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { class FixupLEAPass : public MachineFunctionPass { enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - + static char ID; /// \brief Loop over all of the instructions in the basic block /// replacing applicable instructions with LEA instructions, /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); + StringRef getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, @@ -68,22 +62,6 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); - - /// \brief Given a LEA instruction which is unprofitable - /// on SNB+ try to replace it with other instructions. - /// According to Intel's Optimization Reference Manual: - /// " For LEA instructions with three source operands and some specific - /// situations, instruction latency has increased to 3 cycles, and must - /// dispatch via port 1: - /// - LEA that has all three source operands: base, index, and offset - /// - LEA that uses base and index registers where the base is EBP, RBP, - /// or R13 - /// - LEA that uses RIP relative addressing mode - /// - LEA that uses 16-bit addressing mode " - /// This function currently handles the first 2 cases only. - MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, - MachineFunction::iterator MFI); - /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg /// and convert them to INC or DEC respectively. 
bool fixupIncDec(MachineBasicBlock::iterator &I, @@ -107,13 +85,7 @@ class FixupLEAPass : public MachineFunctionPass { MachineBasicBlock::iterator &MBBI) const; public: - static char ID; - - StringRef getPassName() const override { return FIXUPLEA_DESC; } - - FixupLEAPass() : MachineFunctionPass(ID) { - initializeFixupLEAPassPass(*PassRegistry::getPassRegistry()); - } + FixupLEAPass() : MachineFunctionPass(ID) {} /// \brief Loop over all of the basic blocks, /// replacing instructions by equivalent LEA instructions @@ -132,11 +104,8 @@ private: bool OptIncDec; bool OptLEA; }; -} - char FixupLEAPass::ID = 0; - -INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false) +} MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, @@ -199,7 +168,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); - OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA(); + OptLEA = ST.LEAusesAG() || ST.slowLEA(); if (!OptLEA && !OptIncDec) return false; @@ -273,64 +242,9 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return MachineBasicBlock::iterator(); } -static inline bool isLEA(const int Opcode) { - return Opcode == X86::LEA16r || Opcode == X86::LEA32r || - Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; -} - -static inline bool isInefficientLEAReg(unsigned int Reg) { - return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13; -} - -static inline bool isRegOperand(const MachineOperand &Op) { - return Op.isReg() && Op.getReg() != X86::NoRegister; -} -/// hasIneffecientLEARegs - LEA that uses base and index registers -/// where the base is EBP, RBP, or R13 -static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, - const MachineOperand &Index) { - return Base.isReg() && isInefficientLEAReg(Base.getReg()) && - isRegOperand(Index); -} - -static inline bool hasLEAOffset(const MachineOperand &Offset) { - return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); -} - -// LEA instruction that has all three operands: offset, base and index -static inline bool isThreeOperandsLEA(const MachineOperand &Base, - const MachineOperand &Index, - const MachineOperand &Offset) { - return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset); -} - -static inline int getADDrrFromLEA(int LEAOpcode) { - switch (LEAOpcode) { - default: - llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - return X86::ADD16rr; - case X86::LEA32r: - return X86::ADD32rr; - case X86::LEA64_32r: - case X86::LEA64r: - return X86::ADD64rr; - } -} - -static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { - bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); - switch (LEAOpcode) { - default: - llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri; - case X86::LEA32r: - case X86::LEA64_32r: - return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri; - case X86::LEA64r: - return IsInt8 ? 
X86::ADD64ri8 : X86::ADD64ri32; - } +static inline bool isLEA(const int opcode) { + return opcode == X86::LEA16r || opcode == X86::LEA32r || + opcode == X86::LEA64r || opcode == X86::LEA64_32r; } /// isLEASimpleIncOrDec - Does this LEA have one these forms: @@ -423,8 +337,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p, void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr &MI = *I; - const int Opcode = MI.getOpcode(); - if (!isLEA(Opcode)) + const int opcode = MI.getOpcode(); + if (!isLEA(opcode)) return; if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -436,142 +350,53 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; if (MI.getOperand(2).getImm() > 1) return; + int addrr_opcode, addri_opcode; + switch (opcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + addrr_opcode = X86::ADD16rr; + addri_opcode = X86::ADD16ri; + break; + case X86::LEA32r: + addrr_opcode = X86::ADD32rr; + addri_opcode = X86::ADD32ri; + break; + case X86::LEA64_32r: + case X86::LEA64r: + addrr_opcode = X86::ADD64rr; + addri_opcode = X86::ADD64ri32; + break; + } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); MachineInstr *NewMI = nullptr; + const MachineOperand &Dst = MI.getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { - const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode)); - const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1); - NewMI = - BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); + const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3); + const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode)) + .add(Dst) + .add(Src1) + .add(Src2); + MFI->insert(I, NewMI); DEBUG(NewMI->dump();); } // Make ADD instruction for immediate if (MI.getOperand(4).getImm() != 0) { - const MCInstrDesc &ADDri = - TII->get(getADDriFromLEA(Opcode, MI.getOperand(4))); const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3); - NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR) + NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode)) + .add(Dst) .add(SrcR) .addImm(MI.getOperand(4).getImm()); + MFI->insert(I, NewMI); DEBUG(NewMI->dump();); } if (NewMI) { MFI->erase(I); - I = NewMI; - } -} - -MachineInstr * -FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, - MachineFunction::iterator MFI) { - - const int LEAOpcode = MI.getOpcode(); - if (!isLEA(LEAOpcode)) - return nullptr; - - const MachineOperand &Dst = MI.getOperand(0); - const MachineOperand &Base = MI.getOperand(1); - const MachineOperand &Scale = MI.getOperand(2); - const MachineOperand &Index = MI.getOperand(3); - const MachineOperand &Offset = MI.getOperand(4); - const MachineOperand &Segment = MI.getOperand(5); - - if (!(isThreeOperandsLEA(Base, Index, Offset) || - hasInefficientLEABaseReg(Base, Index)) || - !TII->isSafeToClobberEFLAGS(*MFI, MI) || - Segment.getReg() != X86::NoRegister) - return nullptr; - - unsigned int DstR = Dst.getReg(); - unsigned int BaseR = Base.getReg(); - unsigned int IndexR = Index.getReg(); - unsigned SSDstR = - (LEAOpcode == X86::LEA64_32r) ? 
getX86SubSuperRegister(DstR, 64) : DstR; - bool IsScale1 = Scale.getImm() == 1; - bool IsInefficientBase = isInefficientLEAReg(BaseR); - bool IsInefficientIndex = isInefficientLEAReg(IndexR); - - // Skip these cases since it takes more than 2 instructions - // to replace the LEA instruction. - if (IsInefficientBase && SSDstR == BaseR && !IsScale1) - return nullptr; - if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && - (IsInefficientIndex || !IsScale1)) - return nullptr; - - const DebugLoc DL = MI.getDebugLoc(); - const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); - const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); - - DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); - DEBUG(dbgs() << "FixLEA: Replaced by: ";); - - // First try to replace LEA with one or two (for the 3-op LEA case) - // add instructions: - // 1.lea (%base,%index,1), %base => add %index,%base - // 2.lea (%base,%index,1), %index => add %base,%index - if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { - const MachineOperand &Src = DstR == BaseR ? Index : Base; - MachineInstr *NewMI = - BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); - DEBUG(NewMI->dump();); - // Create ADD instruction for the Offset in case of 3-Ops LEA. - if (hasLEAOffset(Offset)) { - NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - DEBUG(NewMI->dump();); - } - return NewMI; - } - // If the base is inefficient try switching the index and base operands, - // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: - // lea offset(%base,%index,scale),%dst => - // lea (%base,%index,scale); add offset,%dst - if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { - MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .add(IsInefficientBase ? Index : Base) - .add(Scale) - .add(IsInefficientBase ? Base : Index) - .addImm(0) - .add(Segment); - DEBUG(NewMI->dump();); - // Create ADD instruction for the Offset in case of 3-Ops LEA. 
- if (hasLEAOffset(Offset)) { - NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - DEBUG(NewMI->dump();); - } - return NewMI; - } - // Handle the rest of the cases with inefficient base register: - assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); - assert(IsInefficientBase && "efficient base should be handled already!"); - - // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst - if (IsScale1 && !hasLEAOffset(Offset)) { - TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill()); - DEBUG(MI.getPrevNode()->dump();); - - MachineInstr *NewMI = - BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); - DEBUG(NewMI->dump();); - return NewMI; + I = static_cast<MachineBasicBlock::iterator>(NewMI); } - // lea offset(%base,%index,scale), %dst => - // lea offset( ,%index,scale), %dst; add %base,%dst - MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .addReg(0) - .add(Scale) - .add(Index) - .add(Offset) - .add(Segment); - DEBUG(NewMI->dump();); - - NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); - DEBUG(NewMI->dump();); - return NewMI; } bool FixupLEAPass::processBasicBlock(MachineFunction &MF, @@ -585,16 +410,8 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, if (OptLEA) { if (MF.getSubtarget<X86Subtarget>().isSLM()) processInstructionForSLM(I, MFI); - - else { - if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) { - if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) { - MFI->erase(I); - I = NewMI; - } - } else - processInstruction(I, MFI); - } + else + processInstruction(I, MFI); } } return false; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index de58d719acb4..5eb5ad52840a 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -19,6 +19,7 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -72,6 +73,9 @@ private: bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + const X86TargetMachine &TM; const X86Subtarget &STI; const X86InstrInfo &TII; @@ -243,6 +247,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectCmp(I, MRI, MF)) return true; + if (selectUadde(I, MRI, MF)) + return true; return false; } @@ -564,6 +570,66 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, return true; } +bool X86InstructionSelector::selectUadde(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_UADDE) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned CarryOutReg = I.getOperand(1).getReg(); + const unsigned Op0Reg = I.getOperand(2).getReg(); + const unsigned Op1Reg = I.getOperand(3).getReg(); + unsigned CarryInReg = I.getOperand(4).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + + if (DstTy != LLT::scalar(32)) + return false; + + // find CarryIn def instruction. 
+ MachineInstr *Def = MRI.getVRegDef(CarryInReg); + while (Def->getOpcode() == TargetOpcode::G_TRUNC) { + CarryInReg = Def->getOperand(1).getReg(); + Def = MRI.getVRegDef(CarryInReg); + } + + unsigned Opcode; + if (Def->getOpcode() == TargetOpcode::G_UADDE) { + // carry set by prev ADD. + + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), X86::EFLAGS) + .addReg(CarryInReg); + + if (!RBI.constrainGenericRegister(CarryInReg, X86::GR32RegClass, MRI)) + return false; + + Opcode = X86::ADC32rr; + } else if (auto val = getConstantVRegVal(CarryInReg, MRI)) { + // carry is constant, support only 0. + if (*val != 0) + return false; + + Opcode = X86::ADD32rr; + } else + return false; + + MachineInstr &AddInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg) + .addReg(Op0Reg) + .addReg(Op1Reg); + + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), CarryOutReg) + .addReg(X86::EFLAGS); + + if (!constrainSelectedInstRegOperands(AddInst, TII, TRI, RBI) || + !RBI.constrainGenericRegister(CarryOutReg, X86::GR32RegClass, MRI)) + return false; + + I.eraseFromParent(); + return true; +} + InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index cf26238c0239..8ce240714f17 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -59,6 +59,11 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { for (auto Ty : {s8, s16, s32}) setAction({BinOp, Ty}, Legal); + for (unsigned Op : {G_UADDE}) { + setAction({Op, s32}, Legal); + setAction({Op, 1, s1}, Legal); + } + for (unsigned MemOp : {G_LOAD, G_STORE}) { for (auto Ty : {s8, s16, s32, p0}) setAction({MemOp, Ty}, Legal); diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 02be95e2e556..de1514243aeb 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -253,11 +253,6 @@ protected: /// True if the LEA instruction with certain arguments is slow bool SlowLEA; - /// True if the LEA instruction has all three source operands: base, index, - /// and offset or if the LEA instruction uses base and index registers where - /// the base is EBP, RBP,or R13 - bool Slow3OpsLEA; - /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; @@ -495,7 +490,6 @@ public: bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } - bool slow3OpsLEA() const { return Slow3OpsLEA; } bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index c6a90725d89c..9a82e6e50463 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -61,7 +61,6 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); -void initializeFixupLEAPassPass(PassRegistry &); void initializeX86ExecutionDepsFixPass(PassRegistry &); } // end namespace llvm @@ -76,7 +75,6 @@ extern "C" void LLVMInitializeX86Target() { initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); - initializeFixupLEAPassPass(PR); initializeX86ExecutionDepsFixPass(PR); } diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp 
b/lib/Target/X86/X86TargetTransformInfo.cpp index 80e18161a94b..8566bd91c89e 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1392,6 +1392,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll + static const CostTblEntry AVX512BWCostTbl[] = { + { ISD::BITREVERSE, MVT::v8i64, 5 }, + { ISD::BITREVERSE, MVT::v16i32, 5 }, + { ISD::BITREVERSE, MVT::v32i16, 5 }, + { ISD::BITREVERSE, MVT::v64i8, 5 }, + }; + static const CostTblEntry AVX512CostTbl[] = { + { ISD::BITREVERSE, MVT::v8i64, 36 }, + { ISD::BITREVERSE, MVT::v16i32, 24 }, + }; static const CostTblEntry XOPCostTbl[] = { { ISD::BITREVERSE, MVT::v4i64, 4 }, { ISD::BITREVERSE, MVT::v8i32, 4 }, @@ -1550,6 +1560,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, MVT MTy = LT.second; // Attempt to lookup cost. + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) return LT.first * Entry->Cost; diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 4480220f2cd4..417d57f7625b 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -347,6 +347,27 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, return FrameTy; } +// We need to make room to insert a spill after initial PHIs, but before +// catchswitch instruction. Placing it before violates the requirement that +// catchswitch, like all other EHPads must be the first nonPHI in a block. +// +// Split away catchswitch into a separate block and insert in its place: +// +// cleanuppad <InsertPt> cleanupret. +// +// cleanupret instruction will act as an insert point for the spill. +static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { + BasicBlock *CurrentBlock = CatchSwitch->getParent(); + BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch); + CurrentBlock->getTerminator()->eraseFromParent(); + + auto *CleanupPad = + CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock); + auto *CleanupRet = + CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock); + return CleanupRet; +} + // Replace all alloca and SSA values that are accessed across suspend points // with GetElementPointer from coroutine frame + loads and stores. Create an // AllocaSpillBB that will become the new entry block for the resume parts of @@ -437,8 +458,11 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { InsertPt = NewBB->getTerminator(); } else if (dyn_cast<PHINode>(CurrentValue)) { // Skip the PHINodes and EH pads instructions. - InsertPt = - &*cast<Instruction>(E.def())->getParent()->getFirstInsertionPt(); + BasicBlock *DefBlock = cast<Instruction>(E.def())->getParent(); + if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) + InsertPt = splitBeforeCatchSwitch(CSI); + else + InsertPt = &*DefBlock->getFirstInsertionPt(); } else { // For all other values, the spill is placed immediately after // the definition. 
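The cost-table lookups added in the X86TargetTransformInfo hunk above are deliberately ordered from the most to the least specific feature set (hasBWI() before hasAVX512() before hasXOP()), so a v32i16 bitreverse picks up the cheap AVX-512BW entry instead of falling through to a generic estimate. A minimal stand-in for that table-plus-priority-lookup shape (types and tags simplified; the real code uses llvm::CostTblEntry and MVT):

    #include <cstdio>

    struct CostEntry { int Op; int VT; int Cost; }; // simplified CostTblEntry
    enum { BITREVERSE = 1, v32i16 = 2, v16i32 = 3 }; // illustrative tags

    static const CostEntry BWTbl[]  = {{BITREVERSE, v32i16, 5}, {BITREVERSE, v16i32, 5}};
    static const CostEntry AVXTbl[] = {{BITREVERSE, v16i32, 24}};

    template <int N>
    static const CostEntry *lookup(const CostEntry (&Tbl)[N], int Op, int VT) {
      for (const CostEntry &E : Tbl)
        if (E.Op == Op && E.VT == VT)
          return &E;
      return nullptr;
    }

    static int cost(bool HasBWI, bool HasAVX512, int Op, int VT) {
      if (HasBWI)                     // most specific subtarget consulted first
        if (const CostEntry *E = lookup(BWTbl, Op, VT))
          return E->Cost;
      if (HasAVX512)
        if (const CostEntry *E = lookup(AVXTbl, Op, VT))
          return E->Cost;
      return 100;                     // fallback stand-in for the generic estimate
    }

    int main() {
      printf("%d %d\n", cost(true, true, BITREVERSE, v16i32),   // 5 (BW table wins)
                        cost(false, true, BITREVERSE, v16i32)); // 24
    }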
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 1424f61fe701..f88a2c6acc3f 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -74,6 +74,27 @@ static inline unsigned getComplexity(Value *V) { return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2; } +/// Predicate canonicalization reduces the number of patterns that need to be +/// matched by other transforms. For example, we may swap the operands of a +/// conditional branch or select to create a compare with a canonical (inverted) +/// predicate which is then more likely to be matched with other values. +static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::ICMP_NE: + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + // TODO: There are 16 FCMP predicates. Should others be (not) canonical? + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_OGE: + return false; + default: + return true; + } +} + /// \brief Add one to a Constant static inline Constant *AddOne(Constant *C) { return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 65b1148cb03b..7ed9fd566b37 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2210,37 +2210,17 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { return &BI; } - // Canonicalize fcmp_one -> fcmp_oeq - FCmpInst::Predicate FPred; Value *Y; - if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))), - TrueDest, FalseDest))) { - // TODO: Why are we only transforming these 3 predicates? - if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE || - FPred == FCmpInst::FCMP_OGE) { - FCmpInst *Cond = cast<FCmpInst>(BI.getCondition()); - Cond->setPredicate(FCmpInst::getInversePredicate(FPred)); - - // Swap Destinations and condition. - BI.swapSuccessors(); - Worklist.Add(Cond); - return &BI; - } - } - - // Canonicalize icmp_ne -> icmp_eq - ICmpInst::Predicate IPred; - if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))), - TrueDest, FalseDest))) { - if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE || - IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE || - IPred == ICmpInst::ICMP_SGE) { - ICmpInst *Cond = cast<ICmpInst>(BI.getCondition()); - Cond->setPredicate(ICmpInst::getInversePredicate(IPred)); - // Swap Destinations and condition. - BI.swapSuccessors(); - Worklist.Add(Cond); - return &BI; - } + // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq. + CmpInst::Predicate Pred; + if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), TrueDest, + FalseDest)) && + !isCanonicalPredicate(Pred)) { + // Swap destinations and condition. + CmpInst *Cond = cast<CmpInst>(BI.getCondition()); + Cond->setPredicate(CmpInst::getInversePredicate(Pred)); + BI.swapSuccessors(); + Worklist.Add(Cond); + return &BI; } return nullptr; @@ -3053,7 +3033,10 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, } } - InstrsForInstCombineWorklist.push_back(Inst); + // Skip processing debug intrinsics in InstCombine. Processing these call instructions + // consumes non-trivial amount of time and provides no value for the optimization. 
+ if (!isa<DbgInfoIntrinsic>(Inst)) + InstrsForInstCombineWorklist.push_back(Inst); } // Recursively visit successors. If this is a branch or switch on a diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 340c81fed0fd..37b9c4b1094e 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -546,7 +546,7 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, // If there are escaping uses of invariant.start instruction, the load maybe // non-invariant. if (!II || II->getIntrinsicID() != Intrinsic::invariant_start || - II->hasNUsesOrMore(1)) + !II->use_empty()) continue; unsigned InvariantSizeInBits = cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8; diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 6693a26e8890..cb6223b070a6 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1292,13 +1292,15 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() { BasicBlock *PH = CurLoop->getLoopPreheader(); Value *InitX = PhiX->getIncomingValueForBlock(PH); // If we check X != 0 before entering the loop we don't need a zero - // check in CTLZ intrinsic. - if (BasicBlock *PreCondBB = PH->getSinglePredecessor()) - if (BranchInst *PreCondBr = - dyn_cast<BranchInst>(PreCondBB->getTerminator())) { - if (matchCondition(PreCondBr, PH) == InitX) - ZeroCheck = true; - } + // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the + // loop (if it is used we count CTLZ(X >> 1)). + if (!IsCntPhiUsedOutsideLoop) + if (BasicBlock *PreCondBB = PH->getSinglePredecessor()) + if (BranchInst *PreCondBr = + dyn_cast<BranchInst>(PreCondBB->getTerminator())) { + if (matchCondition(PreCondBr, PH) == InitX) + ZeroCheck = true; + } // Check if CTLZ intrinsic is profitable. Assume it is always profitable // if we delete the loop (the loop has only 6 instructions): diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index ccedb98d7fa1..bd1f21c69eba 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3902,8 +3902,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Compute the difference between the two. int64_t Imm = (uint64_t)JImm - M->first; - for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; - LUIdx = UsedByIndices.find_next(LUIdx)) + for (unsigned LUIdx : UsedByIndices.set_bits()) // Make a memo of this use, offset, and register tuple. 
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index ccedb98d7fa1..bd1f21c69eba 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3902,8 +3902,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
 
     // Compute the difference between the two.
     int64_t Imm = (uint64_t)JImm - M->first;
-    for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
-         LUIdx = UsedByIndices.find_next(LUIdx))
+    for (unsigned LUIdx : UsedByIndices.set_bits())
       // Make a memo of this use, offset, and register tuple.
      if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
        WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 5e0a705782ea..0e7572f8d2e5 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -642,6 +642,7 @@ private:
   void updateProcessedCount(Value *V);
   void verifyMemoryCongruency() const;
   void verifyIterationSettled(Function &F);
+  void verifyStoreExpressions() const;
   bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
   BasicBlock *getBlockForValue(Value *V) const;
   void deleteExpression(const Expression *E) const;
@@ -2003,7 +2004,6 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
 
   // If it's not a memory use, set the MemoryAccess equivalence
   auto *InstMA = dyn_cast_or_null<MemoryDef>(MSSA->getMemoryAccess(I));
-  bool InstWasMemoryLeader = InstMA && OldClass->getMemoryLeader() == InstMA;
   if (InstMA)
     moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
   ValueToClass[I] = NewClass;
@@ -2029,31 +2029,6 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
     if (OldClass->getStoredValue())
       OldClass->setStoredValue(nullptr);
   }
-  // If we destroy the old access leader and it's a store, we have to
-  // effectively destroy the congruence class. When it comes to scalars,
-  // anything with the same value is as good as any other. That means that
-  // one leader is as good as another, and as long as you have some leader for
-  // the value, you are good.. When it comes to *memory states*, only one
-  // particular thing really represents the definition of a given memory
-  // state. Once it goes away, we need to re-evaluate which pieces of memory
-  // are really still equivalent. The best way to do this is to re-value
-  // number things. The only way to really make that happen is to destroy the
-  // rest of the class. In order to effectively destroy the class, we reset
-  // ExpressionToClass for each by using the ValueToExpression mapping. The
-  // members later get marked as touched due to the leader change. We will
-  // create new congruence classes, and the pieces that are still equivalent
-  // will end back together in a new class. If this becomes too expensive, it
-  // is possible to use a versioning scheme for the congruence classes to
-  // avoid the expressions finding this old class. Note that the situation is
-  // different for memory phis, becuase they are evaluated anew each time, and
-  // they become equal not by hashing, but by seeing if all operands are the
-  // same (or only one is reachable).
-  if (OldClass->getStoreCount() > 0 && InstWasMemoryLeader) {
-    DEBUG(dbgs() << "Kicking everything out of class " << OldClass->getID()
-                 << " because MemoryAccess leader changed");
-    for (auto Member : *OldClass)
-      ExpressionToClass.erase(ValueToExpression.lookup(Member));
-  }
   OldClass->setLeader(getNextValueLeader(OldClass));
   OldClass->resetNextLeader();
   markValueLeaderChangeTouched(OldClass);
@@ -2062,7 +2037,6 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
 
 // Perform congruence finding on a given value numbering expression.
 void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
-  ValueToExpression[I] = E;
   // This is guaranteed to return something, since it will at least find
   // TOP.
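A change that recurs throughout this import (DependenceAnalysis, LoopStrengthReduce, NewGVN) replaces explicit find_first/find_next loops with range-based iteration over SmallBitVector::set_bits(). A standalone sketch of why the two loop shapes are equivalent, using std::bitset stand-ins (findFirst, findNext, and setBits are illustrative helpers, not the LLVM API):

#include <bitset>
#include <cassert>
#include <cstddef>
#include <vector>

template <size_t N> static int findFirst(const std::bitset<N> &BV) {
  for (size_t I = 0; I < N; ++I)
    if (BV[I]) return static_cast<int>(I);
  return -1;
}

template <size_t N> static int findNext(const std::bitset<N> &BV, size_t Prev) {
  for (size_t I = Prev + 1; I < N; ++I)
    if (BV[I]) return static_cast<int>(I);
  return -1;
}

// Stand-in for the lazy range that set_bits() yields: the set bit indices.
template <size_t N>
static std::vector<unsigned> setBits(const std::bitset<N> &BV) {
  std::vector<unsigned> Idx;
  for (size_t I = 0; I < N; ++I)
    if (BV[I]) Idx.push_back(static_cast<unsigned>(I));
  return Idx;
}

int main() {
  std::bitset<16> Loops;
  Loops.set(1); Loops.set(4); Loops.set(9);

  // Old shape: signed sentinel-terminated find_first/find_next iteration.
  std::vector<unsigned> Old;
  for (int LI = findFirst(Loops); LI >= 0; LI = findNext(Loops, LI))
    Old.push_back(static_cast<unsigned>(LI));

  // New shape: range-based iteration over the set bits.
  std::vector<unsigned> New;
  for (unsigned LI : setBits(Loops))
    New.push_back(LI);

  assert(Old == New);
}

Besides being shorter, the range form removes the signed/unsigned index mismatch and the easy-to-get-wrong sentinel comparison at every call site.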
@@ -2132,6 +2106,18 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
     if (auto *CI = dyn_cast<CmpInst>(I))
       markPredicateUsersTouched(CI);
   }
+  // If we changed the class of the store, we want to ensure nothing finds the
+  // old store expression. In particular, loads do not compare against stored
+  // value, so they will find old store expressions (and associated class
+  // mappings) if we leave them in the table.
+  if (ClassChanged && isa<StoreExpression>(E)) {
+    auto *OldE = ValueToExpression.lookup(I);
+    // It could just be that the old class died. We don't want to erase it if
+    // we just moved classes.
+    if (OldE && isa<StoreExpression>(OldE) && !OldE->equals(*E))
+      ExpressionToClass.erase(OldE);
+  }
+  ValueToExpression[I] = E;
 }
 
 // Process the fact that Edge (from, to) is reachable, including marking
@@ -2651,6 +2637,30 @@ void NewGVN::verifyIterationSettled(Function &F) {
 #endif
 }
 
+// Verify that for each store expression in the expression to class mapping,
+// only the latest appears, and multiple ones do not appear.
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+  DenseSet<std::pair<const Value *, const Value *>> StoreExpressionSet;
+  for (const auto &KV : ExpressionToClass) {
+    if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+      // Make sure a version that will conflict with loads is not already there.
+      auto Res =
+          StoreExpressionSet.insert({SE->getOperand(0), SE->getMemoryLeader()});
+      assert(Res.second &&
+             "Stored expression conflict exists in expression table");
+      auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+      assert(ValueExpr && ValueExpr->equals(*SE) &&
+             "StoreExpression in ExpressionToClass is not latest "
+             "StoreExpression for value");
+    }
+  }
+#endif
+}
+
 // This is the main value numbering loop; it iterates over the initial touched
 // instruction set, propagating value numbers, marking things touched, etc,
 // until the set of touched instructions is completely empty.
@@ -2668,8 +2678,7 @@ void NewGVN::iterateTouchedInstructions() {
     // TODO: As we hit a new block, we should push and pop equalities into a
     // table lookupOperandLeader can use, to catch things PredicateInfo
     // might miss, like edge-only equivalences.
-    for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
-         InstrNum = TouchedInstructions.find_next(InstrNum)) {
+    for (unsigned InstrNum : TouchedInstructions.set_bits()) {
 
       // This instruction was found to be dead. We don't bother looking
       // at it again.
@@ -2776,6 +2785,7 @@ bool NewGVN::runGVN() {
   iterateTouchedInstructions();
   verifyMemoryCongruency();
   verifyIterationSettled(F);
+  verifyStoreExpressions();
 
   Changed |= eliminateInstructions(F);
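The NewGVN hunks above erase a stale store expression from the expression-to-class table when the store's class changes, and add verifyStoreExpressions() to assert that no stale entries survive. A toy sketch of that invariant using std::map and std::set (all names and the string "expressions" are illustrative, not NewGVN's actual types):

#include <cassert>
#include <map>
#include <set>
#include <string>

int main() {
  std::map<std::string, int> ExprToClass;  // expression key -> congruence class
  std::map<int, std::string> ValueToExpr;  // value id -> its latest expression

  // Value 7 is a store initially numbered as "store p, v1" in class 1.
  ValueToExpr[7] = "store p, v1";
  ExprToClass["store p, v1"] = 1;

  // Re-numbering moves value 7 to a new expression and class. The old key is
  // erased first, precisely because leaving it in the table would let a later
  // lookup (e.g. from a load) resolve to the dead class.
  ExprToClass.erase(ValueToExpr[7]);
  ValueToExpr[7] = "store p, v2";
  ExprToClass["store p, v2"] = 2;

  // Verifier shape: every store expression in the table must be unique;
  // set::insert reports a collision through its bool result.
  std::set<std::string> Seen;
  for (const auto &KV : ExprToClass) {
    bool Inserted = Seen.insert(KV.first).second;
    assert(Inserted && "stale duplicate store expression left in table");
  }
  assert(ExprToClass.count("store p, v1") == 0 && "old mapping was erased");
}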
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index ef29d4141600..53320bff0883 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -1922,7 +1922,7 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
 
   // User must be a binary operator with one or more uses.
   Instruction *User = I->user_back();
-  if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
+  if (!isa<BinaryOperator>(User) || User->use_empty())
     return nullptr;
 
   unsigned UserOpcode = User->getOpcode();
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 4f608c97147d..b32a61a7e8f8 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1,4 +1,4 @@
-//===-- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow --------===//
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -7,25 +7,41 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
 
 #define DEBUG_TYPE "simple-loop-unswitch"
@@ -174,7 +190,7 @@ static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
   // When the loop exit is directly unswitched we just need to update the
   // incoming basic block. We loop to handle weird cases with repeated
   // incoming blocks, but expect to typically only have one operand here.
-  for (auto i : llvm::seq<int>(0, PN->getNumOperands())) {
+  for (auto i : seq<int>(0, PN->getNumOperands())) {
     assert(PN->getIncomingBlock(i) == &OldExitingBB &&
            "Found incoming block different from unique predecessor!");
     PN->setIncomingBlock(i, &OldPH);
@@ -688,9 +704,11 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
 }
 
 namespace {
+
 class SimpleLoopUnswitchLegacyPass : public LoopPass {
 public:
   static char ID; // Pass ID, replacement for typeid
+
   explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) {
     initializeSimpleLoopUnswitchLegacyPassPass(
         *PassRegistry::getPassRegistry());
@@ -703,7 +721,8 @@ public:
     getLoopAnalysisUsage(AU);
   }
 };
-} // namespace
+
+} // end anonymous namespace
 
 bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (skipLoop(L))
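The SimpleLoopUnswitch hunk above visits PHI operand indices with seq<int>(0, N), a half-open integer range from llvm/ADT/Sequence.h. A minimal stand-in for that range (the Seq type and seq function here are illustrative, not llvm::seq's actual implementation):

#include <cassert>
#include <vector>

struct Seq {
  int Begin, End;
  struct Iter {
    int V;
    int operator*() const { return V; }
    Iter &operator++() { ++V; return *this; }
    bool operator!=(const Iter &O) const { return V != O.V; }
  };
  Iter begin() const { return {Begin}; }
  Iter end() const { return {End}; }
};

static Seq seq(int Begin, int End) { return {Begin, End}; }

int main() {
  std::vector<int> Visited;
  // Equivalent of: for (auto i : seq<int>(0, PN->getNumOperands()))
  for (int i : seq(0, 4))
    Visited.push_back(i);
  assert((Visited == std::vector<int>{0, 1, 2, 3}));
}

Compared with a raw for loop, the range form keeps the induction variable immutable inside the body and reads the same as the other range-based loops introduced throughout this import.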