author     Dimitry Andric <dim@FreeBSD.org>            2019-08-21 18:13:02 +0000
committer  Dimitry Andric <dim@FreeBSD.org>            2019-08-21 18:13:02 +0000
commit     54db30ce18663e6c2991958f3b5d18362e8e93c4 (patch)
tree       4aa6442802570767398cc83ba484e97b1309bdc2 /contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
parent     35284c22e9c8348159b7ce032ea45f2cdeb65298 (diff)
parent     e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff)
Merge llvm trunk r366426, resolve conflicts, and update FREEBSD-Xlist.
Notes:
svn path=/projects/clang900-import/; revision=351344
Diffstat (limited to 'contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp')
-rw-r--r--   contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp   449
1 file changed, 423 insertions(+), 26 deletions(-)
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index bc9bcab83a0a..ff3dfbfaca05 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1,17 +1,18 @@
 //===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
 #include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 using namespace llvm;
@@ -32,6 +33,13 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                 cl::desc("Enable using coldcc calling conv for cold "
                          "internal functions"));
 
+// The latency of mtctr is only justified if there are more than 4
+// comparisons that will be removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+  cl::desc("Loops with a constant trip count smaller than "
+           "this value will not use the count register."));
+
 //===----------------------------------------------------------------------===//
 //
 // PPC cost model.
@@ -205,6 +213,341 @@ unsigned PPCTTIImpl::getUserCost(const User *U,
   return BaseT::getUserCost(U, Operands);
 }
 
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
+                             TargetLibraryInfo *LibInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+
+  // Loop through the inline asm constraints and look for something that
+  // clobbers ctr.
+  auto asmClobbersCTR = [](InlineAsm *IA) {
+    InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+    for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+      InlineAsm::ConstraintInfo &C = CIV[i];
+      if (C.Type != InlineAsm::isInput)
+        for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+          if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+            return true;
+    }
+    return false;
+  };
+
+  // Determining the address of a TLS variable results in a function call in
+  // certain TLS models.
+  std::function<bool(const Value*)> memAddrUsesCTR =
+    [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+    const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+    if (!GV) {
+      // Recurse to check for constants that refer to TLS global variables.
+      if (const auto *CV = dyn_cast<Constant>(MemAddr))
+        for (const auto &CO : CV->operands())
+          if (memAddrUsesCTR(CO))
+            return true;
+
+      return false;
+    }
+
+    if (!GV->isThreadLocal())
+      return false;
+    TLSModel::Model Model = TM.getTLSModel(GV);
+    return Model == TLSModel::GeneralDynamic ||
+      Model == TLSModel::LocalDynamic;
+  };
+
+  auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+      return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+    return false;
+  };
+
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      // Inline ASM is okay, unless it clobbers the ctr register.
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        if (asmClobbersCTR(IA))
+          return true;
+        continue;
+      }
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode = 0;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
+          // we're definitely using CTR.
+          case Intrinsic::set_loop_iterations:
+          case Intrinsic::loop_decrement:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+# pragma push_macro("setjmp")
+# undef setjmp
+# define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+# pragma pop_macro("setjmp")
+# undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+          case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+          case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+          case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+          case Intrinsic::rint: Opcode = ISD::FRINT; break;
+          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round: Opcode = ISD::FROUND; break;
+          case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
+          case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc_copysign:
+          case LibFunc_copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc_copysignl:
+            return true;
+          case LibFunc_fabs:
+          case LibFunc_fabsf:
+          case LibFunc_fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc_sqrt:
+          case LibFunc_sqrtf:
+          case LibFunc_sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc_floor:
+          case LibFunc_floorf:
+          case LibFunc_floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc_nearbyint:
+          case LibFunc_nearbyintf:
+          case LibFunc_nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc_ceil:
+          case LibFunc_ceilf:
+          case LibFunc_ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc_rint:
+          case LibFunc_rintf:
+          case LibFunc_rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc_round:
+          case LibFunc_roundf:
+          case LibFunc_roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc_trunc:
+          case LibFunc_truncf:
+          case LibFunc_truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc_fmin:
+          case LibFunc_fminf:
+          case LibFunc_fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc_fmax:
+          case LibFunc_fmaxf:
+          case LibFunc_fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        if (Opcode) {
+          EVT EVTy =
+              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+          if (EVTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+            continue;
+          else if (EVTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(!TM.isPPC64(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (!TM.isPPC64() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    // FREM is always a call.
+    if (J->getOpcode() == Instruction::FRem)
+      return true;
+
+    if (ST->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(Operand))
+        return true;
+  }
+
+  return false;
+}
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          HardwareLoopInfo &HWLoopInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST);
+
+  // Do not convert small short loops to CTR loop.
+  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *this, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(*I, LibInfo))
+      return false;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // If there is an exit edge known to be frequently taken,
+  // we should not transform this loop.
+  for (auto &BB : ExitingBlocks) {
+    Instruction *TI = BB->getTerminator();
+    if (!TI) continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      uint64_t TrueWeight = 0, FalseWeight = 0;
+      if (!BI->isConditional() ||
+          !BI->extractProfMetadata(TrueWeight, FalseWeight))
+        continue;
+
+      // If the exit path is more frequent than the loop path,
+      // we return here without further analysis for this loop.
+      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+      if (( TrueIsExit && FalseWeight < TrueWeight) ||
+          (!TrueIsExit && FalseWeight > TrueWeight))
+        return false;
+    }
+  }
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CountType = TM.isPPC64() ?
+    Type::getInt64Ty(C) : Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
@@ -239,17 +582,12 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   return LoopHasReductions;
 }
 
-const PPCTTIImpl::TTI::MemCmpExpansionOptions *
-PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
-  static const auto Options = []() {
-    TTI::MemCmpExpansionOptions Options;
-    Options.LoadSizes.push_back(8);
-    Options.LoadSizes.push_back(4);
-    Options.LoadSizes.push_back(2);
-    Options.LoadSizes.push_back(1);
-    return Options;
-  }();
-  return &Options;
+PPCTTIImpl::TTI::MemCmpExpansionOptions
+PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+  Options.LoadSizes = {8, 4, 2, 1};
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  return Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
@@ -324,6 +662,33 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   return 2;
 }
 
+// Adjust the cost of vector instructions on targets which there is overlap
+// between the vector and scalar units, thereby reducing the overall throughput
+// of vector code wrt. scalar code.
+int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
+                                     Type *Ty2) {
+  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
+    return Cost;
+
+  std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
+  // If type legalization involves splitting the vector, we don't want to
+  // double the cost at every step - only the last step.
+  if (LT1.first != 1 || !LT1.second.isVector())
+    return Cost;
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  if (TLI->isOperationExpand(ISD, LT1.second))
+    return Cost;
+
+  if (Ty2) {
+    std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
+    if (LT2.first != 1 || !LT2.second.isVector())
+      return Cost;
+  }
+
+  return Cost * 2;
+}
+
 int PPCTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
@@ -331,8 +696,9 @@ int PPCTTIImpl::getArithmeticInstrCost(
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
   // Fallback to the default implementation.
-  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
-                                       Opd1PropInfo, Opd2PropInfo);
+  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                           Opd1PropInfo, Opd2PropInfo);
+  return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
 }
 
 int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -345,19 +711,22 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   // instruction). We need one such shuffle instruction for each actual
   // register (this is not true for arbitrary shuffles, but is true for the
   // structured types of shuffles covered by TTI::ShuffleKind).
-  return LT.first;
+  return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
+                              nullptr);
 }
 
 int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                  const Instruction *I) {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
+  return vectorCostAdjustment(Cost, Opcode, Dst, Src);
 }
 
 int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                    const Instruction *I) {
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+  int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+  return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}
 
 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -366,18 +735,23 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
+  Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);
+
   if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
-    // Double-precision scalars are already located in index #0.
-    if (Index == 0)
+    // Double-precision scalars are already located in index #0 (or #1 if LE).
+    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
+        Index == (ST->isLittleEndian() ? 1 : 0))
       return 0;
 
-    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+    return Cost;
+
   } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
     // Floating point scalars are already located in index #0.
     if (Index == 0)
       return 0;
 
-    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+    return Cost;
   }
 
   // Estimated cost of a load-hit-store delay. This was obtained
@@ -394,9 +768,9 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   // these need to be estimated as very costly.
   if (ISD == ISD::EXTRACT_VECTOR_ELT ||
       ISD == ISD::INSERT_VECTOR_ELT)
-    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
+    return LHSPenalty + Cost;
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  return Cost;
 }
 
 int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
@@ -407,6 +781,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
          "Invalid Opcode");
 
   int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+  Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);
 
   bool IsAltivecType = ST->hasAltivec() &&
                        (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
@@ -500,3 +875,25 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   return Cost;
 }
 
+bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
+                            LoopInfo *LI, DominatorTree *DT,
+                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    if (canSaveCmp(*I, BI, SE, LI, DT, AC, LibInfo))
+      return false; // Stop search.
+
+  HardwareLoopInfo HWLoopInfo(L);
+
+  if (!HWLoopInfo.canAnalyze(*LI))
+    return false;
+
+  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
+    return false;
+
+  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
+    return false;
+
+  *BI = HWLoopInfo.ExitBranch;
+  return true;
+}
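
The most interesting new policy in this diff is the small-loop guard at the top of PPCTTIImpl::isHardwareLoopProfitable(): a loop with a known, small trip count is only converted into a CTR hardware loop when its body contains enough instructions to amortize the latency of mtctr. The standalone C++ sketch below restates that check outside of LLVM, assuming the fixed latency of 6 and the default -min-ctr-loop-threshold of 4 that appear in the diff; the function and parameter names (worthUsingCTR, tripCount, numLoopInsts, issueWidth) are illustrative, not part of the in-tree API.

#include <iostream>

// Hypothetical restatement of the small-CTR-loop rule from
// PPCTTIImpl::isHardwareLoopProfitable(): a loop with a known, small trip
// count is only worth converting to a hardware (CTR) loop when its body has
// enough instructions to hide the mtctr latency.
bool worthUsingCTR(unsigned tripCount,     // constant trip count, 0 if unknown
                   unsigned numLoopInsts,  // instructions in the loop body
                   unsigned issueWidth) {  // instructions issued per cycle
  const unsigned SmallCTRLoopThreshold = 4; // default of -min-ctr-loop-threshold
  const unsigned MtctrLatency = 6;          // approximate latency used in the diff

  // Unknown or large trip counts fall through to the other profitability checks.
  if (tripCount == 0 || tripCount >= SmallCTRLoopThreshold)
    return true;

  // For short constant-trip-count loops, require enough work per iteration.
  return numLoopInsts > MtctrLatency * issueWidth;
}

int main() {
  // A 2-iteration loop with 8 instructions on a 4-wide core: not worth it.
  std::cout << worthUsingCTR(2, 8, 4) << '\n';   // prints 0
  // The same short loop with 40 instructions per iteration: convert it.
  std::cout << worthUsingCTR(2, 40, 4) << '\n';  // prints 1
}

In the real pass, the instruction count comes from CodeMetrics over the loop blocks and the issue width from TargetSchedModel, and the rest of isHardwareLoopProfitable() (the mightUseCTR() scan for calls and CTR clobbers, plus the profile-weighted exit-edge check) still has to pass before the loop is converted.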