Diffstat (limited to 'llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp')
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp  167
1 file changed, 114 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index e05699cc95ec..53556ffc267d 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -33,6 +33,10 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
cl::desc("Enable using coldcc calling conv for cold "
"internal functions"));
+static cl::opt<bool>
+LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Do not add instruction count to lsr cost model"));
+
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
@@ -55,9 +59,10 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
- return BaseT::getIntImmCost(Imm, Ty);
+ return BaseT::getIntImmCost(Imm, Ty, CostKind);
assert(Ty->isIntegerTy());
@@ -85,9 +90,10 @@ int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
- return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty);
+ return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
assert(Ty->isIntegerTy());
@@ -115,13 +121,14 @@ int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return PPCTTIImpl::getIntImmCost(Imm, Ty);
+ return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
- return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty);
+ return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind);
assert(Ty->isIntegerTy());
@@ -199,22 +206,28 @@ int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return TTI::TCC_Free;
}
- return PPCTTIImpl::getIntImmCost(Imm, Ty);
+ return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned PPCTTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
+unsigned
+PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
+ // getCastInstrCost and getMemoryOpCost already perform the vector
+ // adjustment, so send casts, loads and stores to the base implementation.
+ if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
+ return BaseT::getUserCost(U, Operands, CostKind);
+
if (U->getType()->isVectorTy()) {
// Instructions that need to be split should cost more.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
- return LT.first * BaseT::getUserCost(U, Operands);
+ return LT.first * BaseT::getUserCost(U, Operands, CostKind);
}
- return BaseT::getUserCost(U, Operands);
+ return BaseT::getUserCost(U, Operands, CostKind);
}
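
For the vector branch above: getTypeLegalizationCost(DL, Ty).first is the number of legal-width pieces the type legalizer splits the value into, and the user cost scales by that factor. A standalone sketch of the arithmetic, assuming an illustrative 128-bit legal vector width (the helper name is hypothetical, not LLVM's):

  #include <cassert>

  // splitScaledCost is a hypothetical stand-in for the LT.first scaling
  // above: cost grows with the number of legal-width registers that an
  // illegal vector type is split into.
  static unsigned splitScaledCost(unsigned BaseCost, unsigned TypeBits,
                                  unsigned LegalBits = 128) {
    unsigned NumParts = (TypeBits + LegalBits - 1) / LegalBits; // ceil
    return NumParts * BaseCost;
  }

  int main() {
    // <8 x i64> is 512 bits, i.e. four 128-bit VSX registers: 4x the cost.
    assert(splitScaledCost(/*BaseCost=*/1, /*TypeBits=*/512) == 4);
  }
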
-bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
- TargetLibraryInfo *LibInfo) {
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
+ SmallPtrSetImpl<const Value *> &Visited) {
const PPCTargetMachine &TM = ST->getTargetMachine();
// Loop through the inline asm constraints and look for something that
@@ -233,8 +246,11 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
// Determining the address of a TLS variable results in a function call in
// certain TLS models.
- std::function<bool(const Value*)> memAddrUsesCTR =
- [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+ std::function<bool(const Value *)> memAddrUsesCTR =
+ [&memAddrUsesCTR, &TM, &Visited](const Value *MemAddr) -> bool {
+ // No need to traverse again if we already checked this operand.
+ if (!Visited.insert(MemAddr).second)
+ return false;
const auto *GV = dyn_cast<GlobalValue>(MemAddr);
if (!GV) {
// Recurse to check for constants that refer to TLS global variables.
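
The Visited set threaded through this hunk relies on SmallPtrSet::insert returning an (iterator, inserted) pair; the bool is false when the value was already present, so shared ConstantExpr subtrees are walked only once. A minimal sketch of the idiom against LLVM headers (walkOnce and its trivial predicate are hypothetical):

  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/User.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Hypothetical walker: visits each reachable value exactly once, even if
  // it is an operand of many users, by keying the recursion on Visited.
  static bool walkOnce(const Value *V,
                       SmallPtrSetImpl<const Value *> &Visited) {
    if (!Visited.insert(V).second)
      return false; // Already examined on an earlier path; cut off here.
    if (const auto *U = dyn_cast<User>(V))
      for (const Value *Op : U->operands())
        if (walkOnce(Op, Visited))
          return true;
    return false; // A real per-node predicate on V would replace this.
  }
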
@@ -264,7 +280,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
J != JE; ++J) {
if (CallInst *CI = dyn_cast<CallInst>(J)) {
// Inline ASM is okay, unless it clobbers the ctr register.
- if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+ if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
if (asmClobbersCTR(IA))
return true;
continue;
@@ -277,7 +293,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
switch (F->getIntrinsicID()) {
default: continue;
- // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
+ // If we have a call to loop_decrement or set_loop_iterations,
// we're definitely using CTR.
case Intrinsic::set_loop_iterations:
case Intrinsic::loop_decrement:
@@ -308,6 +324,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
return true;
else
continue; // ISD::FCOPYSIGN is never a library call.
+ case Intrinsic::fma: Opcode = ISD::FMA; break;
case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
@@ -412,8 +429,9 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
return true;
} else if (isa<BinaryOperator>(J) &&
- J->getType()->getScalarType()->isPPC_FP128Ty()) {
- // Most operations on ppc_f128 values become calls.
+ (J->getType()->getScalarType()->isFP128Ty() ||
+ J->getType()->getScalarType()->isPPC_FP128Ty())) {
+ // Most operations on f128 or ppc_f128 values become calls.
return true;
} else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
@@ -498,9 +516,10 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
// We don't want to spill/restore the counter register, and so we don't
// want to use the counter register if the loop contains calls.
+ SmallPtrSet<const Value *, 4> Visited;
for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
I != IE; ++I)
- if (mightUseCTR(*I, LibInfo))
+ if (mightUseCTR(*I, LibInfo, Visited))
return false;
SmallVector<BasicBlock*, 4> ExitingBlocks;
@@ -549,6 +568,10 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getUnrollingPreferences(L, SE, UP);
}
+void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
@@ -637,11 +660,12 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
if (CacheLineSize.getNumOccurrences() > 0)
return CacheLineSize;
- // On P7, P8 or P9 we have a cache line size of 128.
+ // Starting with P7 we have a cache line size of 128.
unsigned Directive = ST->getCPUDirective();
// Assume that Future CPU has the same cache line size as the others.
if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
- Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+ Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+ Directive == PPC::DIR_PWR_FUTURE)
return 128;
// On other processors return a default of 64 bytes.
@@ -673,9 +697,11 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// For P7 and P8, floating-point instructions have a 6-cycle latency and
// there are two execution units, so unroll by 12x for latency hiding.
// FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+ // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
// Assume that future is the same as the others.
if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
- Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+ Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+ Directive == PPC::DIR_PWR_FUTURE)
return 12;
// For most things, modern systems have two execution units (and
@@ -711,6 +737,7 @@ int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
}
int PPCTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
@@ -718,9 +745,15 @@ int PPCTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
// Fallback to the default implementation.
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info,
Opd1PropInfo, Opd2PropInfo);
return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
}
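
The early bail-out added in this hunk is the recurring shape for a target that only tunes reciprocal throughput: every other cost kind is routed to the generic model. A self-contained toy of that gate (the enum values and both cost models here are illustrative, not LLVM's):

  #include <cstdio>

  enum class CostKind { RecipThroughput, Latency, CodeSize, SizeAndLatency };

  static int genericCost(CostKind) { return 1; }  // stand-in for BaseT
  static int tunedThroughputCost() { return 4; }  // stand-in for the PPC model

  static int getCost(CostKind K) {
    if (K != CostKind::RecipThroughput)
      return genericCost(K);      // size/latency queries: generic estimate
    return tunedThroughputCost(); // throughput query: target-tuned estimate
  }

  int main() {
    std::printf("%d %d\n", getCost(CostKind::CodeSize),
                getCost(CostKind::RecipThroughput)); // prints "1 4"
  }
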
@@ -739,17 +772,33 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
nullptr);
}
+int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ // Branches are assumed to be predicted.
+ return 0;
+}
+
int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
- int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
- return vectorCostAdjustment(Cost, Opcode, Dst, Src);
+ int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+ Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src);
+ // TODO: Allow non-throughput costs that aren't binary.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
}
int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
- int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost;
return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}
@@ -828,13 +877,22 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
- int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+ int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost;
+
Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);
bool IsAltivecType = ST->hasAltivec() &&
@@ -856,7 +914,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Aligned loads and stores are easy.
unsigned SrcBytes = LT.second.getStoreSize();
- if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+ if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
return Cost;
// If we can use the permutation-based load sequence, then this is also
@@ -868,7 +926,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// longer true.
if (Opcode == Instruction::Load &&
((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
- Alignment >= LT.second.getScalarType().getStoreSize())
+ *Alignment >= LT.second.getScalarType().getStoreSize())
return Cost + LT.first; // Add the cost of the permutations.
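
The two * dereferences added just above follow from MaybeAlign being an optional: the earlier !Alignment test proves it holds a value, after which the wrapped alignment must be unwrapped before a numeric comparison. The same shape with std::optional standing in for MaybeAlign (a sketch, not the LLVM API):

  #include <cstdint>
  #include <optional>

  // std::optional<uint64_t> plays the role of llvm::MaybeAlign here.
  static bool naturallyAligned(std::optional<uint64_t> Alignment,
                               uint64_t StoreBytes) {
    if (!StoreBytes || !Alignment)
      return true;                    // unknown alignment: treated as aligned
    return *Alignment >= StoreBytes;  // unwrap before comparing
  }
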
// For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
@@ -893,22 +951,20 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// stores, loads are expanded using the vector-load + permutation sequence,
// which is much less expensive).
if (Src->isVectorTy() && Opcode == Instruction::Store)
- for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+ for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
+ ++i)
Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
return Cost;
}
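
The switch from Src->getVectorNumElements() to cast<FixedVectorType>(Src)->getNumElements() in the loop above reflects scalable vectors entering the type system: only fixed-width vectors have a compile-time element count. A guarded variant of the query against LLVM headers (the helper name is hypothetical):

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  // Hypothetical helper: returns the element count for fixed-width vectors
  // and 0 for scalable vectors or non-vector types, where no fixed count
  // exists and an unconditional cast would be invalid.
  static unsigned fixedElementCountOrZero(Type *Ty) {
    if (auto *FVT = dyn_cast<FixedVectorType>(Ty))
      return FVT->getNumElements(); // e.g. 4 for <4 x float>
    return 0;
  }
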
-int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int PPCTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
@@ -919,7 +975,8 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// Firstly, the cost of load/store operation.
int Cost =
- getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace);
+ getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
+ CostKind);
// PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
// (at least in the sense that there need only be one non-loop-invariant
@@ -931,18 +988,9 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
return Cost;
}
-unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
-}
-
-unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type*> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
- if (ID == Intrinsic::bswap && ST->hasP9Vector())
- return TLI->getTypeLegalizationCost(DL, RetTy).first;
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
- ScalarizationCostPassed);
+unsigned PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
@@ -967,3 +1015,16 @@ bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
*BI = HWLoopInfo.ExitBranch;
return true;
}
+
+bool PPCTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+ TargetTransformInfo::LSRCost &C2) {
+ // PowerPC default behaviour here is "instruction number 1st priority".
+ // If LsrNoInsnsCost is set, call default implementation.
+ if (!LsrNoInsnsCost)
+ return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
+ C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+ std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
+ C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+ else
+ return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
+}
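
The std::tie comparison above implements a lexicographic order: Insns dominates, and each later field only breaks ties among the earlier ones, which is how "instruction number 1st priority" is encoded. A reduced, self-contained illustration (MiniCost is a two-field stand-in, not LSRCost itself):

  #include <cassert>
  #include <tuple>

  struct MiniCost { unsigned Insns; unsigned NumRegs; };

  // Instruction count first; register pressure only breaks ties.
  static bool lessInsnsFirst(const MiniCost &A, const MiniCost &B) {
    return std::tie(A.Insns, A.NumRegs) < std::tie(B.Insns, B.NumRegs);
  }

  int main() {
    assert(lessInsnsFirst({3, 9}, {4, 1})); // fewer insns wins outright
    assert(lessInsnsFirst({4, 1}, {4, 2})); // regs decide only on a tie
  }
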