author		Dimitry Andric <dim@FreeBSD.org>	2014-11-24 09:08:18 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2014-11-24 09:08:18 +0000
commit	5ca98fd98791947eba83a1ed3f2c8191ef7afa6c
tree	f5944309621cee4fe0976be6f9ac619b7ebfc4c2	/lib/CodeGen/BasicTargetTransformInfo.cpp
parent	68bcb7db193e4bc81430063148253d30a791023e
Vendor import of llvm RELEASE_350/final tag r216957 (effectively, 3.5.0 release)
(tag: vendor/llvm/llvm-release_350-r216957)
Notes:
    svn path=/vendor/llvm/dist/; revision=274955
    svn path=/vendor/llvm/llvm-release_35-r216957/; revision=274956; tag=vendor/llvm/llvm-release_350-r216957
Diffstat (limited to 'lib/CodeGen/BasicTargetTransformInfo.cpp')
-rw-r--r--	lib/CodeGen/BasicTargetTransformInfo.cpp	225
1 file changed, 169 insertions(+), 56 deletions(-)
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 24aa1abffa5b..b2737bf754f9 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -15,27 +15,37 @@
///
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "basictti"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include <utility>
-
using namespace llvm;
+static cl::opt<unsigned>
+PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+ cl::desc("Threshold for partial unrolling"), cl::Hidden);
+
+#define DEBUG_TYPE "basictti"
+
namespace {
-class BasicTTI : public ImmutablePass, public TargetTransformInfo {
+class BasicTTI final : public ImmutablePass, public TargetTransformInfo {
const TargetMachine *TM;
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+ /// Estimate the cost overhead of SK_Alternate shuffle.
+ unsigned getAltShuffleOverhead(Type *Ty) const;
+
const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
public:
- BasicTTI() : ImmutablePass(ID), TM(0) {
+ BasicTTI() : ImmutablePass(ID), TM(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
}
@@ -43,15 +53,11 @@ public:
initializeBasicTTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() {
+ void initializePass() override {
pushTTIStack(this);
}
- virtual void finalizePass() {
- popTTIStack();
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -59,61 +65,61 @@ public:
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo*)this;
return this;
}
- virtual bool hasBranchDivergence() const;
+ bool hasBranchDivergence() const override;
/// \name Scalar TTI Implementations
/// @{
- virtual bool isLegalAddImmediate(int64_t imm) const;
- virtual bool isLegalICmpImmediate(int64_t imm) const;
- virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
- int64_t BaseOffset, bool HasBaseReg,
- int64_t Scale) const;
- virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
- int64_t BaseOffset, bool HasBaseReg,
- int64_t Scale) const;
- virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
- virtual bool isTypeLegal(Type *Ty) const;
- virtual unsigned getJumpBufAlignment() const;
- virtual unsigned getJumpBufSize() const;
- virtual bool shouldBuildLookupTables() const;
- virtual bool haveFastSqrt(Type *Ty) const;
- virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+ bool isLegalAddImmediate(int64_t imm) const override;
+ bool isLegalICmpImmediate(int64_t imm) const override;
+ bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
+ int64_t BaseOffset, bool HasBaseReg,
+ int64_t Scale) const override;
+ int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+ int64_t BaseOffset, bool HasBaseReg,
+ int64_t Scale) const override;
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTypeLegal(Type *Ty) const override;
+ unsigned getJumpBufAlignment() const override;
+ unsigned getJumpBufSize() const override;
+ bool shouldBuildLookupTables() const override;
+ bool haveFastSqrt(Type *Ty) const override;
+ void getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const override;
/// @}
/// \name Vector TTI Implementations
/// @{
- virtual unsigned getNumberOfRegisters(bool Vector) const;
- virtual unsigned getMaximumUnrollFactor() const;
- virtual unsigned getRegisterBitWidth(bool Vector) const;
- virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind,
- OperandValueKind) const;
- virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
- int Index, Type *SubTp) const;
- virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const;
- virtual unsigned getCFInstrCost(unsigned Opcode) const;
- virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const;
- virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const;
- virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) const;
- virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
- ArrayRef<Type*> Tys) const;
- virtual unsigned getNumberOfParts(Type *Tp) const;
- virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
- virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) const;
+ unsigned getNumberOfRegisters(bool Vector) const override;
+ unsigned getMaximumUnrollFactor() const override;
+ unsigned getRegisterBitWidth(bool Vector) const override;
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+ OperandValueKind) const override;
+ unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) const override;
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const override;
+ unsigned getCFInstrCost(unsigned Opcode) const override;
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const override;
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const override;
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const override;
+ unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
+ ArrayRef<Type*> Tys) const override;
+ unsigned getNumberOfParts(Type *Tp) const override;
+ unsigned getAddressComputationCost( Type *Ty, bool IsComplex) const override;
+ unsigned getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) const override;
/// @}
};
@@ -191,7 +197,61 @@ bool BasicTTI::haveFastSqrt(Type *Ty) const {
return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
}
-void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+void BasicTTI::getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const {
+ // This unrolling functionality is target independent, but to provide some
+ // motivation for its intended use, for x86:
+
+ // According to the Intel 64 and IA-32 Architectures Optimization Reference
+ // Manual, Intel Core models and later have a loop stream detector
+ // (and associated uop queue) that can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have no more than 4 (8 for Nehalem and later) branches
+ // taken, and none of them may be calls.
+ // - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+ // According to the Software Optimization Guide for AMD Family 15h Processors,
+ // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+ // buffer which can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have fewer than 16 branches
+ // - The loop must have less than 40 uops in all executed loop branches
+
+ // The number of taken branches in a loop is hard to estimate here, and
+ // benchmarking has revealed that it is better not to be conservative when
+ // estimating the branch count. As a result, we'll ignore the branch limits
+ // until someone finds a case where it matters in practice.
+
+ unsigned MaxOps;
+ const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>();
+ if (PartialUnrollingThreshold.getNumOccurrences() > 0)
+ MaxOps = PartialUnrollingThreshold;
+ else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0)
+ MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize;
+ else
+ return;
+
+ // Scan the loop: don't unroll loops with calls.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+
+ for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
+ if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
+ ImmutableCallSite CS(J);
+ if (const Function *F = CS.getCalledFunction()) {
+ if (!TopTTI->isLoweredToCall(F))
+ continue;
+ }
+
+ return;
+ }
+ }
+
+ // Enable runtime and partial unrolling up to the specified size.
+ UP.Partial = UP.Runtime = true;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+}
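For illustration only, the threshold selection above reduces to a three-way choice: an explicit -partial-unrolling-threshold wins, otherwise the subtarget's loop micro-op buffer size is used, and a zero buffer size leaves the unrolling preferences untouched. A minimal standalone sketch (hypothetical helper name, not part of the vendored sources), assuming the override and buffer size are already in hand:

    #include <optional>

    // Hypothetical helper mirroring the heuristic in getUnrollingPreferences():
    // prefer a user-supplied threshold, fall back to the loop micro-op buffer
    // size, and report "no threshold" when neither is available.
    std::optional<unsigned> pickPartialUnrollThreshold(std::optional<unsigned> Override,
                                                       unsigned LoopMicroOpBufferSize) {
      if (Override)
        return *Override;               // user asked for a specific threshold
      if (LoopMicroOpBufferSize > 0)
        return LoopMicroOpBufferSize;   // e.g. 28 uops on Nehalem-class cores
      return std::nullopt;              // no loop buffer: leave preferences alone
    }

When a threshold is found and the loop contains no real calls, the pass enables both partial and runtime unrolling and uses that value for PartialThreshold and PartialOptSizeThreshold, as in the hunk above.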
//===----------------------------------------------------------------------===//
//
@@ -270,8 +330,28 @@ unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return OpCost;
}
+unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
+ assert(Ty->isVectorTy() && "Can only shuffle vectors");
+ unsigned Cost = 0;
+ // Shuffle cost is equal to the cost of extracting element from its argument
+ // plus the cost of inserting them onto the result vector.
+
+ // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from index
+ // 0 of first vector, index 1 of second vector,index 2 of first vector and
+ // finally index 3 of second vector and insert them at index <0,1,2,3> of
+ // result vector.
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+ return Cost;
+}
+
unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
+ if (Kind == SK_Alternate) {
+ return getAltShuffleOverhead(Tp);
+ }
return 1;
}
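As a hedged standalone sketch of the SK_Alternate cost model (hypothetical helper; it assumes a uniform per-lane cost instead of the real TopTTI->getVectorInstrCost queries):

    // Every result lane of an alternate shuffle is priced as one extract from
    // a source vector plus one insert into the result vector.
    unsigned altShuffleCostSketch(unsigned NumElts, unsigned ExtractCost,
                                  unsigned InsertCost) {
      unsigned Cost = 0;
      for (unsigned i = 0; i != NumElts; ++i)
        Cost += ExtractCost + InsertCost;
      return Cost;
    }

    // e.g. the <4 x float> example above with mask <0,5,2,7> and unit
    // extract/insert costs: altShuffleCostSketch(4, 1, 1) == 8.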
@@ -302,7 +382,8 @@ unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
return 0;
// If the cast is marked as legal (or promote) then assume low cost.
- if (TLI->isOperationLegalOrPromote(ISD, DstLT.second))
+ if (SrcLT.first == DstLT.first &&
+ TLI->isOperationLegalOrPromote(ISD, DstLT.second))
return 1;
// Handle scalar conversions.
@@ -409,7 +490,9 @@ unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
unsigned BasicTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) const {
- return 1;
+ std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Val->getScalarType());
+
+ return LT.first;
}
unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
@@ -418,8 +501,32 @@ unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
assert(!Src->isVoidTy() && "Invalid type");
std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Src);
- // Assume that all loads of legal types cost 1.
- return LT.first;
+ // Assuming that all loads of legal types cost 1.
+ unsigned Cost = LT.first;
+
+ if (Src->isVectorTy() &&
+ Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
+ // This is a vector load that legalizes to a larger type than the vector
+ // itself. Unless the corresponding extending load or truncating store is
+ // legal, then this will scalarize.
+ TargetLowering::LegalizeAction LA = TargetLowering::Expand;
+ EVT MemVT = getTLI()->getValueType(Src, true);
+ if (MemVT.isSimple() && MemVT != MVT::Other) {
+ if (Opcode == Instruction::Store)
+ LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT());
+ else
+ LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, MemVT.getSimpleVT());
+ }
+
+ if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
+ // This is a vector load/store for some illegal type that is scalarized.
+ // We must account for the cost of building or decomposing the vector.
+ Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
+ Opcode == Instruction::Store);
+ }
+ }
+
+ return Cost;
}
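A rough sketch of the cost shape this hunk produces, with hypothetical inputs standing in for getTypeLegalizationCost and the extending-load/truncating-store legality checks:

    // Base cost is one operation per legalized part (LT.first).  If the access
    // is a vector whose in-memory type is narrower than its legalized type and
    // no legal extending load / truncating store exists, the access scalarizes,
    // so add one insert (for a load) or extract (for a store) per element.
    unsigned memoryOpCostSketch(unsigned LegalParts, bool Scalarized,
                                unsigned NumElts, unsigned PerEltCost) {
      unsigned Cost = LegalParts;
      if (Scalarized)
        Cost += NumElts * PerEltCost;
      return Cost;
    }

    // e.g. a hypothetical <4 x i8> load with no legal extending load and unit
    // per-element cost: memoryOpCostSketch(1, true, 4, 1) == 5.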
unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
@@ -464,7 +571,7 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::round: ISD = ISD::FROUND; break;
case Intrinsic::pow: ISD = ISD::FPOW; break;
case Intrinsic::fma: ISD = ISD::FMA; break;
- case Intrinsic::fmuladd: ISD = ISD::FMA; break; // FIXME: mul + add?
+ case Intrinsic::fmuladd: ISD = ISD::FMA; break;
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
return 0;
@@ -489,6 +596,12 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
return LT.first * 2;
}
+ // If we can't lower fmuladd into an FMA estimate the cost as a floating
+ // point mul followed by an add.
+ if (IID == Intrinsic::fmuladd)
+ return TopTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
+ TopTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
// very expensive.
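Finally, a minimal sketch of the fmuladd fallback introduced above, assuming the multiply and add costs have been obtained separately (as the diff does via getArithmeticInstrCost):

    // Without a usable FMA lowering, llvm.fmuladd is priced as an explicit
    // floating-point multiply followed by an add.
    unsigned fmuladdFallbackCostSketch(unsigned FMulCost, unsigned FAddCost) {
      return FMulCost + FAddCost;   // e.g. 1 + 1 == 2 for a legal scalar type
    }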