author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000
commit | 54db30ce18663e6c2991958f3b5d18362e8e93c4 (patch) |
tree | 4aa6442802570767398cc83ba484e97b1309bdc2 | /contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
parent | 35284c22e9c8348159b7ce032ea45f2cdeb65298 (diff) |
parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff) |
Merge llvm trunk r366426, resolve conflicts, and update FREEBSD-Xlist.
Notes:
svn path=/projects/clang900-import/; revision=351344
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 529
1 file changed, 434 insertions(+), 95 deletions(-)
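The full diff follows. For orientation, here is a minimal, editorial sketch (not part of the commit) of how a client pass queries the hook that receives most of the changes below. `vectorCmpCost` is a hypothetical helper, but the `getCmpSelInstrCost` signature matches the LLVM 9-era TargetTransformInfo API, and passing the `Instruction*` is what lets the new predicate-dependent costs apply:

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical caller-side query: what does this vector compare cost on the
// current subtarget? X86TTIImpl::getCmpSelInstrCost (patched below) answers.
int vectorCmpCost(const TargetTransformInfo &TTI, CmpInst *Cmp) {
  // Passing the instruction lets the X86 hook inspect the predicate and add
  // the new per-predicate ExtraCost introduced by this change.
  return TTI.getCmpSelInstrCost(Cmp->getOpcode(),
                                Cmp->getOperand(0)->getType(),
                                /*CondTy=*/nullptr, Cmp);
}
```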
```diff
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 36929a4f5439..3dc59aeb263e 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1,9 +1,8 @@
 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
@@ -1651,17 +1650,77 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  static const CostTblEntry SSE2CostTbl[] = {
-    { ISD::SETCC,   MVT::v2i64,   8 },
-    { ISD::SETCC,   MVT::v4i32,   1 },
-    { ISD::SETCC,   MVT::v8i16,   1 },
-    { ISD::SETCC,   MVT::v16i8,   1 },
+  unsigned ExtraCost = 0;
+  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
+    // Some vector comparison predicates cost extra instructions.
+    if (MTy.isVector() &&
+        !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
+          (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
+          ST->hasBWI())) {
+      switch (cast<CmpInst>(I)->getPredicate()) {
+      case CmpInst::Predicate::ICMP_NE:
+        // xor(cmpeq(x,y),-1)
+        ExtraCost = 1;
+        break;
+      case CmpInst::Predicate::ICMP_SGE:
+      case CmpInst::Predicate::ICMP_SLE:
+        // xor(cmpgt(x,y),-1)
+        ExtraCost = 1;
+        break;
+      case CmpInst::Predicate::ICMP_ULT:
+      case CmpInst::Predicate::ICMP_UGT:
+        // cmpgt(xor(x,signbit),xor(y,signbit))
+        // xor(cmpeq(pmaxu(x,y),x),-1)
+        ExtraCost = 2;
+        break;
+      case CmpInst::Predicate::ICMP_ULE:
+      case CmpInst::Predicate::ICMP_UGE:
+        if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
+            (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
+          // cmpeq(psubus(x,y),0)
+          // cmpeq(pminu(x,y),x)
+          ExtraCost = 1;
+        } else {
+          // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
+          ExtraCost = 3;
+        }
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  static const CostTblEntry AVX512BWCostTbl[] = {
+    { ISD::SETCC,   MVT::v32i16,  1 },
+    { ISD::SETCC,   MVT::v64i8,   1 },
+
+    { ISD::SELECT,  MVT::v32i16,  1 },
+    { ISD::SELECT,  MVT::v64i8,   1 },
   };
 
-  static const CostTblEntry SSE42CostTbl[] = {
-    { ISD::SETCC,   MVT::v2f64,   1 },
-    { ISD::SETCC,   MVT::v4f32,   1 },
-    { ISD::SETCC,   MVT::v2i64,   1 },
+  static const CostTblEntry AVX512CostTbl[] = {
+    { ISD::SETCC,   MVT::v8i64,   1 },
+    { ISD::SETCC,   MVT::v16i32,  1 },
+    { ISD::SETCC,   MVT::v8f64,   1 },
+    { ISD::SETCC,   MVT::v16f32,  1 },
+
+    { ISD::SELECT,  MVT::v8i64,   1 },
+    { ISD::SELECT,  MVT::v16i32,  1 },
+    { ISD::SELECT,  MVT::v8f64,   1 },
+    { ISD::SELECT,  MVT::v16f32,  1 },
+  };
+
+  static const CostTblEntry AVX2CostTbl[] = {
+    { ISD::SETCC,   MVT::v4i64,   1 },
+    { ISD::SETCC,   MVT::v8i32,   1 },
+    { ISD::SETCC,   MVT::v16i16,  1 },
+    { ISD::SETCC,   MVT::v32i8,   1 },
+
+    { ISD::SELECT,  MVT::v4i64,   1 }, // pblendvb
+    { ISD::SELECT,  MVT::v8i32,   1 }, // pblendvb
+    { ISD::SELECT,  MVT::v16i16,  1 }, // pblendvb
+    { ISD::SELECT,  MVT::v32i8,   1 }, // pblendvb
   };
 
   static const CostTblEntry AVX1CostTbl[] = {
@@ -1672,50 +1731,83 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     { ISD::SETCC,   MVT::v8i32,   4 },
     { ISD::SETCC,   MVT::v16i16,  4 },
     { ISD::SETCC,   MVT::v32i8,   4 },
+
+    { ISD::SELECT,  MVT::v4f64,   1 }, // vblendvpd
+    { ISD::SELECT,  MVT::v8f32,   1 }, // vblendvps
+    { ISD::SELECT,  MVT::v4i64,   1 }, // vblendvpd
+    { ISD::SELECT,  MVT::v8i32,   1 }, // vblendvps
+    { ISD::SELECT,  MVT::v16i16,  3 }, // vandps + vandnps + vorps
+    { ISD::SELECT,  MVT::v32i8,   3 }, // vandps + vandnps + vorps
   };
 
-  static const CostTblEntry AVX2CostTbl[] = {
-    { ISD::SETCC,   MVT::v4i64,   1 },
-    { ISD::SETCC,   MVT::v8i32,   1 },
-    { ISD::SETCC,   MVT::v16i16,  1 },
-    { ISD::SETCC,   MVT::v32i8,   1 },
+  static const CostTblEntry SSE42CostTbl[] = {
+    { ISD::SETCC,   MVT::v2f64,   1 },
+    { ISD::SETCC,   MVT::v4f32,   1 },
+    { ISD::SETCC,   MVT::v2i64,   1 },
   };
 
-  static const CostTblEntry AVX512CostTbl[] = {
-    { ISD::SETCC,   MVT::v8i64,   1 },
-    { ISD::SETCC,   MVT::v16i32,  1 },
-    { ISD::SETCC,   MVT::v8f64,   1 },
-    { ISD::SETCC,   MVT::v16f32,  1 },
+  static const CostTblEntry SSE41CostTbl[] = {
+    { ISD::SELECT,  MVT::v2f64,   1 }, // blendvpd
+    { ISD::SELECT,  MVT::v4f32,   1 }, // blendvps
+    { ISD::SELECT,  MVT::v2i64,   1 }, // pblendvb
+    { ISD::SELECT,  MVT::v4i32,   1 }, // pblendvb
+    { ISD::SELECT,  MVT::v8i16,   1 }, // pblendvb
+    { ISD::SELECT,  MVT::v16i8,   1 }, // pblendvb
   };
 
-  static const CostTblEntry AVX512BWCostTbl[] = {
-    { ISD::SETCC,   MVT::v32i16,  1 },
-    { ISD::SETCC,   MVT::v64i8,   1 },
+  static const CostTblEntry SSE2CostTbl[] = {
+    { ISD::SETCC,   MVT::v2f64,   2 },
+    { ISD::SETCC,   MVT::f64,     1 },
+    { ISD::SETCC,   MVT::v2i64,   8 },
+    { ISD::SETCC,   MVT::v4i32,   1 },
+    { ISD::SETCC,   MVT::v8i16,   1 },
+    { ISD::SETCC,   MVT::v16i8,   1 },
+
+    { ISD::SELECT,  MVT::v2f64,   3 }, // andpd + andnpd + orpd
+    { ISD::SELECT,  MVT::v2i64,   3 }, // pand + pandn + por
+    { ISD::SELECT,  MVT::v4i32,   3 }, // pand + pandn + por
+    { ISD::SELECT,  MVT::v8i16,   3 }, // pand + pandn + por
+    { ISD::SELECT,  MVT::v16i8,   3 }, // pand + pandn + por
+  };
+
+  static const CostTblEntry SSE1CostTbl[] = {
+    { ISD::SETCC,   MVT::v4f32,   2 },
+    { ISD::SETCC,   MVT::f32,     1 },
+
+    { ISD::SELECT,  MVT::v4f32,   3 }, // andps + andnps + orps
   };
 
   if (ST->hasBWI())
     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+      return LT.first * (ExtraCost + Entry->Cost);
 
   if (ST->hasAVX512())
     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+      return LT.first * (ExtraCost + Entry->Cost);
 
   if (ST->hasAVX2())
     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+      return LT.first * (ExtraCost + Entry->Cost);
 
   if (ST->hasAVX())
     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+      return LT.first * (ExtraCost + Entry->Cost);
 
   if (ST->hasSSE42())
     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+      return LT.first * (ExtraCost + Entry->Cost);
+
+  if (ST->hasSSE41())
+    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+      return LT.first * (ExtraCost + Entry->Cost);
 
   if (ST->hasSSE2())
     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+      return LT.first * (ExtraCost + Entry->Cost);
+
+  if (ST->hasSSE1())
+    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+      return LT.first * (ExtraCost + Entry->Cost);
 
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
@@ -1784,6 +1876,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::USUBSAT,    MVT::v2i64,   2 }, // pmaxuq + psubq
     { ISD::USUBSAT,    MVT::v4i64,   2 }, // pmaxuq + psubq
     { ISD::USUBSAT,    MVT::v8i64,   2 }, // pmaxuq + psubq
+    { ISD::UADDSAT,    MVT::v16i32,  3 }, // not + pminud + paddd
+    { ISD::UADDSAT,    MVT::v2i64,   3 }, // not + pminuq + paddq
+    { ISD::UADDSAT,    MVT::v4i64,   3 }, // not + pminuq + paddq
+    { ISD::UADDSAT,    MVT::v8i64,   3 }, // not + pminuq + paddq
   };
   static const CostTblEntry XOPCostTbl[] = {
     { ISD::BITREVERSE, MVT::v4i64,   4 },
@@ -1825,6 +1921,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::SSUBSAT,    MVT::v32i8,   1 },
     { ISD::UADDSAT,    MVT::v16i16,  1 },
     { ISD::UADDSAT,    MVT::v32i8,   1 },
+    { ISD::UADDSAT,    MVT::v8i32,   3 }, // not + pminud + paddd
     { ISD::USUBSAT,    MVT::v16i16,  1 },
     { ISD::USUBSAT,    MVT::v32i8,   1 },
     { ISD::USUBSAT,    MVT::v8i32,   2 }, // pmaxud + psubd
@@ -1861,6 +1958,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::SSUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
     { ISD::UADDSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
     { ISD::UADDSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v8i32,   8 }, // 2 x 128-bit Op + extract/insert
     { ISD::USUBSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
     { ISD::USUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
     { ISD::USUBSAT,    MVT::v8i32,   6 }, // 2 x 128-bit Op + extract/insert
@@ -1885,6 +1983,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   };
   static const CostTblEntry SSE42CostTbl[] = {
     { ISD::USUBSAT,    MVT::v4i32,   2 }, // pmaxud + psubd
+    { ISD::UADDSAT,    MVT::v4i32,   3 }, // not + pminud + paddd
     { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
     { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
   };
@@ -1945,14 +2044,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
   };
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
-    { ISD::BITREVERSE, MVT::i64,    14 }
+    { ISD::BITREVERSE, MVT::i64,    14 },
+    { ISD::SADDO,      MVT::i64,     1 },
+    { ISD::UADDO,      MVT::i64,     1 },
   };
   static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
     { ISD::BITREVERSE, MVT::i32,    14 },
     { ISD::BITREVERSE, MVT::i16,    14 },
-    { ISD::BITREVERSE, MVT::i8,     11 }
+    { ISD::BITREVERSE, MVT::i8,     11 },
+    { ISD::SADDO,      MVT::i32,     1 },
+    { ISD::SADDO,      MVT::i16,     1 },
+    { ISD::SADDO,      MVT::i8,      1 },
+    { ISD::UADDO,      MVT::i32,     1 },
+    { ISD::UADDO,      MVT::i16,     1 },
+    { ISD::UADDO,      MVT::i8,      1 },
   };
 
+  Type *OpTy = RetTy;
   unsigned ISD = ISD::DELETED_NODE;
   switch (IID) {
   default:
@@ -1987,11 +2095,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   case Intrinsic::sqrt:
     ISD = ISD::FSQRT;
     break;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+    // SSUBO has same costs so don't duplicate.
+    ISD = ISD::SADDO;
+    OpTy = RetTy->getContainedType(0);
+    break;
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::usub_with_overflow:
+    // USUBO has same costs so don't duplicate.
+    ISD = ISD::UADDO;
+    OpTy = RetTy->getContainedType(0);
+    break;
   }
 
   if (ISD != ISD::DELETED_NODE) {
     // Legalize the type.
-    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
     MVT MTy = LT.second;
 
     // Attempt to lookup cost.
```
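An editorial aside between hunks: the lowering sequences named in the `ExtraCost` comments and the `UADDSAT` table entries above map directly onto short intrinsic sequences. A minimal sketch, assuming SSE2/SSE4.1 and hypothetical helper names:

```cpp
#include <smmintrin.h> // SSE4.1 (pminud); pulls in the SSE2 intrinsics too

// x != y on v16i8: xor(cmpeq(x,y),-1), the ExtraCost = 1 case.
static __m128i cmpne_epi8(__m128i x, __m128i y) {
  return _mm_xor_si128(_mm_cmpeq_epi8(x, y), _mm_set1_epi8(-1));
}

// x <=u y on v16i8: cmpeq(pminu(x,y),x), also ExtraCost = 1, because pminub
// already exists in SSE2 (the "scalar size < 32 bits" branch above).
static __m128i cmple_epu8(__m128i x, __m128i y) {
  return _mm_cmpeq_epi8(_mm_min_epu8(x, y), x);
}

// uadd.sat on v4i32: not + pminud + paddd, matching the cost-3 UADDSAT rows.
// min(y, ~x) clamps y to UINT32_MAX - x, so the add can no longer wrap.
static __m128i uaddsat_epu32(__m128i x, __m128i y) {
  __m128i not_x = _mm_xor_si128(x, _mm_set1_epi32(-1)); // not
  return _mm_add_epi32(x, _mm_min_epu32(y, not_x));     // pminud + paddd
}
```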
```diff
@@ -2226,6 +2346,9 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 
 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                       unsigned Alignment,
                                       unsigned AddressSpace) {
+  bool IsLoad = (Instruction::Load == Opcode);
+  bool IsStore = (Instruction::Store == Opcode);
+
   VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
   if (!SrcVTy)
     // To calculate scalar take the regular cost, without mask
@@ -2233,10 +2356,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
 
   unsigned NumElem = SrcVTy->getVectorNumElements();
   VectorType *MaskTy =
-    VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
-  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
-      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
-      !isPowerOf2_32(NumElem)) {
+      VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+  if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
+      (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
     // Scalarization
     int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
     int ScalarCompareCost = getCmpSelInstrCost(
@@ -2244,8 +2366,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
     int BranchCost = getCFInstrCost(Instruction::Br);
     int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
 
-    int ValueSplitCost = getScalarizationOverhead(
-        SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
+    int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
     int MemopCost =
         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                          Alignment, AddressSpace);
@@ -2259,8 +2380,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
     // Promotion requires expand/truncate for data and a shuffle for mask.
-    Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
-            getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
+    Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
+            getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
 
   else if (LT.second.getVectorNumElements() > NumElem) {
     VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -2268,11 +2389,13 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
     // Expanding requires fill mask with zeroes
     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
   }
+
+  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
   if (!ST->hasAVX512())
-    return Cost + LT.first*4; // Each maskmov costs 4
+    return Cost + LT.first * (IsLoad ? 2 : 8);
 
   // AVX-512 masked load/store is cheapper
-  return Cost+LT.first;
+  return Cost + LT.first;
 }
 
 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -2281,7 +2404,7 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
-  unsigned NumVectorInstToHideOverhead = 10;
+  const unsigned NumVectorInstToHideOverhead = 10;
 
   // Cost modeling of Strided Access Computation is hidden by the indexing
   // modes of X86 regardless of the stride value. We dont believe that there
@@ -2369,6 +2492,48 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
       return LT.first * Entry->Cost;
   }
 
+  static const CostTblEntry AVX2BoolReduction[] = {
+    { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
+    { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
+    { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
+    { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
+  };
+
+  static const CostTblEntry AVX1BoolReduction[] = {
+    { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
+    { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
+    { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
+    { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
+    { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
+    { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
+    { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
+    { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
+  };
+
+  static const CostTblEntry SSE2BoolReduction[] = {
+    { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
+    { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
+    { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
+    { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
+    { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
+    { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
+    { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
+    { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
+  };
+
+  // Handle bool allof/anyof patterns.
+  if (ValTy->getVectorElementType()->isIntegerTy(1)) {
+    if (ST->hasAVX2())
+      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
+        return LT.first * Entry->Cost;
+    if (ST->hasAVX())
+      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
+        return LT.first * Entry->Cost;
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
+        return LT.first * Entry->Cost;
+  }
+
   return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
 }
 
@@ -2390,15 +2555,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
   // and make it as the cost.
 
-  static const CostTblEntry SSE42CostTblPairWise[] = {
+  static const CostTblEntry SSE1CostTblPairWise[] = {
+      {ISD::FMINNUM, MVT::v4f32, 4},
+  };
+
+  static const CostTblEntry SSE2CostTblPairWise[] = {
       {ISD::FMINNUM, MVT::v2f64, 3},
+      {ISD::SMIN, MVT::v2i64, 6},
+      {ISD::UMIN, MVT::v2i64, 8},
+      {ISD::SMIN, MVT::v4i32, 6},
+      {ISD::UMIN, MVT::v4i32, 8},
+      {ISD::SMIN, MVT::v8i16, 4},
+      {ISD::UMIN, MVT::v8i16, 6},
+      {ISD::SMIN, MVT::v16i8, 8},
+      {ISD::UMIN, MVT::v16i8, 6},
+  };
+
+  static const CostTblEntry SSE41CostTblPairWise[] = {
       {ISD::FMINNUM, MVT::v4f32, 2},
-      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
-      {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
+      {ISD::SMIN, MVT::v2i64, 9},
+      {ISD::UMIN, MVT::v2i64,10},
       {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
       {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
       {ISD::SMIN, MVT::v8i16, 2},
       {ISD::UMIN, MVT::v8i16, 2},
+      {ISD::SMIN, MVT::v16i8, 3},
+      {ISD::UMIN, MVT::v16i8, 3},
+  };
+
+  static const CostTblEntry SSE42CostTblPairWise[] = {
+      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+      {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
   };
 
   static const CostTblEntry AVX1CostTblPairWise[] = {
@@ -2411,8 +2598,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
       {ISD::UMIN, MVT::v4i32, 1},
       {ISD::SMIN, MVT::v8i16, 1},
       {ISD::UMIN, MVT::v8i16, 1},
+      {ISD::SMIN, MVT::v16i8, 2},
+      {ISD::UMIN, MVT::v16i8, 2},
+      {ISD::SMIN, MVT::v4i64, 7},
+      {ISD::UMIN, MVT::v4i64, 7},
       {ISD::SMIN, MVT::v8i32, 3},
       {ISD::UMIN, MVT::v8i32, 3},
+      {ISD::SMIN, MVT::v16i16, 3},
+      {ISD::UMIN, MVT::v16i16, 3},
+      {ISD::SMIN, MVT::v32i8, 3},
+      {ISD::UMIN, MVT::v32i8, 3},
   };
 
   static const CostTblEntry AVX2CostTblPairWise[] = {
@@ -2435,15 +2630,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
       {ISD::UMIN, MVT::v16i32, 1},
   };
 
-  static const CostTblEntry SSE42CostTblNoPairWise[] = {
+  static const CostTblEntry SSE1CostTblNoPairWise[] = {
+      {ISD::FMINNUM, MVT::v4f32, 4},
+  };
+
+  static const CostTblEntry SSE2CostTblNoPairWise[] = {
       {ISD::FMINNUM, MVT::v2f64, 3},
+      {ISD::SMIN, MVT::v2i64, 6},
+      {ISD::UMIN, MVT::v2i64, 8},
+      {ISD::SMIN, MVT::v4i32, 6},
+      {ISD::UMIN, MVT::v4i32, 8},
+      {ISD::SMIN, MVT::v8i16, 4},
+      {ISD::UMIN, MVT::v8i16, 6},
+      {ISD::SMIN, MVT::v16i8, 8},
+      {ISD::UMIN, MVT::v16i8, 6},
+  };
+
+  static const CostTblEntry SSE41CostTblNoPairWise[] = {
       {ISD::FMINNUM, MVT::v4f32, 3},
-      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
-      {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
+      {ISD::SMIN, MVT::v2i64, 9},
+      {ISD::UMIN, MVT::v2i64,11},
       {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
       {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
       {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
       {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
+      {ISD::SMIN, MVT::v16i8, 3},
+      {ISD::UMIN, MVT::v16i8, 3},
+  };
+
+  static const CostTblEntry SSE42CostTblNoPairWise[] = {
+      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+      {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
   };
 
   static const CostTblEntry AVX1CostTblNoPairWise[] = {
@@ -2456,8 +2673,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
       {ISD::UMIN, MVT::v4i32, 1},
       {ISD::SMIN, MVT::v8i16, 1},
      {ISD::UMIN, MVT::v8i16, 1},
+      {ISD::SMIN, MVT::v16i8, 2},
+      {ISD::UMIN, MVT::v16i8, 2},
+      {ISD::SMIN, MVT::v4i64, 7},
+      {ISD::UMIN, MVT::v4i64, 7},
       {ISD::SMIN, MVT::v8i32, 2},
       {ISD::UMIN, MVT::v8i32, 2},
+      {ISD::SMIN, MVT::v16i16, 2},
+      {ISD::UMIN, MVT::v16i16, 2},
+      {ISD::SMIN, MVT::v32i8, 2},
+      {ISD::UMIN, MVT::v32i8, 2},
   };
 
   static const CostTblEntry AVX2CostTblNoPairWise[] = {
@@ -2496,6 +2721,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
     if (ST->hasSSE42())
       if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
         return LT.first * Entry->Cost;
+
+    if (ST->hasSSE41())
+      if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->hasSSE1())
+      if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
   } else {
     if (ST->hasAVX512())
       if (const auto *Entry =
@@ -2513,6 +2750,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
     if (ST->hasSSE42())
       if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
+
+    if (ST->hasSSE41())
+      if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->hasSSE1())
+      if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
   }
 
   return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
@@ -2864,26 +3113,106 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
 }
 
 bool X86TTIImpl::canMacroFuseCmp() {
-  return ST->hasMacroFusion();
+  return ST->hasMacroFusion() || ST->hasBranchFusion();
 }
 
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+  if (!ST->hasAVX())
+    return false;
+
   // The backend can't handle a single element vector.
   if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
     return false;
+
   Type *ScalarTy = DataTy->getScalarType();
-  int DataWidth = isa<PointerType>(ScalarTy) ?
-    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
 
-  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
-         ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
+  if (ScalarTy->isPointerTy())
+    return true;
+
+  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+    return true;
+
+  if (!ScalarTy->isIntegerTy())
+    return false;
+
+  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+  return IntWidth == 32 || IntWidth == 64 ||
+         ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
 }
 
 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
   return isLegalMaskedLoad(DataType);
 }
 
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+  unsigned DataSize = DL.getTypeStoreSize(DataType);
+  // The only supported nontemporal loads are for aligned vectors of 16 or 32
+  // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
+  // (the equivalent stores only require AVX).
+  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+    return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+  return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+  unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+  // SSE4A supports nontemporal stores of float and double at arbitrary
+  // alignment.
+  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+    return true;
+
+  // Besides the SSE4A subtarget exception above, only aligned stores are
+  // available nontemporaly on any other subtarget.  And only stores with a
+  // size of 4..32 bytes (powers of 2, only) are permitted.
+  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+      !isPowerOf2_32(DataSize))
+    return false;
+
+  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+  // loads require AVX2).
+  if (DataSize == 32)
+    return ST->hasAVX();
+  else if (DataSize == 16)
+    return ST->hasSSE1();
+  return true;
+}
+
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+  if (!isa<VectorType>(DataTy))
+    return false;
+
+  if (!ST->hasAVX512())
+    return false;
+
+  // The backend can't handle a single element vector.
+  if (DataTy->getVectorNumElements() == 1)
+    return false;
+
+  Type *ScalarTy = DataTy->getVectorElementType();
+
+  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+    return true;
+
+  if (!ScalarTy->isIntegerTy())
+    return false;
+
+  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+  return IntWidth == 32 || IntWidth == 64 ||
+         ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+  return isLegalMaskedExpandLoad(DataTy);
+}
+
 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+  // Some CPUs have better gather performance than others.
+  // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
+  // enable gather with a -march.
+  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
+    return false;
+
   // This function is called now in two cases: from the Loop Vectorizer
   // and from the Scalarizer.
   // When the Loop Vectorizer asks about legality of the feature,
@@ -2902,14 +3231,17 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
       return false;
   }
   Type *ScalarTy = DataTy->getScalarType();
-  int DataWidth = isa<PointerType>(ScalarTy) ?
-    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+  if (ScalarTy->isPointerTy())
+    return true;
 
-  // Some CPUs have better gather performance than others.
-  // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
-  // enable gather with a -march.
-  return (DataWidth == 32 || DataWidth == 64) &&
-         (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
+  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+    return true;
+
+  if (!ScalarTy->isIntegerTy())
+    return false;
+
+  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+  return IntWidth == 32 || IntWidth == 64;
 }
 
 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
@@ -2938,44 +3270,51 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
   const FeatureBitset &CalleeBits =
       TM.getSubtargetImpl(*Callee)->getFeatureBits();
 
-  // FIXME: This is likely too limiting as it will include subtarget features
-  // that we might not care about for inlining, but it is conservatively
-  // correct.
-  return (CallerBits & CalleeBits) == CalleeBits;
+  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
+  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
+  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
 }
 
-const X86TTIImpl::TTI::MemCmpExpansionOptions *
-X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
-  // Only enable vector loads for equality comparison.
-  // Right now the vector version is not as fast, see #33329.
-  static const auto ThreeWayOptions = [this]() {
-    TTI::MemCmpExpansionOptions Options;
-    if (ST->is64Bit()) {
-      Options.LoadSizes.push_back(8);
-    }
-    Options.LoadSizes.push_back(4);
-    Options.LoadSizes.push_back(2);
-    Options.LoadSizes.push_back(1);
-    return Options;
-  }();
-  static const auto EqZeroOptions = [this]() {
-    TTI::MemCmpExpansionOptions Options;
+bool X86TTIImpl::areFunctionArgsABICompatible(
+    const Function *Caller, const Function *Callee,
+    SmallPtrSetImpl<Argument *> &Args) const {
+  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
+    return false;
+
+  // If we get here, we know the target features match. If one function
+  // considers 512-bit vectors legal and the other does not, consider them
+  // incompatible.
+  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
+  const TargetMachine &TM = getTLI()->getTargetMachine();
+
+  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+         TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+}
+
+X86TTIImpl::TTI::MemCmpExpansionOptions
+X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  Options.NumLoadsPerBlock = 2;
+  if (IsZeroCmp) {
+    // Only enable vector loads for equality comparison. Right now the vector
+    // version is not as fast for three way compare (see #33329).
     // TODO: enable AVX512 when the DAG is ready.
     // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
-    if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
-    if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
-    if (ST->is64Bit()) {
-      Options.LoadSizes.push_back(8);
-    }
-    Options.LoadSizes.push_back(4);
-    Options.LoadSizes.push_back(2);
-    Options.LoadSizes.push_back(1);
+    const unsigned PreferredWidth = ST->getPreferVectorWidth();
+    if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
+    if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
     // All GPR and vector loads can be unaligned. SIMD compare requires integer
     // vectors (SSE2/AVX2).
     Options.AllowOverlappingLoads = true;
-    return Options;
-  }();
-  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
+  }
+  if (ST->is64Bit()) {
+    Options.LoadSizes.push_back(8);
+  }
+  Options.LoadSizes.push_back(4);
+  Options.LoadSizes.push_back(2);
+  Options.LoadSizes.push_back(1);
+  return Options;
 }
 
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
```
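For reference on the new bool-reduction tables (again an editorial sketch, not from the patch): an all-of or any-of reduction of a full-width compare mask is one movmsk plus one scalar compare, which is where the cost of 2 comes from. Illustrated with SSE2 intrinsics and hypothetical helper names:

```cpp
#include <emmintrin.h> // SSE2

// allof on a v4i32 compare mask: movmskps + cmp (cost 2 in the new table).
static bool allof_v4i32(__m128i mask) {
  return _mm_movemask_ps(_mm_castsi128_ps(mask)) == 0xF;
}

// anyof: the same two instructions, compared against zero instead.
static bool anyof_v4i32(__m128i mask) {
  return _mm_movemask_ps(_mm_castsi128_ps(mask)) != 0;
}
```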
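Similarly, the reworked `enableMemCmpExpansion` steers equality-only (`IsZeroCmp`) memcmp expansion toward vector loads. One 16-byte load-size block of such an expansion looks roughly like the following; `equal16` is a hypothetical name, and the real expansion is emitted by the ExpandMemCmp machinery rather than written by hand:

```cpp
#include <emmintrin.h> // SSE2

// One 16-byte block of an equality-only memcmp expansion: unaligned loads
// are allowed (AllowOverlappingLoads), then pcmpeqb + pmovmskb + scalar cmp.
static bool equal16(const void *a, const void *b) {
  __m128i va = _mm_loadu_si128(static_cast<const __m128i *>(a));
  __m128i vb = _mm_loadu_si128(static_cast<const __m128i *>(b));
  return _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)) == 0xFFFF;
}
```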