path: root/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
author    Dimitry Andric <dim@FreeBSD.org>  2019-01-20 11:41:25 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2019-01-20 11:41:25 +0000
commit    d9484dd61cc151c4f34c31e07f693fefa66316b5 (patch)
tree      ab0560b3da293f1fafd3269c59692e929418f5c2 /contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
parent    79e0962d4c3cf1f0acf359a9d69cb3ac68c414c4 (diff)
parent    d8e91e46262bc44006913e6796843909f1ac7bcd (diff)
Merge llvm trunk r351319, resolve conflicts, and update FREEBSD-Xlist.
Notes:
    svn path=/projects/clang800-import/; revision=343210
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 876
1 file changed, 589 insertions(+), 287 deletions(-)
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 865462622627..36929a4f5439 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -290,11 +290,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
-
- { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
- { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
- { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
- { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -308,11 +303,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v2i64, 1 },
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
-
- { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
- { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
- { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
- { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -328,15 +318,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
-
- { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
- { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
- { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
- { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
- { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
- { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
- { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
- { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -354,7 +335,81 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
{ ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+ };
+
+ // XOP has faster vXi8 shifts.
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2() && !ST->hasXOP()) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512BWConstCostTable[] = {
+ { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
+ };
+
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ ST->hasBWI()) {
+ if (const auto *Entry =
+ CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512ConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
+ { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+ };
+
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ ST->hasAVX512()) {
+ if (const auto *Entry =
+ CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ static const CostTblEntry AVX2ConstCostTable[] = {
+ { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
+ };
+
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2ConstCostTable[] = {
+ { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
+ { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+ { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
+ { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+ { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
{ ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
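
The "pmulhw/pmulhuw sequence" comments above refer to the classic lowering of division by a constant into a high multiply plus fixup. A minimal scalar sketch of the unsigned 16-bit shape (divisor 7, magic constant per the Hacker's Delight recipe; the vector form maps each step onto pmulhuw/psubw/paddw/psrlw, roughly matching the cost of 6):

    #include <cassert>
    #include <cstdint>

    static uint16_t udiv7(uint16_t n) {
      // 9363 == ceil(2^19 / 7) - 2^16; high multiply plus fixup.
      uint16_t t = (uint16_t)(((uint32_t)n * 9363u) >> 16); // pmulhuw
      t += (uint16_t)((n - t) >> 1);                        // psubw, psrlw, paddw
      return (uint16_t)(t >> 2);                            // psrlw
    }

    int main() {
      for (uint32_t n = 0; n <= 0xFFFF; ++n)
        assert(udiv7((uint16_t)n) == n / 7);
    }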
@@ -373,7 +428,8 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
};
- if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
ST->hasSSE2()) {
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
@@ -385,12 +441,8 @@ int X86TTIImpl::getArithmeticInstrCost(
if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 20;
- // XOP has faster vXi8 shifts.
- if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
- !ST->hasXOP())
- if (const auto *Entry =
- CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
- return LT.first * Entry->Cost;
+ if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
static const CostTblEntry AVX2UniformCostTable[] = {
@@ -560,9 +612,18 @@ int X86TTIImpl::getArithmeticInstrCost(
};
// Look for XOP lowering tricks.
- if (ST->hasXOP())
- if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
+ if (ST->hasXOP()) {
+ // If the right shift is constant then we'll fold the negation so
+ // it's as cheap as a left shift.
+ int ShiftISD = ISD;
+ if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ ShiftISD = ISD::SHL;
+ if (const auto *Entry =
+ CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
return LT.first * Entry->Cost;
+ }
static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
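
A hedged illustration of the ShiftISD remap above: XOP's vector shifts (vpsha*/vpshl*) take a signed per-element count, shifting left for positive counts and right for negative ones, so SRL/SRA need a negated count that folds away when the count is constant. Scalar model of one lane, ignoring out-of-range counts and assuming arithmetic right shift:

    #include <cassert>
    #include <cstdint>

    // Simplified per-lane semantics of XOP's VPSHAW (arithmetic variant).
    static int16_t vpshaw_lane(int16_t x, int8_t count) {
      return count >= 0 ? (int16_t)(x << count) : (int16_t)(x >> -count);
    }

    int main() {
      // x >> 3 (ISD::SRA) becomes vpshaw_lane(x, -3); negating the constant
      // 3 costs nothing at run time, so SRA-by-constant is costed as SHL.
      assert(vpshaw_lane(-64, -3) == -8);
    }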
@@ -771,6 +832,12 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+
+ { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
+ { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
+ { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
};
if (ST->hasSSE2())
@@ -780,6 +847,20 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE1CostTable[] = {
{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
+
+ { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
+
+ { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+
+ { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
};
if (ST->hasSSE1())
@@ -810,12 +891,30 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ // Treat Transpose as 2-op shuffles - there's no difference in lowering.
+ if (Kind == TTI::SK_Transpose)
+ Kind = TTI::SK_PermuteTwoSrc;
+
// For Broadcasts we are splatting the first element from the first input
// register, so only need to reference that input and all the output
// registers are the same.
if (Kind == TTI::SK_Broadcast)
LT.first = 1;
+ // Subvector extractions are free if they start at the beginning of a
+ // vector and cheap if the subvectors are aligned.
+ if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
+ int NumElts = LT.second.getVectorNumElements();
+ if ((Index % NumElts) == 0)
+ return 0;
+ std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
+ if (SubLT.second.isVector()) {
+ int NumSubElts = SubLT.second.getVectorNumElements();
+ if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
+ return SubLT.first;
+ }
+ }
+
// We are going to permute multiple sources and the result will be in multiple
// destinations. Providing an accurate cost only for splits where the element
// type remains the same.
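
A small model of the extract-subvector rule just added, with hypothetical names standing in for the legalized widths (NumElts from LT.second, NumSubElts and SubSplit from SubLT); this is not the real TTI interface:

    #include <cstdio>

    static int extractSubvectorCost(int Index, int NumElts, int NumSubElts,
                                    int SubSplit) {
      if (Index % NumElts == 0)
        return 0;          // starts at the beginning of a legalized register
      if (Index % NumSubElts == 0 && NumElts % NumSubElts == 0)
        return SubSplit;   // lane-aligned, e.g. a single vextractf128
      return -1;           // fall through to the generic shuffle costing
    }

    int main() {
      printf("%d\n", extractSubvectorCost(4, 16, 4, 1)); // v4f32 of v16f32: 1
      printf("%d\n", extractSubvectorCost(0, 16, 4, 1)); // leading subvector: 0
    }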
@@ -853,15 +952,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
- { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb
+ {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
+ {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
- { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
- { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
- { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
- { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
- { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
};
if (ST->hasVBMI())
@@ -870,25 +969,25 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
+ {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
- { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
- { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
+ {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
+ {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
+ {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
- { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
- { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
- { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
- { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
- { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
- { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
- { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
- { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
- { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
- { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
- { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
};
if (ST->hasBWI())
@@ -897,42 +996,42 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry AVX512ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
- { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
- { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
- { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
-
- { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
- { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
- { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
- { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
-
- { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
- { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
- { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
- { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
- { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
- { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
- { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
- { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
- { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
- { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
- { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
-
- { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
- { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
- { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
- { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
- { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
- { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
- { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
- { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
- { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
- { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
- { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
- { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
+ {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
+ {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
+ {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
+
+ {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
+ {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
+ {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
+ {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
+
+ {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
+
+ {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
};
if (ST->hasAVX512())
@@ -940,40 +1039,40 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry AVX2ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
- { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
- { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
- { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
- { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
-
- { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
- { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
- { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
- { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
- { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
-
- { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
- { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb
-
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
- { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
- { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
- { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
+ {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
+ {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
+ {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
+ {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
+ {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
+ {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
+ {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
+ {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
+
+ {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
+ {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
- { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
- { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
- { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
- { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
- { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
- { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
- // + vpblendvb
- { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb
- // + vpblendvb
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
};
if (ST->hasAVX2())
@@ -981,21 +1080,21 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry XOPShuffleTbl[] = {
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd
- { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd
- { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps
- { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
- // + vinsertf128
- { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm
- // + vinsertf128
-
- { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm
- // + vinsertf128
- { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm
- { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm
- // + vinsertf128
- { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
+ // + vinsertf128
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
+ // + vinsertf128
+
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
+ // + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
+ // + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
};
if (ST->hasXOP())
@@ -1003,46 +1102,46 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry AVX1ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
- { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
- { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
- { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
- { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
- { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
-
- { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
- { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
- { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
- { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
- { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
- // + vinsertf128
- { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
- // + vinsertf128
-
- { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd
- { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd
- { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps
- { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps
- { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
- { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor
-
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
- { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
- { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
- { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
+ {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
+
+ {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
+ // + vinsertf128
+
+ {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
+ {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
+ {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
+ {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
+ {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
+ {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
- { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
- { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
- { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
- { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
- { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
- { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
- // + 4*por + vinsertf128
- { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb
- // + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
};
if (ST->hasAVX())
@@ -1050,12 +1149,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry SSE41ShuffleTbl[] = {
- { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
- { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
- { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
- { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
- { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb
+ {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
+ {TTI::SK_Select, MVT::v2f64, 1}, // movsd
+ {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
+ {TTI::SK_Select, MVT::v4f32, 1}, // blendps
+ {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
+ {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
};
if (ST->hasSSE41())
@@ -1063,20 +1162,20 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry SSSE3ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
- { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
+ {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
- { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
- { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
+ {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
- { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
- { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por
+ {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
- { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
- { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
- { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
- { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
};
if (ST->hasSSSE3())
@@ -1084,29 +1183,29 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry SSE2ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
- { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
- { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
- { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
- { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
-
- { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
- { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
- { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
- { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
- { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
- // + 2*pshufd + 2*unpck + packus
-
- { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
- { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
- { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por
-
- { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
- { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
- { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
- { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
+ {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
+ {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
+ {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
+ {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
+
+ {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
+ {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
+ {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
+ {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + packus
+
+ {TTI::SK_Select, MVT::v2i64, 1}, // movsd
+ {TTI::SK_Select, MVT::v2f64, 1}, // movsd
+ {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
+ {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
+ {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
+
+ {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
+ {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
+ {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
+ {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
// + pshufd/unpck
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + 2*packus
@@ -1145,6 +1244,27 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// FIXME: Need a better design of the cost table to handle non-simple types of
// potential massive combinations (elem_num x src_type x dst_type).
+ static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
+
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
+
+ // Mask zero extend is a load + broadcast.
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
+ };
+
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
@@ -1208,8 +1328,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
@@ -1231,12 +1349,16 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
+
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 },
{ ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 },
@@ -1328,13 +1450,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
// The generic code to compute the scalar overhead is currently broken.
// Workaround this limitation by estimating the scalarization overhead
// here. We have roughly 10 instructions per scalar element.
// Multiply that by the vector width.
// FIXME: remove that when PR19268 is fixed.
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
@@ -1387,6 +1509,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
};
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
@@ -1408,11 +1531,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
+
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
@@ -1465,43 +1590,51 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
- if (ST->hasDQI())
- if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
- DstTy.getSimpleVT(),
- SrcTy.getSimpleVT()))
- return Entry->Cost;
+ MVT SimpleSrcTy = SrcTy.getSimpleVT();
+ MVT SimpleDstTy = DstTy.getSimpleVT();
- if (ST->hasAVX512())
- if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
- DstTy.getSimpleVT(),
- SrcTy.getSimpleVT()))
- return Entry->Cost;
+ // Make sure that neither type is going to be split before using the
+ // AVX512 tables. This handles -mprefer-vector-width=256
+ // with -min-legal-vector-width<=256
+ if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
+ TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return Entry->Cost;
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return Entry->Cost;
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return Entry->Cost;
+ }
if (ST->hasAVX2()) {
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
- DstTy.getSimpleVT(),
- SrcTy.getSimpleVT()))
+ SimpleDstTy, SimpleSrcTy))
return Entry->Cost;
}
if (ST->hasAVX()) {
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
- DstTy.getSimpleVT(),
- SrcTy.getSimpleVT()))
+ SimpleDstTy, SimpleSrcTy))
return Entry->Cost;
}
if (ST->hasSSE41()) {
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
- DstTy.getSimpleVT(),
- SrcTy.getSimpleVT()))
+ SimpleDstTy, SimpleSrcTy))
return Entry->Cost;
}
if (ST->hasSSE2()) {
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
- DstTy.getSimpleVT(),
- SrcTy.getSimpleVT()))
+ SimpleDstTy, SimpleSrcTy))
return Entry->Cost;
}
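
One way to see why the TypeSplitVector guard matters, as a hedged example: built with something like clang -O2 -march=skylake-avx512 -mprefer-vector-width=256, the loop below makes the vectorizer ask about sign_extend v32i8 -> v32i16, but v32i16 will be split into two 256-bit halves, so the unsplit AVX512BW entry would understate the cost; the check falls through to the narrower tables instead.

    #include <cstdint>

    void ext32(const int8_t *in, int16_t *out) {
      for (int i = 0; i < 32; ++i)
        out[i] = in[i]; // cost query: sext <32 x i8> to <32 x i16>
    }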
@@ -1629,6 +1762,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTTZ, MVT::v16i32, 14 },
{ ISD::CTTZ, MVT::v32i16, 12 },
{ ISD::CTTZ, MVT::v64i8, 9 },
+ { ISD::SADDSAT, MVT::v32i16, 1 },
+ { ISD::SADDSAT, MVT::v64i8, 1 },
+ { ISD::SSUBSAT, MVT::v32i16, 1 },
+ { ISD::SSUBSAT, MVT::v64i8, 1 },
+ { ISD::UADDSAT, MVT::v32i16, 1 },
+ { ISD::UADDSAT, MVT::v64i8, 1 },
+ { ISD::USUBSAT, MVT::v32i16, 1 },
+ { ISD::USUBSAT, MVT::v64i8, 1 },
};
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },
@@ -1639,6 +1780,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTPOP, MVT::v16i32, 24 },
{ ISD::CTTZ, MVT::v8i64, 20 },
{ ISD::CTTZ, MVT::v16i32, 28 },
+ { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
+ { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
+ { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
+ { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -1674,6 +1819,15 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTTZ, MVT::v8i32, 14 },
{ ISD::CTTZ, MVT::v16i16, 12 },
{ ISD::CTTZ, MVT::v32i8, 9 },
+ { ISD::SADDSAT, MVT::v16i16, 1 },
+ { ISD::SADDSAT, MVT::v32i8, 1 },
+ { ISD::SSUBSAT, MVT::v16i16, 1 },
+ { ISD::SSUBSAT, MVT::v32i8, 1 },
+ { ISD::UADDSAT, MVT::v16i16, 1 },
+ { ISD::UADDSAT, MVT::v32i8, 1 },
+ { ISD::USUBSAT, MVT::v16i16, 1 },
+ { ISD::USUBSAT, MVT::v32i8, 1 },
+ { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -1701,6 +1855,15 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
{ ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
{ ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -1721,6 +1884,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
};
static const CostTblEntry SSE42CostTbl[] = {
+ { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
@@ -1765,6 +1929,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTTZ, MVT::v4i32, 18 },
{ ISD::CTTZ, MVT::v8i16, 16 },
{ ISD::CTTZ, MVT::v16i8, 13 },
+ { ISD::SADDSAT, MVT::v8i16, 1 },
+ { ISD::SADDSAT, MVT::v16i8, 1 },
+ { ISD::SSUBSAT, MVT::v8i16, 1 },
+ { ISD::SSUBSAT, MVT::v16i8, 1 },
+ { ISD::UADDSAT, MVT::v8i16, 1 },
+ { ISD::UADDSAT, MVT::v16i8, 1 },
+ { ISD::USUBSAT, MVT::v8i16, 1 },
+ { ISD::USUBSAT, MVT::v16i8, 1 },
{ ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
@@ -1800,76 +1972,180 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::cttz:
ISD = ISD::CTTZ;
break;
+ case Intrinsic::sadd_sat:
+ ISD = ISD::SADDSAT;
+ break;
+ case Intrinsic::ssub_sat:
+ ISD = ISD::SSUBSAT;
+ break;
+ case Intrinsic::uadd_sat:
+ ISD = ISD::UADDSAT;
+ break;
+ case Intrinsic::usub_sat:
+ ISD = ISD::USUBSAT;
+ break;
case Intrinsic::sqrt:
ISD = ISD::FSQRT;
break;
}
- // Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
- MVT MTy = LT.second;
+ if (ISD != ISD::DELETED_NODE) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ MVT MTy = LT.second;
- // Attempt to lookup cost.
- if (ST->isGLM())
- if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ // Attempt to lookup cost.
+ if (ST->isGLM())
+ if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasCDI())
- if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasCDI())
+ if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasBWI())
- if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasAVX512())
- if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasXOP())
- if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSSE3())
- if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->is64Bit())
- if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+ ArrayRef<Value *> Args, FastMathFlags FMF,
+ unsigned VF) {
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::ROTL, MVT::v8i64, 1 },
+ { ISD::ROTL, MVT::v4i64, 1 },
+ { ISD::ROTL, MVT::v2i64, 1 },
+ { ISD::ROTL, MVT::v16i32, 1 },
+ { ISD::ROTL, MVT::v8i32, 1 },
+ { ISD::ROTL, MVT::v4i32, 1 },
+ { ISD::ROTR, MVT::v8i64, 1 },
+ { ISD::ROTR, MVT::v4i64, 1 },
+ { ISD::ROTR, MVT::v2i64, 1 },
+ { ISD::ROTR, MVT::v16i32, 1 },
+ { ISD::ROTR, MVT::v8i32, 1 },
+ { ISD::ROTR, MVT::v4i32, 1 }
+ };
+ // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
+ static const CostTblEntry XOPCostTbl[] = {
+ { ISD::ROTL, MVT::v4i64, 4 },
+ { ISD::ROTL, MVT::v8i32, 4 },
+ { ISD::ROTL, MVT::v16i16, 4 },
+ { ISD::ROTL, MVT::v32i8, 4 },
+ { ISD::ROTL, MVT::v2i64, 1 },
+ { ISD::ROTL, MVT::v4i32, 1 },
+ { ISD::ROTL, MVT::v8i16, 1 },
+ { ISD::ROTL, MVT::v16i8, 1 },
+ { ISD::ROTR, MVT::v4i64, 6 },
+ { ISD::ROTR, MVT::v8i32, 6 },
+ { ISD::ROTR, MVT::v16i16, 6 },
+ { ISD::ROTR, MVT::v32i8, 6 },
+ { ISD::ROTR, MVT::v2i64, 2 },
+ { ISD::ROTR, MVT::v4i32, 2 },
+ { ISD::ROTR, MVT::v8i16, 2 },
+ { ISD::ROTR, MVT::v16i8, 2 }
+ };
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::ROTL, MVT::i64, 1 },
+ { ISD::ROTR, MVT::i64, 1 },
+ { ISD::FSHL, MVT::i64, 4 }
+ };
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::ROTL, MVT::i32, 1 },
+ { ISD::ROTL, MVT::i16, 1 },
+ { ISD::ROTL, MVT::i8, 1 },
+ { ISD::ROTR, MVT::i32, 1 },
+ { ISD::ROTR, MVT::i16, 1 },
+ { ISD::ROTR, MVT::i8, 1 },
+ { ISD::FSHL, MVT::i32, 4 },
+ { ISD::FSHL, MVT::i16, 4 },
+ { ISD::FSHL, MVT::i8, 4 }
+ };
+
+ unsigned ISD = ISD::DELETED_NODE;
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::fshl:
+ ISD = ISD::FSHL;
+ if (Args[0] == Args[1])
+ ISD = ISD::ROTL;
+ break;
+ case Intrinsic::fshr:
+ // FSHR has same costs so don't duplicate.
+ ISD = ISD::FSHL;
+ if (Args[0] == Args[1])
+ ISD = ISD::ROTR;
+ break;
+ }
+
+ if (ISD != ISD::DELETED_NODE) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ MVT MTy = LT.second;
+
+ // Attempt to lookup cost.
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
}
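
The Args[0] == Args[1] checks encode that funnel-shifting a value with itself is a rotate: @llvm.fshl(x, x, c) is rotl(x, c), which x86 does in a single rol/ror (cost 1), while a genuine two-input funnel shift is modelled at 4. A scalar sketch of the i32 semantics:

    #include <cassert>
    #include <cstdint>

    static uint32_t fshl32(uint32_t hi, uint32_t lo, uint32_t sh) {
      sh &= 31; // fshl: concat(hi, lo) shifted left by sh, top word returned
      return sh ? (hi << sh) | (lo >> (32 - sh)) : hi;
    }

    int main() {
      uint32_t x = 0x80000001u;
      assert(fshl32(x, x, 1) == 0x00000003u); // fshl(x, x, c) == rotl(x, c)
    }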
@@ -2341,11 +2617,15 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
return TTI::TCC_Free;
ImmIdx = 1;
break;
- case Instruction::Mul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
+ // Division by constant is typically expanded later into a different
+ // instruction sequence. This completely changes the constants.
+ // Report them as "free" to stop ConstantHoist from marking them as opaque.
+ return TTI::TCC_Free;
+ case Instruction::Mul:
case Instruction::Or:
case Instruction::Xor:
ImmIdx = 1;
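
A hedged example of why divisor constants are reported TCC_Free: once the DAG expands division by a constant, the literal divisor no longer appears in the code at all, only a magic multiplier, so hoisting it into a register would just defeat the expansion. Roughly what x / 3 becomes for signed 32-bit:

    #include <cassert>
    #include <cstdint>

    static int32_t sdiv3(int32_t n) {
      // 0x55555556 == (2^32 + 2) / 3; note no literal 3 survives.
      int32_t q = (int32_t)(((int64_t)n * 0x55555556LL) >> 32);
      return q + (int32_t)((uint32_t)n >> 31); // round toward zero for n < 0
    }

    int main() {
      assert(sdiv3(7) == 2 && sdiv3(-7) == -2 && sdiv3(9) == 3);
    }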
@@ -2690,6 +2970,9 @@ X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
Options.LoadSizes.push_back(4);
Options.LoadSizes.push_back(2);
Options.LoadSizes.push_back(1);
+ // All GPR and vector loads can be unaligned. SIMD compare requires integer
+ // vectors (SSE2/AVX2).
+ Options.AllowOverlappingLoads = true;
return Options;
}();
return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
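
A hedged sketch of what AllowOverlappingLoads enables for the equality (IsZeroCmp) case: a 7-byte compare can use two overlapping 4-byte loads instead of chaining 4-, 2- and 1-byte loads.

    #include <cstdint>
    #include <cstring>

    static bool eq7(const char *a, const char *b) { // memcmp(a, b, 7) == 0
      uint32_t a0, a1, b0, b1;
      std::memcpy(&a0, a, 4);     std::memcpy(&b0, b, 4);
      std::memcpy(&a1, a + 3, 4); std::memcpy(&b1, b + 3, 4); // re-reads byte 3
      return ((a0 ^ b0) | (a1 ^ b1)) == 0;
    }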
@@ -2718,7 +3001,14 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
+
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
// We currently support only fully-interleaved groups, with no gaps.
// TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2827,7 +3117,14 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
+
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -2945,7 +3242,9 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -2957,11 +3256,14 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
if (ST->hasAVX2())
return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
}
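
For reference, the shape of access these hooks cost, as a hedged example: a factor-3 load group (with VF=4 this is the VecTy = <12 x i32> case mentioned in the AVX512 comment). When a member is missing (a gap) or the accesses are predicated, the new UseMaskForGaps/UseMaskForCond flags are set and the AVX2/AVX512 fast paths defer to the base implementation.

    #include <cstdint>

    void deinterleave(const int32_t *in, int32_t *r, int32_t *g, int32_t *b,
                      int n) {
      for (int i = 0; i < n; ++i) { // three members at stride 3, no gaps
        r[i] = in[3 * i + 0];
        g[i] = in[3 * i + 1];
        b[i] = in[3 * i + 2];
      }
    }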