aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
diff options
context:
space:
mode:
author: Dimitry Andric <dim@FreeBSD.org> 2021-12-02 21:02:54 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2021-12-02 21:02:54 +0000
commitf65dcba83ce5035ab88a85fe17628b447eb56e1b (patch)
tree35f37bb72b3cfc6060193e66c76ee7c9478969b0 /llvm/lib/Target/X86/X86TargetTransformInfo.cpp
parent846a2208a8ab099f595fe7e8b2e6d54a7b5e67fb (diff)
downloadsrc-f65dcba83ce5035ab88a85fe17628b447eb56e1b.tar.gz
src-f65dcba83ce5035ab88a85fe17628b447eb56e1b.zip
Vendor import of llvm-project main llvmorg-14-init-11187-g222442ec2d71 (tag: vendor/llvm-project/llvmorg-14-init-11187-g222442ec2d71)
Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 251
1 files changed, 178 insertions, 73 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 06dacb638d16..869762b35196 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1584,54 +1584,98 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
// Mask sign extend has an instruction.
- { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
// Mask zero extend is a sext + shift.
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
- { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
- { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
- { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
- { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
};
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
+
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
+
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
@@ -1786,40 +1830,94 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
// Mask sign extend has an instruction.
- { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
// Mask zero extend is a sext + shift.
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
- { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
- { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
- { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
- { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
- { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
};
static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
+
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
+
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
@@ -3674,6 +3772,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
PromEltTyBits = 16; // promote to i16, AVX512BW.
break;
}
+ if (ST->hasDQI()) {
+ PromEltTyBits = 32; // promote to i32, AVX512F.
+ break;
+ }
return bailout();
default:
return bailout();
@@ -3969,7 +4071,9 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
// Even in the case of (loop invariant) stride whose value is not known at
// compile time, the address computation will not incur more than one extra
// ADD instruction.
- if (Ty->isVectorTy() && SE) {
+ if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
+ // TODO: AVX2 is the current cut-off because we don't have correct
+ // interleaving costs for prior ISA's.
if (!BaseT::isStridedAccess(Ptr))
return NumVectorInstToHideOverhead;
if (!BaseT::getConstantStrideStep(SE, Ptr))
@@ -5173,7 +5277,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
LegalVT.getVectorNumElements());
InstructionCost MemOpCost;
- if (UseMaskForCond || UseMaskForGaps)
+ bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
+ if (UseMaskedMemOp)
MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
AddressSpace, CostKind);
else
@@ -5183,9 +5288,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
unsigned VF = VecTy->getNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
- // FIXME: this is the most conservative estimate for the mask cost.
InstructionCost MaskCost;
- if (UseMaskForCond || UseMaskForGaps) {
+ if (UseMaskedMemOp) {
APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");
@@ -5193,10 +5297,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
DemandedLoadStoreElts.setBit(Index + Elm * Factor);
}
- Type *I8Type = Type::getInt8Ty(VecTy->getContext());
+ Type *I1Type = Type::getInt1Ty(VecTy->getContext());
MaskCost = getReplicationShuffleCost(
- I8Type, Factor, VF,
+ I1Type, Factor, VF,
UseMaskForGaps ? DemandedLoadStoreElts
: APInt::getAllOnes(VecTy->getNumElements()),
CostKind);
@@ -5207,7 +5311,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
// memory access, we need to account for the cost of And-ing the two masks
// inside the loop.
if (UseMaskForGaps) {
- auto *MaskVT = FixedVectorType::get(I8Type, VecTy->getNumElements());
+ auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
}
}
@@ -5248,9 +5352,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
NumOfLoadsInInterleaveGrp;
// About a half of the loads may be folded in shuffles when we have only
- // one result. If we have more than one result, we do not fold loads at all.
+ // one result. If we have more than one result, or the loads are masked,
+ // we do not fold loads at all.
unsigned NumOfUnfoldedLoads =
- NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+ UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
// Get a number of shuffle operations per result.
unsigned NumOfShufflesPerResult =