diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:02:54 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:02:54 +0000 |
commit | f65dcba83ce5035ab88a85fe17628b447eb56e1b (patch) | |
tree | 35f37bb72b3cfc6060193e66c76ee7c9478969b0 /llvm/lib/Target/X86/X86TargetTransformInfo.cpp | |
parent | 846a2208a8ab099f595fe7e8b2e6d54a7b5e67fb (diff) | |
download | src-f65dcba83ce5035ab88a85fe17628b447eb56e1b.tar.gz src-f65dcba83ce5035ab88a85fe17628b447eb56e1b.zip |
Vendor import of llvm-project main llvmorg-14-init-11187-g222442ec2d71 (branch: vendor/llvm-project/llvmorg-14-init-11187-g222442ec2d71)
Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 251 |
1 file changed, 178 insertions(+), 73 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 06dacb638d16..869762b35196 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1584,54 +1584,98 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, // Mask sign extend has an instruction. - { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, // Mask zero extend is a sext + shift. 
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, 
MVT::v16i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, - { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, - { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, }; static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + // Mask sign extend has an instruction. + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, + + // Mask zero extend is a sext + shift. 
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, @@ -1786,40 +1830,94 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { // Mask sign extend has an instruction. 
- { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, // Mask zero extend is a sext + shift. 
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v32i1, 
MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb }; static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { + // Mask sign extend has an instruction. + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, + + // Mask zero extend is a sext + shift. 
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, + + { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, @@ -3674,6 +3772,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, PromEltTyBits = 16; // promote to i16, AVX512BW. break; } + if (ST->hasDQI()) { + PromEltTyBits = 32; // promote to i32, AVX512F. + break; + } return bailout(); default: return bailout(); @@ -3969,7 +4071,9 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, // Even in the case of (loop invariant) stride whose value is not known at // compile time, the address computation will not incur more than one extra // ADD instruction. - if (Ty->isVectorTy() && SE) { + if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { + // TODO: AVX2 is the current cut-off because we don't have correct + // interleaving costs for prior ISA's. 
if (!BaseT::isStridedAccess(Ptr)) return NumVectorInstToHideOverhead; if (!BaseT::getConstantStrideStep(SE, Ptr)) @@ -5173,7 +5277,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), LegalVT.getVectorNumElements()); InstructionCost MemOpCost; - if (UseMaskForCond || UseMaskForGaps) + bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; + if (UseMaskedMemOp) MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); else @@ -5183,9 +5288,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( unsigned VF = VecTy->getNumElements() / Factor; MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); - // FIXME: this is the most conservative estimate for the mask cost. InstructionCost MaskCost; - if (UseMaskForCond || UseMaskForGaps) { + if (UseMaskedMemOp) { APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); @@ -5193,10 +5297,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( DemandedLoadStoreElts.setBit(Index + Elm * Factor); } - Type *I8Type = Type::getInt8Ty(VecTy->getContext()); + Type *I1Type = Type::getInt1Ty(VecTy->getContext()); MaskCost = getReplicationShuffleCost( - I8Type, Factor, VF, + I1Type, Factor, VF, UseMaskForGaps ? DemandedLoadStoreElts : APInt::getAllOnes(VecTy->getNumElements()), CostKind); @@ -5207,7 +5311,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( // memory access, we need to account for the cost of And-ing the two masks // inside the loop. 
if (UseMaskForGaps) { - auto *MaskVT = FixedVectorType::get(I8Type, VecTy->getNumElements()); + auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements()); MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); } } @@ -5248,9 +5352,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( NumOfLoadsInInterleaveGrp; // About a half of the loads may be folded in shuffles when we have only - // one result. If we have more than one result, we do not fold loads at all. + // one result. If we have more than one result, or the loads are masked, + // we do not fold loads at all. unsigned NumOfUnfoldedLoads = - NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; + UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; // Get a number of shuffle operations per result. unsigned NumOfShufflesPerResult = |