Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 4018 |
1 file changed, 2603 insertions, 1415 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ed975e9248a8..0f152968ddfd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25,7 +25,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -154,17 +156,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } - if (Subtarget.isTargetDarwin()) { - // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. - setUseUnderscoreSetJmp(false); - setUseUnderscoreLongJmp(false); - } else if (Subtarget.isTargetWindowsGNU()) { - // MS runtime is weird: it exports _setjmp, but longjmp! - setUseUnderscoreSetJmp(true); - setUseUnderscoreLongJmp(false); - } else { - setUseUnderscoreSetJmp(true); - setUseUnderscoreLongJmp(true); + if (Subtarget.getTargetTriple().isOSMSVCRT()) { + // MSVCRT doesn't have powi; fall back to pow + setLibcallName(RTLIB::POWI_F32, nullptr); + setLibcallName(RTLIB::POWI_F64, nullptr); } // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to @@ -217,72 +212,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ShiftOp , MVT::i64 , Custom); } - // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this - // operation. - setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); - if (!Subtarget.useSoftFloat()) { - // We have an algorithm for SSE2->double, and we turn this into a - // 64-bit FILD followed by conditional FADD for other targets. - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this + // operation. + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); - } else { - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand); - } - - // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have - // this operation. - setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); - setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - - if (!Subtarget.useSoftFloat()) { - // SSE has no i16 to fp conversion, only i32. - if (X86ScalarSSEf32) { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); - } else { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); - } - } else { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand); - } - - // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have - // this operation. 
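The "algorithm for SSE2->double" mentioned in these comments is worth spelling out. A self-contained sketch of the classic bit-splicing trick (my illustration, not the patch's DAG code): the unsigned 32-bit value is OR'd into the mantissa of 2^52 and the bias subtracted back out, giving an exact conversion without any unsigned-convert instruction.

#include <cstdint>
#include <cstring>

double Uint32ToDouble(uint32_t X) {
  // 0x4330000000000000 is the bit pattern of 2^52; its 52 mantissa bits are
  // zero, so OR-ing X into them produces exactly the double 2^52 + X.
  uint64_t Bits = 0x4330000000000000ULL | X;
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 4503599627370496.0; // subtract 2^52; the difference (double)X is exact
}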
- setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); - setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); - - if (!Subtarget.useSoftFloat()) { + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); + // We have an algorithm for SSE2->double, and we turn this into a + // 64-bit FILD followed by conditional FADD for other targets. + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); + + // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have + // this operation. + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); + // SSE has no i16 to fp conversion, only i32. We promote in the handler + // to allow f80 to use i16 and f64 to use i16 with sse1 only + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); + // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. - setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); - setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); - } - - // Handle FP_TO_UINT by promoting the destination to a larger signed - // conversion. - setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); - setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); - setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); - - if (!Subtarget.useSoftFloat()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - } + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); + + // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have + // this operation. + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + // FIXME: This doesn't generate invalid exception when it should. PR44019. + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); + + // Handle FP_TO_UINT by promoting the destination to a larger signed + // conversion. + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + // FIXME: This doesn't generate invalid exception when it should. PR44019. + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + // FIXME: This doesn't generate invalid exception when it should. PR44019. 
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + } + + // Handle address space casts between mixed sized pointers. + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { @@ -409,12 +401,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); - // These should be promoted to a larger select which is supported. - setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) @@ -619,6 +611,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } + // Handle constrained floating-point operations of scalar. + setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); @@ -659,6 +665,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLROUND, MVT::f80, Expand); setOperationAction(ISD::LRINT, MVT::f80, Expand); setOperationAction(ISD::LLRINT, MVT::f80, Expand); + + // Handle constrained floating-point operations of scalar. + setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten + // as Custom. + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); } // f128 uses xmm registers, but most operations require libcalls. 
@@ -668,22 +685,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps - setOperationAction(ISD::FADD, MVT::f128, Custom); - setOperationAction(ISD::FSUB, MVT::f128, Custom); - setOperationAction(ISD::FDIV, MVT::f128, Custom); - setOperationAction(ISD::FMUL, MVT::f128, Custom); - setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FADD, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); + setOperationAction(ISD::FDIV, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); + setOperationAction(ISD::FMUL, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); + setOperationAction(ISD::FMA, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); setOperationAction(ISD::FABS, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); - setOperationAction(ISD::FSIN, MVT::f128, Expand); - setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FSINCOS, MVT::f128, Expand); - setOperationAction(ISD::FSQRT, MVT::f128, Expand); - - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::FSIN, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); + setOperationAction(ISD::FCOS, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); + setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); + // No STRICT_FSINCOS + setOperationAction(ISD::FSQRT, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); + + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); // We need to custom handle any FP_ROUND with an f128 input, but // LegalizeDAG uses the result type to know when to run a custom handler. // So we have to list all legal floating point result types here. 
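The hunk is truncated right at that sentence; the pattern it describes is simply registering the custom FP_ROUND handler once per legal scalar FP result type. An illustrative sketch (mine, assuming f32/f64/f80 are the legal result types in play, per the surrounding code):

  for (auto ResVT : {MVT::f32, MVT::f64, MVT::f80})
    setOperationAction(ISD::FP_ROUND, ResVT, Custom);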
@@ -820,12 +847,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom); + setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -895,6 +925,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); @@ -933,37 +965,38 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); - - // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into - // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is - // split again based on the input type, this will cause an AssertSExt i16 to - // be emitted instead of an AssertZExt. This will allow packssdw followed by - // packuswb to be used to truncate to v8i8. This is necessary since packusdw - // isn't available until sse4.1. - setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); + } setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); + + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. 
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for @@ -1008,6 +1041,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // With AVX512, expanding (and promoting the shifts) is better. if (!Subtarget.hasAVX512()) setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + + setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { @@ -1029,11 +1068,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { - setOperationAction(ISD::FFLOOR, RoundedTy, Legal); - setOperationAction(ISD::FCEIL, RoundedTy, Legal); - setOperationAction(ISD::FTRUNC, RoundedTy, Legal); - setOperationAction(ISD::FRINT, RoundedTy, Legal); - setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); @@ -1072,6 +1116,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // i8 vectors are custom because the source register and source // memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + + if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { + // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can + // do the pre and post work in the vector domain. + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); + // We need to mark SINT_TO_FP as Custom even though we want to expand it + // so that DAG combine doesn't try to turn it into uint_to_fp.
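As an aside on the scalarization comment above: the per-element job that "scalarize ... using cvtsi2ss" leaves behind is the usual unsigned-to-float fallback. A standalone sketch of one lane (my illustration of the standard halving trick, not the patch's code):

#include <cstdint>

float Uint64ToFloat(uint64_t X) {
  // Top bit clear: a signed convert (cvtsi2ss on x86-64) is already exact.
  if (static_cast<int64_t>(X) >= 0)
    return static_cast<float>(static_cast<int64_t>(X));
  // Top bit set: halve with a sticky low bit so the final rounding is still
  // correct, convert signed, then double the result.
  uint64_t Half = (X >> 1) | (X & 1);
  return 2.0f * static_cast<float>(static_cast<int64_t>(Half));
}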
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); + } } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { @@ -1105,25 +1160,45 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FABS, VT, Custom); - setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal); + + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); @@ -1169,6 +1244,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); @@ -1180,8 +1257,10 @@ X86TargetLowering::X86TargetLowering(const 
X86TargetMachine &TM, if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, - MVT::v2f64, MVT::v4f64 }) + MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); + } } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -1233,6 +1312,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { @@ -1299,12 +1379,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); // There is no byte sized k-register load or store without AVX512DQ. 
if (!Subtarget.hasDQI()) { @@ -1331,6 +1417,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Custom); @@ -1372,21 +1460,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } - setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); - setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); - - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom); + for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) { + setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); + } + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal); + + setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); @@ -1420,11 +1524,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - 
setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); } @@ -1459,6 +1568,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use @@ -1470,8 +1581,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } @@ -1532,13 +1647,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. - // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? - setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, + Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, + Subtarget.hasVLX() ? 
Legal : Custom); for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); @@ -1563,12 +1690,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::SINT_TO_FP, VT, Legal); - setOperationAction(ISD::UINT_TO_FP, VT, Legal); - setOperationAction(ISD::FP_TO_SINT, VT, Legal); - setOperationAction(ISD::FP_TO_UINT, VT, Legal); - - setOperationAction(ISD::MUL, VT, Legal); + setOperationAction(ISD::SINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::UINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FP_TO_SINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FP_TO_UINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::MUL, VT, Legal); } } @@ -1739,12 +1877,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. - setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && + isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { @@ -1828,8 +1968,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : - {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, - ISD::FLOG10, ISD::FPOW, ISD::FSIN}) + {ISD::FCEIL, ISD::STRICT_FCEIL, + ISD::FCOS, ISD::STRICT_FCOS, + ISD::FEXP, ISD::STRICT_FEXP, + ISD::FFLOOR, ISD::STRICT_FFLOOR, + ISD::FREM, ISD::STRICT_FREM, + ISD::FLOG, ISD::STRICT_FLOG, + ISD::FLOG10, ISD::STRICT_FLOG10, + ISD::FPOW, ISD::STRICT_FPOW, + ISD::FSIN, ISD::STRICT_FSIN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); @@ -1870,6 +2017,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::STRICT_SINT_TO_FP); + setTargetDAGCombine(ISD::STRICT_UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1901,6 +2050,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); + + // Default to having -disable-strictnode-mutation on + IsStrictFPEnabled = true; } // This has so far only been implemented for 64-bit MachO. 
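On the IsStrictFPEnabled default flipped on just above: strict nodes exist so that rounding-mode reads and FP exception flags survive instruction selection. A self-contained reminder of the observable behavior at stake (mine, not from the patch):

#include <cfenv>
#include <cstdio>

int main() {
  // A conforming build also wants #pragma STDC FENV_ACCESS ON (or the
  // compiler's equivalent flag); volatile keeps the divide from folding.
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile double Zero = 0.0;
  volatile double R = 1.0 / Zero; // must raise FE_DIVBYZERO
  (void)R;
  std::printf("divbyzero raised: %d\n", std::fetestexcept(FE_DIVBYZERO) != 0);
}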
@@ -1910,7 +2062,7 @@ bool X86TargetLowering::useLoadStackGuardNode() const { bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. - return Subtarget.getTargetTriple().isOSMSVCRT(); + return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, @@ -1946,9 +2098,13 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return MVT::i8; + // Split v64i1 vectors if we don't have v64i8 available. + if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + CC != CallingConv::X86_RegCall) + return MVT::v32i1; // FIXME: Should we just make these types legal and custom split operations? - if ((VT == MVT::v32i16 || VT == MVT::v64i8) && - Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && + Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) return MVT::v16i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -1966,9 +2122,13 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return VT.getVectorNumElements(); + // Split v64i1 vectors if we don't have v64i8 available. + if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + CC != CallingConv::X86_RegCall) + return 2; // FIXME: Should we just make these types legal and custom split operations? - if ((VT == MVT::v32i16 || VT == MVT::v64i8) && - Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && + Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -1988,6 +2148,15 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } + // Split v64i1 vectors if we don't have v64i8 available. + if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + CC != CallingConv::X86_RegCall) { + RegisterVT = MVT::v32i1; + IntermediateVT = MVT::v32i1; + NumIntermediates = 2; + return 2; + } + return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } @@ -2383,6 +2552,10 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); + const TargetMachine &TM = getTargetMachine(); + if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS)) + return false; + return SrcAS < 256 && DestAS < 256; } @@ -2520,18 +2693,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); - // If this is x86-64, and we disabled SSE, we can't return FP values, - // or SSE or MMX vectors. - if ((ValVT == MVT::f32 || ValVT == MVT::f64 || - VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { + // Report an error if we have attempted to return a value via an XMM + // register and SSE was disabled. 
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } else if (ValVT == MVT::f64 && - (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { - // Likewise we can't return F64 values with SSE1 only. gcc does so, but - // llvm-gcc has never done it right and no one has noticed, so this - // should be OK for now. + } else if (!Subtarget.hasSSE2() && + X86::FR64XRegClass.contains(VA.getLocReg()) && + ValVT == MVT::f64) { + // When returning a double via an XMM register, report an error if SSE2 is + // not enabled. errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } @@ -2826,7 +2997,6 @@ SDValue X86TargetLowering::LowerCallResult( const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; - bool Is64Bit = Subtarget.is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); @@ -2845,15 +3015,22 @@ SDValue X86TargetLowering::LowerCallResult( RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); } - // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && - ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { + // Report an error if there was an attempt to return FP values via XMM + // registers. + if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } else if (CopyVT == MVT::f64 && - (Is64Bit && !Subtarget.hasSSE2())) { + if (VA.getLocReg() == X86::XMM1) + VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. + else + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (!Subtarget.hasSSE2() && + X86::FR64XRegClass.contains(VA.getLocReg()) && + CopyVT == MVT::f64) { errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + if (VA.getLocReg() == X86::XMM1) + VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. + else + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -2895,6 +3072,9 @@ SDValue X86TargetLowering::LowerCallResult( Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); } + if (VA.getLocInfo() == CCValAssign::BCvt) + Val = DAG.getBitcast(VA.getValVT(), Val); + InVals.push_back(Val); } @@ -2993,9 +3173,7 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { - auto Attr = - CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); - if (!CI->isTailCall() || Attr.getValueAsString() == "true") + if (!CI->isTailCall()) return false; ImmutableCallSite CS(CI); @@ -3464,8 +3642,8 @@ SDValue X86TargetLowering::LowerFormalArguments( FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); - // Conservatively forward AL on x86_64, since it might be used for varargs. 
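The AL convention referenced here is the SysV x86-64 rule that a caller of a variadic function passes, in %al, an upper bound on the number of XMM registers holding arguments; Win64 has no such rule, which is what the new !IsWin64 guard below reflects. A plain C++ reminder of the consumer side (illustrative only):

#include <cstdarg>
#include <cstdio>

double Sum(int N, ...) {
  va_list Ap;
  va_start(Ap, N);
  double S = 0;
  for (int I = 0; I != N; ++I)
    S += va_arg(Ap, double); // on SysV this walks the XMM register save area
  va_end(Ap);
  return S;
}

int main() { std::printf("%f\n", Sum(2, 1.0, 2.0)); } // caller sets %al = 2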
- if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { + // Forward AL for SysV x86_64 targets, since it is used for varargs. + if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } @@ -3618,7 +3796,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); - auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || @@ -3634,9 +3811,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); - if (Attr.getValueAsString() == "true") - isTailCall = false; - if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT @@ -3728,7 +3902,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, "the only memory argument"); } - if (!IsSibcall) + if (!IsSibcall && !IsMustTail) Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, NumBytes - NumBytesToPush, dl); @@ -4013,7 +4187,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector<SDValue, 8> Ops; - if (!IsSibcall && isTailCall) { + if (!IsSibcall && isTailCall && !IsMustTail) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); @@ -4183,23 +4357,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned -X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, - SelectionDAG& DAG) const { - const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - unsigned StackAlignment = TFI.getStackAlignment(); - uint64_t AlignMask = StackAlignment - 1; - int64_t Offset = StackSize; - unsigned SlotSize = RegInfo->getSlotSize(); - if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { - // Number smaller than 12 so just add the difference. - Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); - } else { - // Mask out lower bits, add stackalignment once plus the 12 bytes. - Offset = ((~AlignMask) & Offset) + StackAlignment + - (StackAlignment-SlotSize); - } - return Offset; +X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, + SelectionDAG &DAG) const { + const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); + const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); + assert(StackSize % SlotSize == 0 && + "StackSize must be a multiple of SlotSize"); + return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; } /// Return true if the given stack call argument is already available in the @@ -4643,8 +4807,8 @@ bool X86::isCalleePop(CallingConv::ID CallingConv, } } -/// Return true if the condition is an unsigned comparison operation. 
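Stepping back to the GetAlignedArgumentStackSize rewrite above: the new one-liner is easiest to sanity-check with concrete numbers. A compile-time sketch (mine; it assumes a 32-bit target, i.e. 4-byte slots and 16-byte stack alignment, which is where the "16n + 12" comment comes from):

#include <cstdint>

constexpr uint64_t AlignTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

// Mirrors: alignTo(StackSize + SlotSize, StackAlignment) - SlotSize
constexpr uint64_t AlignedArgSize(uint64_t StackSize) {
  return AlignTo(StackSize + 4, 16) - 4; // +4 reserves the return-address slot
}

static_assert(AlignedArgSize(20) == 28, "28 == 16*1 + 12");
static_assert(AlignedArgSize(28) == 28, "already of the 16n + 12 form");
static_assert(AlignedArgSize(32) == 44, "44 == 16*2 + 12");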
-static bool isX86CCUnsigned(unsigned X86CC) { +/// Return true if the condition is a signed comparison operation. +static bool isX86CCSigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); @@ -4654,12 +4818,12 @@ case X86::COND_A: case X86::COND_BE: case X86::COND_AE: - return true; + return false; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: - return false; + return true; } } @@ -4700,7 +4864,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, // X >= 0 -> X == 0, jump on !sign. return X86::COND_NS; } - if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) { + if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } @@ -4949,12 +5113,6 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); } -bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, - bool IsSigned) const { - // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available. - return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov(); -} - bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) @@ -5334,15 +5492,18 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, static bool canWidenShuffleElements(ArrayRef<int> Mask, const APInt &Zeroable, + bool V2IsZero, SmallVectorImpl<int> &WidenedMask) { - SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end()); - for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { - if (TargetMask[i] == SM_SentinelUndef) - continue; - if (Zeroable[i]) - TargetMask[i] = SM_SentinelZero; + // Create an alternative mask with info about zeroable elements. + // Here we do not set undef elements as zeroable. + SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end()); + if (V2IsZero) { + assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); + for (int i = 0, Size = Mask.size(); i != Size; ++i) + if (Mask[i] != SM_SentinelUndef && Zeroable[i]) + ZeroableMask[i] = SM_SentinelZero; } - return canWidenShuffleElements(TargetMask, WidenedMask); + return canWidenShuffleElements(ZeroableMask, WidenedMask); } static bool canWidenShuffleElements(ArrayRef<int> Mask) { @@ -5764,11 +5925,29 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + + // Do an optimization for the most frequently used types.
+ if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) { + APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems); + Mask0.flipAllBits(); + SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems)); + SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0); + Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0); + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); + + // Reduce to original width if needed. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + } + + // Clear the upper bits of the subvector and move it to its insert position. SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); - unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); @@ -5850,7 +6029,7 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, "Expected VTs to be the same size!"); unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, - std::max(128U, VT.getSizeInBits() / Scale)); + std::max(128U, (unsigned)VT.getSizeInBits() / Scale)); InVT = In.getValueType(); } @@ -6719,9 +6898,97 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return true; } +/// Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static void computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2, + APInt &KnownUndef, APInt &KnownZero) { + int Size = Mask.size(); + KnownUndef = KnownZero = APInt::getNullValue(Size); + + V1 = peekThroughBitcasts(V1); + V2 = peekThroughBitcasts(V2); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + int VectorSizeInBits = V1.getValueSizeInBits(); + int ScalarSizeInBits = VectorSizeInBits / Size; + assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); + + for (int i = 0; i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0) { + KnownUndef.setBit(i); + continue; + } + if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + KnownZero.setBit(i); + continue; + } + + // Determine shuffle input and normalize the mask. + SDValue V = M < Size ? V1 : V2; + M %= Size; + + // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. + if (V.getOpcode() != ISD::BUILD_VECTOR) + continue; + + // If the BUILD_VECTOR has fewer elements than the bitcasted portion of + // the (larger) source element must be UNDEF/ZERO.
+ if ((Size % V.getNumOperands()) == 0) { + int Scale = Size / V->getNumOperands(); + SDValue Op = V.getOperand(M / Scale); + if (Op.isUndef()) + KnownUndef.setBit(i); + if (X86::isZeroNode(Op)) + KnownZero.setBit(i); + else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { + APInt Val = Cst->getAPIntValue(); + Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); + if (Val == 0) + KnownZero.setBit(i); + } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { + APInt Val = Cst->getValueAPF().bitcastToAPInt(); + Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); + if (Val == 0) + KnownZero.setBit(i); + } + continue; + } + + // If the BUILD_VECTOR has more elements than all the (smaller) source + // elements must be UNDEF or ZERO. + if ((V.getNumOperands() % Size) == 0) { + int Scale = V->getNumOperands() / Size; + bool AllUndef = true; + bool AllZero = true; + for (int j = 0; j < Scale; ++j) { + SDValue Op = V.getOperand((M * Scale) + j); + AllUndef &= Op.isUndef(); + AllZero &= X86::isZeroNode(Op); + } + if (AllUndef) + KnownUndef.setBit(i); + if (AllZero) + KnownZero.setBit(i); + continue; + } + } +} + /// Decode a target shuffle mask and inputs and see if any values are /// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. +/// FIXME: Merge this with computeZeroableShuffleElements? static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, APInt &KnownUndef, APInt &KnownZero) { @@ -6741,7 +7008,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); - assert((VT.getSizeInBits() % Mask.size()) == 0 && + assert((VT.getSizeInBits() % Size) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Size; @@ -6810,7 +7077,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, // Replace target shuffle mask elements with known undef/zero sentinels. static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask, const APInt &KnownUndef, - const APInt &KnownZero) { + const APInt &KnownZero, + bool ResolveKnownZeros = true) { unsigned NumElts = Mask.size(); assert(KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); @@ -6818,7 +7086,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask, for (unsigned i = 0; i != NumElts; ++i) { if (KnownUndef[i]) Mask[i] = SM_SentinelUndef; - else if (KnownZero[i]) + else if (ResolveKnownZeros && KnownZero[i]) Mask[i] = SM_SentinelZero; } } @@ -8306,7 +8574,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector.
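Back to the new computeZeroableShuffleElements: a toy model (mine, plain ints instead of APInt/SDValue) of what it reports for the easy case where the second input is an all-zeros build vector:

#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;                    // elements per input vector
  std::vector<int> Mask = {0, 4, -1, 6}; // -1 plays SM_SentinelUndef
  bool V2IsZero = true;                  // pretend V2 is a zero vector
  unsigned KnownUndef = 0, KnownZero = 0;
  for (int I = 0; I != Size; ++I) {
    if (Mask[I] < 0)
      KnownUndef |= 1u << I;             // undef mask element
    else if (Mask[I] >= Size && V2IsZero)
      KnownZero |= 1u << I;              // lane reads the zero vector
  }
  std::printf("undef=%#x zero=%#x\n", KnownUndef, KnownZero); // undef=0x4 zero=0xa
}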
@@ -8552,7 +8820,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, ImmH = DAG.getBitcast(MVT::v32i1, ImmH); DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); } else { - MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U)); + MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; DstVec = DAG.getBitcast(VecVT, Imm); @@ -10130,13 +10398,18 @@ static bool isNoopShuffleMask(ArrayRef<int> Mask) { return true; } -/// Test whether there are elements crossing 128-bit lanes in this +/// Test whether there are elements crossing LaneSizeInBits lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. -static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { - int LaneSize = 128 / VT.getScalarSizeInBits(); +static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, + unsigned ScalarSizeInBits, + ArrayRef<int> Mask) { + assert(LaneSizeInBits && ScalarSizeInBits && + (LaneSizeInBits % ScalarSizeInBits) == 0 && + "Illegal shuffle lane size"); + int LaneSize = LaneSizeInBits / ScalarSizeInBits; int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) @@ -10144,6 +10417,12 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { return false; } +/// Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { + return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask); +} + /// Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same @@ -10424,84 +10703,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } -/// Compute whether each element of a shuffle is zeroable. -/// -/// A "zeroable" vector shuffle element is one which can be lowered to zero. -/// Either it is an undef element in the shuffle mask, the element of the input -/// referenced is undef, or the element of the input referenced is known to be -/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle -/// as many lanes with this technique as possible to simplify the remaining -/// shuffle. -static APInt computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - APInt Zeroable(Mask.size(), 0); - V1 = peekThroughBitcasts(V1); - V2 = peekThroughBitcasts(V2); - - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - int VectorSizeInBits = V1.getValueSizeInBits(); - int ScalarSizeInBits = VectorSizeInBits / Mask.size(); - assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); - - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - // Handle the easy cases. - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable.setBit(i); - continue; - } - - // Determine shuffle input and normalize the mask. - SDValue V = M < Size ? V1 : V2; - M %= Size; - - // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. 
-    if (V.getOpcode() != ISD::BUILD_VECTOR)
-      continue;
-
-    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
-    // the (larger) source element must be UNDEF/ZERO.
-    if ((Size % V.getNumOperands()) == 0) {
-      int Scale = Size / V->getNumOperands();
-      SDValue Op = V.getOperand(M / Scale);
-      if (Op.isUndef() || X86::isZeroNode(Op))
-        Zeroable.setBit(i);
-      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
-        APInt Val = Cst->getAPIntValue();
-        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
-        Val = Val.getLoBits(ScalarSizeInBits);
-        if (Val == 0)
-          Zeroable.setBit(i);
-      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
-        APInt Val = Cst->getValueAPF().bitcastToAPInt();
-        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
-        Val = Val.getLoBits(ScalarSizeInBits);
-        if (Val == 0)
-          Zeroable.setBit(i);
-      }
-      continue;
-    }
-
-    // If the BUILD_VECTOR has more elements, then all the (smaller) source
-    // elements must be UNDEF or ZERO.
-    if ((V.getNumOperands() % Size) == 0) {
-      int Scale = V->getNumOperands() / Size;
-      bool AllZeroable = true;
-      for (int j = 0; j < Scale; ++j) {
-        SDValue Op = V.getOperand((M * Scale) + j);
-        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
-      }
-      if (AllZeroable)
-        Zeroable.setBit(i);
-      continue;
-    }
-  }
-
-  return Zeroable;
-}
-
 // The shuffle result is as follows:
 // 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements are in
 // ascending order. Each Zeroable element corresponds to a particular
 // Mask element.
@@ -10616,11 +10817,11 @@ static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
   return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector,
                      VMask);
 }
 
-static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
-                                        unsigned &UnpackOpcode, bool IsUnary,
-                                        ArrayRef<int> TargetMask,
-                                        const SDLoc &DL, SelectionDAG &DAG,
-                                        const X86Subtarget &Subtarget) {
+static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+                                  unsigned &UnpackOpcode, bool IsUnary,
+                                  ArrayRef<int> TargetMask, const SDLoc &DL,
+                                  SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
   int NumElts = VT.getVectorNumElements();
 
   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
@@ -10728,8 +10929,8 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
   return SDValue();
 }
 
-static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
-                                      int Delta) {
+static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+                                int Delta) {
   int Size = (int)Mask.size();
   int Split = Size / Delta;
   int TruncatedVectorStart = SwappedOps ? Size : 0;
@@ -10814,8 +11015,8 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
 
   // The first half/quarter of the mask should refer to every second/fourth
   // element of the vector truncated and bitcasted.
-  if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
-      !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+  if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+      !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
     return SDValue();
 
   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
@@ -10823,11 +11024,10 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
 
 // X86 has dedicated pack instructions that can handle specific truncation
 // operations: PACKSS and PACKUS.
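// (For reference: PACKSSWB takes two v8i16 inputs and produces one v16i8 by
// saturating each word to a signed byte; PACKUSWB saturates to an unsigned
// byte. A truncating shuffle is only expressible as a PACK when the source
// values are already known to fit the narrow range, e.g. enough sign bits
// for PACKSS or known-zero upper bits for PACKUS, which is what the matcher
// below has to establish.)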
-static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, - SDValue &V2, unsigned &PackOpcode, - ArrayRef<int> TargetMask, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, + unsigned &PackOpcode, ArrayRef<int> TargetMask, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); MVT PackSVT = MVT::getIntegerVT(BitSize * 2); @@ -10880,8 +11080,8 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; - if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, - Subtarget)) + if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, + Subtarget)) return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), DAG.getBitcast(PackVT, V2)); @@ -10972,10 +11172,10 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG); -static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, - MutableArrayRef<int> Mask, - const APInt &Zeroable, bool &ForceV1Zero, - bool &ForceV2Zero, uint64_t &BlendMask) { +static bool matchShuffleAsBlend(SDValue V1, SDValue V2, + MutableArrayRef<int> Mask, + const APInt &Zeroable, bool &ForceV1Zero, + bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = @@ -11038,8 +11238,8 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector<int, 64> Mask(Original.begin(), Original.end()); - if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, - BlendMask)) + if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, + BlendMask)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. @@ -11161,7 +11361,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v32i16: case MVT::v64i8: { // Attempt to lower to a bitmask if we can. Only if not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize) { if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -11609,9 +11809,11 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, } /// Try to lower a vector shuffle as a byte shift sequence. -static SDValue lowerVectorShuffleAsByteShiftMask( - const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { +static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(VT.is128BitVector() && "Only 128-bit vectors supported"); @@ -14056,8 +14258,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return BitBlend; // Try to use byte shift instructions to mask. 
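// (A byte shift mask zeroes a contiguous run of lanes with a PSLLDQ/PSRLDQ
// pair: shifting the full 128-bit register left and then right by byte
// amounts, or vice versa, moves the kept bytes back into place while
// shifting in zeros at the ends; see lowerShuffleAsByteShiftMask above.)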
- if (SDValue V = lowerVectorShuffleAsByteShiftMask( - DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // Try to lower by permuting the inputs into an unpack instruction. @@ -14318,8 +14520,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; // Try to use byte shift instructions to mask. - if (SDValue V = lowerVectorShuffleAsByteShiftMask( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly @@ -14686,6 +14888,36 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, DAG); } +// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). +// TODO: Extend to support v8f32 (+ 512-bit shuffles). +static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT == MVT::v4f64 && "Only for v4f64 shuffles"); + + int LHSMask[4] = {-1, -1, -1, -1}; + int RHSMask[4] = {-1, -1, -1, -1}; + unsigned SHUFPMask = 0; + + // As SHUFPD uses a single LHS/RHS element per lane, we can always + // perform the shuffle once the lanes have been shuffled in place. + for (int i = 0; i != 4; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + int LaneBase = i & ~1; + auto &LaneMask = (i & 1) ? RHSMask : LHSMask; + LaneMask[LaneBase + (M & 1)] = M; + SHUFPMask |= (M & 1) << i; + } + + SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask); + SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask); + return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS, + DAG.getTargetConstant(SHUFPMask, DL, MVT::i8)); +} + /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a lane permutation followed by a per-lane permutation. /// @@ -14764,13 +14996,22 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle( int Size = Mask.size(); int LaneSize = Size / 2; + // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). + // Only do this if the elements aren't all from the lower lane, + // otherwise we're (probably) better off doing a split. + if (VT == MVT::v4f64 && + !all_of(Mask, [LaneSize](int M) { return M < LaneSize; })) + if (SDValue V = + lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG)) + return V; + // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. 
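// (Worked example for the SHUFPD fold above, with a hypothetical v4f64 mask
// <2, 6, 0, 4>: the loop assigns LHSMask = <2, u, 0, u>,
// RHSMask = <6, u, 4, u> and SHUFPMask = 0. Each operand shuffle now needs
// only one element per 128-bit lane, which VPERM2F128/VPERMPD can provide,
// and SHUFPD then takes the even result elements from LHS and the odd ones
// from RHS.)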
   if (!Subtarget.hasAVX2()) {
     bool LaneCrossing[2] = {false, false};
     for (int i = 0; i < Size; ++i)
-      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
         LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
     if (!LaneCrossing[0] || !LaneCrossing[1])
       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
@@ -14778,7 +15019,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
     bool LaneUsed[2] = {false, false};
     for (int i = 0; i < Size; ++i)
       if (Mask[i] >= 0)
-        LaneUsed[(Mask[i] / LaneSize)] = true;
+        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
     if (!LaneUsed[0] || !LaneUsed[1])
       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
   }
@@ -14817,8 +15058,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
   if (Subtarget.hasAVX2() && V2.isUndef())
     return SDValue();
 
+  bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
+
   SmallVector<int, 4> WidenedMask;
-  if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
+  if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
     return SDValue();
 
   bool IsLowZero = (Zeroable & 0x3) == 0x3;
@@ -15637,6 +15880,18 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                              Zeroable, Subtarget, DAG))
     return Op;
 
+  // If we have lane crossing shuffles AND they don't all come from the lower
+  // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+  // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
+  // canonicalizes to a blend of splats, which isn't necessary for this
+  // combine.
+  if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
+      !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
+      (V1.getOpcode() != ISD::BUILD_VECTOR) &&
+      (V2.getOpcode() != ISD::BUILD_VECTOR))
+    if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
+                                                       Mask, DAG))
+      return Op;
+
   // If we have one input in place, then we can permute the other input and
   // blend the result.
   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
@@ -16950,6 +17205,10 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
     break;
   case MVT::v64i1:
+    // Fall back to scalarization. FIXME: We can do better if the shuffle
+    // can be partitioned cleanly.
+    if (!Subtarget.useBWIRegs())
+      return SDValue();
     ExtVT = MVT::v64i8;
     break;
   }
@@ -17039,8 +17298,8 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
 /// above in helper routines. The canonicalization attempts to widen shuffles
 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
 /// s.t. only one of the two inputs needs to be tested, etc.
-static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
-                                  SelectionDAG &DAG) {
+static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> OrigMask = SVOp->getMask();
   SDValue V1 = Op.getOperand(0);
@@ -17086,29 +17345,22 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   // We actually see shuffles that are entirely re-arrangements of a set of
   // zero inputs. This mostly happens while decomposing complex shuffles into
   // simple ones. Directly lower these as a buildvector of zeros.
- APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2); + APInt KnownUndef, KnownZero; + computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero); + + APInt Zeroable = KnownUndef | KnownZero; if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode()); - // Create an alternative mask with info about zeroable elements. - // Here we do not set undef elements as zeroable. - SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end()); - if (V2IsZero) { - assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); - for (int i = 0; i != NumElements; ++i) - if (OrigMask[i] != SM_SentinelUndef && Zeroable[i]) - ZeroableMask[i] = SM_SentinelZero; - } - // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector<int, 16> WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && - canWidenShuffleElements(ZeroableMask, WidenedMask)) { + canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) { // Shuffle mask widening should not interfere with a broadcast opportunity // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this @@ -18307,7 +18559,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); @@ -18328,8 +18580,13 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); - SDValue Src = Op.getOperand(0); + Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP || + Op.getOpcode() == ISD::UINT_TO_FP) && + "Unexpected opcode!"); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 
1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); @@ -18346,7 +18603,17 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); + if (IsStrict) { + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other}, + {Op.getOperand(0), InVec}); + SDValue Chain = CvtVec.getValue(1); + SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, + DAG.getIntPtrConstant(0, dl)); + return DAG.getMergeValues({Value, Chain}, dl); + } + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } @@ -18415,44 +18682,157 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, DAG.getIntPtrConstant(0, DL)); } +static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDLoc DL(Op); + bool IsStrict = Op->isStrictFPOpcode(); + MVT VT = Op->getSimpleValueType(0); + SDValue Src = Op->getOperand(IsStrict ? 1 : 0); + + if (Subtarget.hasDQI()) { + assert(!Subtarget.hasVLX() && "Unexpected features"); + + assert((Src.getSimpleValueType() == MVT::v2i64 || + Src.getSimpleValueType() == MVT::v4i64) && + "Unsupported custom type"); + + // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type. + assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) && + "Unexpected VT!"); + MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; + + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64) + : DAG.getUNDEF(MVT::v8i64); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src, + DAG.getIntPtrConstant(0, DL)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other}, + {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; + } + + bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP || + Op->getOpcode() == ISD::STRICT_SINT_TO_FP; + if (VT != MVT::v4f32 || IsSigned) + return SDValue(); + + SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64); + SDValue One = DAG.getConstant(1, DL, MVT::v4i64); + SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64, + DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One), + DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One)); + SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT); + SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src); + SmallVector<SDValue, 4> SignCvts(4); + SmallVector<SDValue, 4> Chains(4); + for (int i = 0; i != 4; ++i) { + SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, + DAG.getIntPtrConstant(i, DL)); + if (IsStrict) { + SignCvts[i] = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Src}); + Chains[i] = SignCvts[i].getValue(1); + } else { + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src); + } + } + SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts); + + SDValue Slow, Chain; + if (IsStrict) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other}, + 
                              {Chain, SignCvt, SignCvt});
+    Chain = Slow.getValue(1);
+  } else {
+    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
+  }
+
+  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
+  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
+
+  if (IsStrict)
+    return DAG.getMergeValues({Cvt, Chain}, DL);
+
+  return Cvt;
+}
+
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
-  SDValue Src = Op.getOperand(0);
+  bool IsStrict = Op->isStrictFPOpcode();
+  unsigned OpNo = IsStrict ? 1 : 0;
+  SDValue Src = Op.getOperand(OpNo);
+  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
   MVT SrcVT = Src.getSimpleValueType();
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
-  if (VT == MVT::f128)
-    return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
-
   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
     return Extract;
 
   if (SrcVT.isVector()) {
     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+      // Note: since v2f64 is a legal type, we don't need to zero extend the
+      // source for strict FP.
+      if (IsStrict)
+        return DAG.getNode(
+            X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+            {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+                                DAG.getUNDEF(SrcVT))});
       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                      DAG.getUNDEF(SrcVT)));
     }
 
+    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
+      return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
+
     return SDValue();
   }
 
   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
          "Unknown SINT_TO_FP to lower!");
 
+  bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
+
   // These are really Legal; return the operand so the caller accepts it as
   // Legal.
-  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
+  if (SrcVT == MVT::i32 && UseSSEReg)
     return Op;
-  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit())
+  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
     return Op;
 
   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
     return V;
 
-  SDValue ValueToStore = Op.getOperand(0);
-  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
-      !Subtarget.is64Bit())
+  // SSE doesn't have an i16 conversion so we need to promote.
+  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
+    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
+    if (IsStrict)
+      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+                         {Chain, Ext});
+
+    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
+  }
+
+  if (VT == MVT::f128)
+    return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+
+  SDValue ValueToStore = Src;
+  if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
     // Bitcasting to f64 here allows us to do a single 64-bit store from
     // an SSE register, avoiding the store forwarding penalty that would come
     // with two 32-bit stores.
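The unsigned vXi64-to-v4f32 path built in lowerINT_TO_FP_vXi64 above is the standard halve-and-double trick. A scalar model in plain C++ (hypothetical helper name, not the lowering itself):

    #include <cstdint>

    // Inputs with the sign bit set are halved with a "sticky" low bit so a
    // signed convert plus one add reproduces the correctly rounded result.
    static float u64ToF32(uint64_t X) {
      if ((int64_t)X >= 0)
        return (float)(int64_t)X;           // plain SINT_TO_FP suffices
      uint64_t Halved = (X >> 1) | (X & 1); // OR keeps the lost bit for rounding
      float F = (float)(int64_t)Halved;     // the per-element SINT_TO_FP above
      return F + F;                         // the final FADD(SignCvt, SignCvt)
    }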
@@ -18463,13 +18843,18 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getStore( - DAG.getEntryNode(), dl, ValueToStore, StackSlot, + Chain = DAG.getStore( + Chain, dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); - return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); + std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); + + if (IsStrict) + return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); + + return Tmp.first; } -SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, +std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD @@ -18498,9 +18883,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, FILDOps, SrcVT, LoadMMO); + Chain = Result.getValue(1); if (useSSE) { - Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is glued to the FILD_FLAG. This @@ -18522,9 +18907,10 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); + Chain = Result.getValue(1); } - return Result; + return { Result, Chain }; } /// Horizontal vector math instructions may be slower than normal math with @@ -18532,7 +18918,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, /// implementation, and likely shuffle complexity of the alternate sequence. static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool IsOptimizingSize = DAG.shouldOptForSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } @@ -18553,6 +18939,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, #endif */ + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); @@ -18573,8 +18961,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Op.getOperand(0)); + SDValue XR1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -18587,51 +18975,81 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + SDValue Sub; + SDValue Chain; // TODO: Are there any fast-math-flags to propagate here? 
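// (Numerically, after the unpack XR2F holds the double pair
// { 2^52 + lo32, 2^84 + hi32 * 2^32 } and CLod1 holds { 2^52, 2^84 }; the
// FSUB below cancels the biases exactly, leaving { lo32, hi32 * 2^32 }, and
// the horizontal/shuffled add then sums the halves with a single rounding.
// E.g. the hypothetical input 2^63 becomes { 0, 2^63 } -> 2^63.)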
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + if (IsStrict) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), XR2F, CLod1}); + Chain = Sub.getValue(1); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { + if (!IsStrict && Subtarget.hasSSE3() && + shouldUseHorizontalOp(true, DAG, Subtarget)) { + // FIXME: Do we need a STRICT version of FHADD? Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + if (IsStrict) { + Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, + {Chain, Shuffle, Sub}); + Chain = Result.getValue(1); + } else + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Result, Chain}, dl); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, - DAG.getIntPtrConstant(0, dl)); + return Result; } /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. - SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - Op.getOperand(0)); + SDValue Load = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); - Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getBitcast(MVT::v2f64, Load), - DAG.getIntPtrConstant(0, dl)); - // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, - DAG.getBitcast(MVT::v2i64, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), + DAG.getBitcast(MVT::v2i64, Load), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + if (Op.getNode()->isStrictFPOpcode()) { + // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Chain = Op.getOperand(0); + SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Chain, Or, Bias}); + + if (Op.getValueType() == Sub.getValueType()) + return Sub; + + // Handle final rounding. + std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound( + Sub, Sub.getValue(1), dl, Op.getSimpleValueType()); + + return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl); + } + // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); @@ -18646,38 +19064,123 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); - SDValue N0 = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + + SDValue N0 = Op.getOperand(IsStrict ? 
1 : 0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); - // Legalize to v4i32 type. - N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, - DAG.getUNDEF(MVT::v2i32)); + if (Subtarget.hasAVX512()) { + if (!Subtarget.hasVLX()) { + // Let generic type legalization widen this. + if (!IsStrict) + return SDValue(); + // Otherwise pad the integer input with 0s and widen the operation. + N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getConstant(0, DL, MVT::v2i32)); + SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other}, + {Op.getOperand(0), N0}); + SDValue Chain = Res.getValue(1); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res, + DAG.getIntPtrConstant(0, DL)); + return DAG.getMergeValues({Res, Chain}, DL); + } - if (Subtarget.hasAVX512()) + // Legalize to v4i32 type. + N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getUNDEF(MVT::v2i32)); + if (IsStrict) + return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), N0}); return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); + } - // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, - // but using v2i32 to v2f64 with X86ISD::CVTSI2P. - SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); - SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); - - // Two to the power of half-word-size. - SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64); - - // Clear upper part of LO, lower HI. - SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); - SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); - - SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); - fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); - SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); + // Zero extend to 2i64, OR with the floating point representation of 2^52. + // This gives us the floating point equivalent of 2^52 + the i32 integer + // since double has 52-bits of mantissa. Then subtract 2^52 in floating + // point leaving just our i32 integers in double format. + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0); + SDValue VBias = + DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn, + DAG.getBitcast(MVT::v2i64, VBias)); + Or = DAG.getBitcast(MVT::v2f64, Or); - // Add the two halves. - return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); + if (IsStrict) + return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), Or, VBias}); + return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias); } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + SDLoc DL(Op); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue V = Op->getOperand(IsStrict ? 1 : 0); + MVT VecIntVT = V.getSimpleValueType(); + assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && + "Unsupported custom type"); + + if (Subtarget.hasAVX512()) { + // With AVX512, but not VLX we need to widen to get a 512-bit result type. + assert(!Subtarget.hasVLX() && "Unexpected features"); + MVT VT = Op->getSimpleValueType(0); + + // v8i32->v8f64 is legal with AVX512 so just return it. + if (VT == MVT::v8f64) + return Op; + + assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) && + "Unexpected VT!"); + MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; + MVT WideIntVT = VT == MVT::v4f64 ? 
MVT::v8i32 : MVT::v16i32; + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + SDValue Tmp = + IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT); + V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V, + DAG.getIntPtrConstant(0, DL)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other}, + {Op->getOperand(0), V}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; + } + + if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 && + Op->getSimpleValueType(0) == MVT::v4f64) { + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V); + Constant *Bias = ConstantFP::get( + *DAG.getContext(), + APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL))); + auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8); + SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; + SDValue VBias = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*Alignment*/ 8, MachineMemOperand::MOLoad); + + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, + DAG.getBitcast(MVT::v4i64, VBias)); + Or = DAG.getBitcast(MVT::v4f64, Or); + + if (IsStrict) + return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other}, + {Op.getOperand(0), Or, VBias}); + return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias); + } + // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); @@ -18690,18 +19193,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; - // We shouldn't use it when unsafe-fp-math is enabled though: we might later - // reassociate the two FADDs, and if we do that, the algorithm fails - // spectacularly (PR24512). - // FIXME: If we ever have some kind of Machine FMF, this should be marked - // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because - // there's also the MachineCombiner reassociations happening on Machine IR. - if (DAG.getTarget().Options.UnsafeFPMath) - return SDValue(); - - SDLoc DL(Op); - SDValue V = Op->getOperand(0); - MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, @@ -18709,9 +19200,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); - assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && - "Unsupported custom type"); - // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 @@ -18756,23 +19244,35 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } - // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). - SDValue VecCstFAdd = DAG.getConstantFP( - APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); + // Create the vector constant for (0x1.0p39f + 0x1.0p23f). 
+ SDValue VecCstFSub = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + // NOTE: By using fsub of a positive constant instead of fadd of a negative + // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is + // enabled. See PR24512. SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? - SDValue FHigh = - DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); - // return (float4) lo + fhi; + // (float4) lo; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); + // return (float4) lo + fhi; + if (IsStrict) { + SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other}, + {Op.getOperand(0), HighBitcast, VecCstFSub}); + return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other}, + {FHigh.getValue(1), LowBitcast, FHigh}); + } + + SDValue FHigh = + DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); @@ -18783,18 +19283,23 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: - assert(!Subtarget.hasAVX512()); return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); + case MVT::v2i64: + case MVT::v4i64: + return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue N0 = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - MVT SrcVT = N0.getSimpleValueType(); - MVT DstVT = Op.getSimpleValueType(); + MVT SrcVT = Src.getSimpleValueType(); + MVT DstVT = Op->getSimpleValueType(0); + SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); @@ -18814,8 +19319,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Promote i32 to i64 and use a signed conversion on 64-bit targets. 
   if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
-    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0);
-    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0);
+    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
+    if (IsStrict)
+      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
+                         {Chain, Src});
+    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
   }
 
   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
@@ -18823,7 +19331,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
 
   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
-  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+  if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
   if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
     return SDValue();
@@ -18832,23 +19340,28 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   if (SrcVT == MVT::i32) {
     SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
-    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
-                                  StackSlot, MachinePointerInfo());
+    SDValue Store1 =
+        DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                   OffsetSlot, MachinePointerInfo());
-    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
-    return Fild;
+    std::pair<SDValue, SDValue> Tmp =
+        BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+    if (IsStrict)
+      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+    return Tmp.first;
   }
 
   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
-  SDValue ValueToStore = Op.getOperand(0);
-  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
+  SDValue ValueToStore = Src;
+  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
     // Bitcasting to f64 here allows us to do a single 64-bit store from
     // an SSE register, avoiding the store forwarding penalty that would come
     // with two 32-bit stores.
     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
-                               MachinePointerInfo());
+  }
+  SDValue Store =
+      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
   // For i64 source, we need to add the appropriate power of 2 if the input
   // was negative. This is the same as the optimization in
   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
@@ -18863,32 +19376,42 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   SDValue Ops[] = { Store, StackSlot };
   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                          MVT::i64, MMO);
+  Chain = Fild.getValue(1);
 
-  APInt FF(32, 0x5F800000ULL);
   // Check whether the sign bit is set.
   SDValue SignSet = DAG.getSetCC(
       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
-      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+      Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
 
-  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+  // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
+  APInt FF(64, 0x5F80000000000000ULL);
   SDValue FudgePtr = DAG.getConstantPool(
-      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
+      ConstantInt::get(*DAG.getContext(), FF), PtrVT);
 
   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
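// (0x5F800000 is 2^64 as an IEEE single. FILD converts the u64 bit pattern
// as a signed value, so an input with the sign bit set comes out as X - 2^64
// and needs 2^64 added back. In little-endian memory the pool entry holds
// 0.0f at offset 0 and 2^64 at offset 4, so the select below picks the
// addend branchlessly: e.g. the hypothetical input X = 2^63 gives
// FILD = -2^63, and -2^63 + 2^64 = 2^63.)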
   SDValue Zero = DAG.getIntPtrConstant(0, dl);
   SDValue Four = DAG.getIntPtrConstant(4, dl);
-  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
+  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
 
   // Load the value out, extending it from f32 to f80.
-  // FIXME: Avoid the extend by constructing the right constant pool?
   SDValue Fudge = DAG.getExtLoad(
-      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
+      ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
       /* Alignment = */ 4);
+  Chain = Fudge.getValue(1);
   // Extend everything to 80 bits to force it to be done on x87.
   // TODO: Are there any fast-math-flags to propagate here?
+  if (IsStrict) {
+    SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
+                              {Chain, Fild, Fudge});
+    // STRICT_FP_ROUND can't handle equal types.
+    if (DstVT == MVT::f80)
+      return Add;
+    return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
+                       {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
+  }
   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                      DAG.getIntPtrConstant(0, dl));
@@ -18902,11 +19425,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
 // result.
 SDValue
 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
-                                   bool IsSigned) const {
+                                   bool IsSigned, SDValue &Chain) const {
+  bool IsStrict = Op->isStrictFPOpcode();
   SDLoc DL(Op);
 
   EVT DstTy = Op.getValueType();
-  EVT TheVT = Op.getOperand(0).getValueType();
+  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
+  EVT TheVT = Value.getValueType();
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
@@ -18920,6 +19445,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
   bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
 
+  // FIXME: This does not generate an invalid exception if the input does not
+  // fit in i32. PR44019
   if (!IsSigned && DstTy != MVT::i64) {
     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
     // The low 32 bits of the fist result will have the correct uint32 result.
@@ -18938,8 +19465,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
 
-  SDValue Chain = DAG.getEntryNode();
-  SDValue Value = Op.getOperand(0);
+  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
 
   if (UnsignedFixup) {
@@ -18949,8 +19476,9 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     // of a signed i64. Let Thresh be the FP equivalent of
     // 0x8000000000000000ULL.
     //
-    //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
-    //  FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+    //  Adjust = (Value < Thresh) ? 0 : 0x80000000;
+    //  FltOfs = (Value < Thresh) ? 0.0 : Thresh (2^63 as FP);
+    //  FistSrc = (Value - FltOfs);
     //  Fist-to-mem64 FistSrc
     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
     //  to XOR'ing the high 32 bits with Adjust.
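A scalar model of the UnsignedFixup path above (plain C++, hypothetical helper name; the real lowering keeps the subtraction on the x87 side and patches the sign bit with the Adjust XOR after the FIST):

    #include <cstdint>

    // Values below 2^63 convert directly; larger ones are shifted into
    // signed range by subtracting the threshold, and the sign bit is
    // patched back in afterwards.
    static uint64_t f64ToU64(double V) {
      const double Thresh = 0x1p63;
      uint64_t Adjust = V < Thresh ? 0 : 0x8000000000000000ULL;
      double FltOfs = V < Thresh ? 0.0 : Thresh;
      return (uint64_t)(int64_t)(V - FltOfs) ^ Adjust;
    }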
@@ -18975,19 +19503,31 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); - SDValue Cmp = DAG.getSetCC(DL, - getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), TheVT), - Value, ThreshVal, ISD::SETLT); + EVT ResVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT); + SDValue Cmp; + if (IsStrict) { + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, + Chain, /*IsSignaling*/ true); + Chain = Cmp.getValue(1); + } else { + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT); + } + Adjust = DAG.getSelect(DL, MVT::i64, Cmp, DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(APInt::getSignMask(64), DL, MVT::i64)); - SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); - Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), TheVT), - Value, ThreshVal, ISD::SETLT); - Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, + DAG.getConstantFP(0.0, DL, TheVT), + ThreshVal); + + if (IsStrict) { + Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other}, + { Chain, Value, FltOfs }); + Chain = Value.getValue(1); + } else + Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs); } MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); @@ -19017,6 +19557,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, Ops, DstTy, MMO); SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); + Chain = Res.getValue(1); // If we need an unsigned fixup, XOR the result with adjust. if (UnsignedFixup) @@ -19036,7 +19577,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) && "Unexpected extension opcode"); - assert(VT.getVectorNumElements() == VT.getVectorNumElements() && + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || @@ -19512,48 +20053,137 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { - bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; - MVT VT = Op.getSimpleValueType(); - SDValue Src = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT; + MVT VT = Op->getSimpleValueType(0); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); - if (SrcVT == MVT::f128) { - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(SrcVT, VT); - else - LC = RTLIB::getFPTOUINT(SrcVT, VT); - - MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first; - } - if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + unsigned Opc; + if (IsStrict) + Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else + Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + if (!IsSigned && !Subtarget.hasVLX()) { + assert(Subtarget.useAVX512Regs() && "Unexpected features!"); // Widen to 512-bits. 
ResVT = MVT::v8i32; TruncVT = MVT::v8i1; - Opc = ISD::FP_TO_UINT; - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, - DAG.getUNDEF(MVT::v8f64), - Src, DAG.getIntPtrConstant(0, dl)); + Opc = Op.getOpcode(); + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + // TODO: Should we just do this for non-strict as well? + SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64) + : DAG.getUNDEF(MVT::v8f64); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src, + DAG.getIntPtrConstant(0, dl)); + } + SDValue Res, Chain; + if (IsStrict) { + Res = + DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Opc, dl, ResVT, Src); } - SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); + Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, - DAG.getIntPtrConstant(0, dl)); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; + } + + // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32. + if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) { + assert(!IsSigned && "Expected unsigned conversion!"); + assert(Subtarget.useAVX512Regs() && "Requires avx512f"); + return Op; + } + + // Widen vXi32 fp_to_uint with avx512f to 512-bit source. + if ((VT == MVT::v4i32 || VT == MVT::v8i32) && + (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) { + assert(!IsSigned && "Expected unsigned conversion!"); + assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() && + "Unexpected features!"); + MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; + MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32; + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + // TODO: Should we just do this for non-strict as well? + SDValue Tmp = + IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, + DAG.getIntPtrConstant(0, dl)); + + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other}, + {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; + } + + // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source. + if ((VT == MVT::v2i64 || VT == MVT::v4i64) && + (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) { + assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() && + !Subtarget.hasVLX() && "Unexpected features!"); + MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + // TODO: Should we just do this for non-strict as well? + SDValue Tmp = + IsStrict ? 
DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, + DAG.getIntPtrConstant(0, dl)); + + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, + {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; } - assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { - return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32))); + assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL"); + SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); + if (IsStrict) { + unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI + : X86ISD::STRICT_CVTTP2UI; + return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp}); + } + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + return DAG.getNode(Opc, dl, VT, Tmp); } return SDValue(); @@ -19575,9 +20205,21 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { assert(VT == MVT::i32 && "Unexpected VT!"); // Promote i32 to i64 and use a signed operation on 64-bit targets. + // FIXME: This does not generate an invalid exception if the input does not + // fit in i32. PR44019 if (Subtarget.is64Bit()) { - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other}, + { Op.getOperand(0), Src }); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); + + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + if (IsStrict) + return DAG.getMergeValues({ Res, Chain }, dl); + return Res; } // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can @@ -19586,28 +20228,65 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } - // Promote i16 to i32 if we can use a SSE operation. - if (VT == MVT::i16 && UseSSEReg) { + // Promote i16 to i32 if we can use a SSE operation or the type is f128. + // FIXME: This does not generate an invalid exception if the input does not + // fit in i16. PR44019 + if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) { assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other}, + { Op.getOperand(0), Src }); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); + + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + if (IsStrict) + return DAG.getMergeValues({ Res, Chain }, dl); + return Res; } - // If this is a SINT_TO_FP using SSEReg we're done. + // If this is a FP_TO_SINT using SSEReg we're done. if (UseSSEReg && IsSigned) return Op; + // fp128 needs to use a libcall. 
+ if (SrcVT == MVT::f128) { + RTLIB::Libcall LC; + if (IsSigned) + LC = RTLIB::getFPTOSINT(SrcVT, VT); + else + LC = RTLIB::getFPTOUINT(SrcVT, VT); + + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + MakeLibCallOptions CallOptions; + std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions, + SDLoc(Op), Chain); + + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; + } + // Fall back to X87. - if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned)) + SDValue Chain; + if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) { + if (IsStrict) + return DAG.getMergeValues({V, Chain}, dl); return V; + } llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); if (VT == MVT::f128) { @@ -19617,14 +20296,19 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); - return DAG.getNode(X86ISD::VFPEXT, DL, VT, - DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, - In, DAG.getUNDEF(SVT))); + SDValue Res = + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT)); + if (IsStrict) + return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, + {Op->getOperand(0), Res}); + return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); } SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + MVT VT = Op.getSimpleValueType(); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); // It's legal except when f128 is involved @@ -19636,17 +20320,17 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. + + SDLoc dl(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first; -} + std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions, + dl, Chain); -// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking -// the default expansion of STRICT_FP_ROUND. -static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) { - // FIXME: Need to form a libcall with an input chain for f128. - assert(Op.getOperand(0).getValueType() != MVT::f128 && - "Don't know how to handle f128 yet!"); - return Op; + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -19724,12 +20408,6 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() == MVT::f128) { - RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? 
RTLIB::ADD_F128 - : RTLIB::SUB_F128; - return LowerF128Call(Op, DAG, LC); - } - assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); @@ -20013,6 +20691,19 @@ static bool hasNonFlagsUse(SDValue Op) { return false; } +// Transform to an x86-specific ALU node with flags if there is a chance of +// using an RMW op or only the flags are used. Otherwise, leave +// the node alone and emit a 'cmp' or 'test' instruction. +static bool isProfitableToUseFlagOp(SDValue Op) { + for (SDNode *U : Op->uses()) + if (U->getOpcode() != ISD::CopyToReg && + U->getOpcode() != ISD::SETCC && + U->getOpcode() != ISD::STORE) + return false; + + return true; +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, @@ -20076,15 +20767,8 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::SUB: case ISD::OR: case ISD::XOR: - // Transform to an x86-specific ALU node with flags if there is a chance of - // using an RMW op or only the flags are used. Otherwise, leave - // the node alone and emit a 'test' instruction. - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && - UI->getOpcode() != ISD::SETCC && - UI->getOpcode() != ISD::STORE) - goto default_case; + if (!isProfitableToUseFlagOp(Op)) + break; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { @@ -20112,7 +20796,6 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, Op->getOperand(1)).getValue(1); } default: - default_case: break; } @@ -20131,15 +20814,26 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. -SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, - const SDLoc &dl, SelectionDAG &DAG) const { +static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1, + unsigned X86CC, const SDLoc &dl, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue Chain, bool IsSignaling) { if (isNullConstant(Op1)) - return EmitTest(Op0, X86CC, dl, DAG, Subtarget); + return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain); EVT CmpVT = Op0.getValueType(); - if (CmpVT.isFloatingPoint()) - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + if (CmpVT.isFloatingPoint()) { + if (Chain) { + SDValue Res = + DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, + dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); + return std::make_pair(Res, Res.getValue(1)); + } + return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1), + SDValue()); + } assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); @@ -20154,7 +20848,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = - isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { // For equality comparisons try to use SIGN_EXTEND if the input was // truncate from something with enough sign bits. 
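The widening choice at the end of the previous hunk leans on a small identity: sign-extension preserves equality, and when the operand was truncated from a value with enough sign bits, the extension can reuse the original value and fold the truncate away. A minimal scalar sketch of that reasoning (illustrative only, not part of the patch):

#include <cstdint>

// For any pair of i8 values, equality is unchanged by sign-extension:
//   (a == b)  <==>  ((int32_t)a == (int32_t)b)
// And if a came from truncating an i32 x with at least 25 sign bits, then
// (int32_t)(int8_t)x == x, so the widened compare can use x directly.
bool eqViaSext(int8_t a, int8_t b) {
  return (int32_t)a == (int32_t)b; // identical result to (a == b)
}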
@@ -20178,10 +20872,22 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } } + + // Try to shrink i64 compares if the input has enough zero bits. + // FIXME: Do this for non-constant compares for constant on LHS? + if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) && + Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. + cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 && + DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) { + CmpVT = MVT::i32; + Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); + Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); + } + // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return Sub.getValue(1); + return std::make_pair(Sub.getValue(1), SDValue()); } /// Convert a comparison if required by the subtarget. @@ -20189,16 +20895,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. - if (Subtarget.hasCMov() || - Cmp.getOpcode() != X86ISD::CMP || - !Cmp.getOperand(0).getValueType().isFloatingPoint() || - !Cmp.getOperand(1).getValueType().isFloatingPoint()) + bool IsCmp = Cmp.getOpcode() == X86ISD::CMP; + bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP || + Cmp.getOpcode() == X86ISD::STRICT_FCMPS; + + if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) || + !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() || + !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: - // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) + // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); @@ -20399,7 +21108,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; @@ -20442,7 +21151,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. 
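/// The result is the CMPPS/CMPPD immediate: 0=EQ, 1=LT, 2=LE, 3=UNORD,
/// 4=NEQ, 5=NLT, 6=NLE, 7=ORD are the legacy SSE encodings; 8 and above
/// (used for SETUEQ/SETONE) exist only as AVX 5-bit predicates. Per the
/// Intel predicate encoding scheme, flipping bit 4 of a predicate toggles
/// its quiet/signaling behavior, which is what the strict lowering below
/// relies on when it XORs IsAlwaysSignaling with IsSignaling into bit 4.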
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, - SDValue &Op1) { + SDValue &Op1, bool &IsAlwaysSignaling) { unsigned SSECC; bool Swap = false; @@ -20481,6 +21190,22 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, if (Swap) std::swap(Op0, Op1); + switch (SetCCOpcode) { + default: + IsAlwaysSignaling = true; + break; + case ISD::SETEQ: + case ISD::SETOEQ: + case ISD::SETUEQ: + case ISD::SETNE: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETO: + case ISD::SETUO: + IsAlwaysSignaling = false; + break; + } + return SSECC; } @@ -20625,12 +21350,14 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - SDValue CC = Op.getOperand(2); - MVT VT = Op.getSimpleValueType(); + bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || + Op.getOpcode() == ISD::STRICT_FSETCCS; + SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); + SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1); + SDValue CC = Op.getOperand(IsStrict ? 3 : 2); + MVT VT = Op->getSimpleValueType(0); ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get(); - bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); + bool isFP = Op1.getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { @@ -20639,57 +21366,119 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + unsigned Opc; if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); - Opc = X86ISD::CMPM; + Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { - Opc = X86ISD::CMPP; + Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP; // The SSE/AVX packed FP comparison nodes are defined with a // floating-point vector result that matches the operand type. This allows // them to work with an SSE1 target (integer vector types are not legal). VT = Op0.getSimpleValueType(); } - // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), - // emit two comparisons and a logic op to tie them together. SDValue Cmp; - unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); - if (SSECC >= 8 && !Subtarget.hasAVX()) { - // LLVM predicate is SETUEQ or SETONE. - unsigned CC0, CC1; - unsigned CombineOpc; - if (Cond == ISD::SETUEQ) { - CC0 = 3; // UNORD - CC1 = 0; // EQ - CombineOpc = X86ISD::FOR; + bool IsAlwaysSignaling; + unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling); + if (!Subtarget.hasAVX()) { + // TODO: We could use following steps to handle a quiet compare with + // signaling encodings. + // 1. Get ordered masks from a quiet ISD::SETO + // 2. Use the masks to mask potential unordered elements in operand A, B + // 3. Get the compare results of masked A, B + // 4. Calculating final result using the mask and result from 3 + // But currently, we just fall back to scalar operations. + if (IsStrict && IsAlwaysSignaling && !IsSignaling) + return SDValue(); + + // Insert an extra signaling instruction to raise exception. 
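      // Any signaling predicate would do here: the compare emitted below
      // exists only for its signaling side effect. Its comparison result is
      // discarded and only its output chain is kept.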
+ if (IsStrict && !IsAlwaysSignaling && IsSignaling) { + SDValue SignalCmp = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS + // FIXME: It seems we need to update the flags of all new strict nodes. + // Otherwise, mayRaiseFPException in MI will return false due to + // NoFPExcept = false by default. However, I didn't find it in other + // patches. + SignalCmp->setFlags(Op->getFlags()); + Chain = SignalCmp.getValue(1); + } + + // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), + // emit two comparisons and a logic op to tie them together. + if (SSECC >= 8) { + // LLVM predicate is SETUEQ or SETONE. + unsigned CC0, CC1; + unsigned CombineOpc; + if (Cond == ISD::SETUEQ) { + CC0 = 3; // UNORD + CC1 = 0; // EQ + CombineOpc = X86ISD::FOR; + } else { + assert(Cond == ISD::SETONE); + CC0 = 7; // ORD + CC1 = 4; // NEQ + CombineOpc = X86ISD::FAND; + } + + SDValue Cmp0, Cmp1; + if (IsStrict) { + Cmp0 = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)}); + Cmp1 = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)}); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1), + Cmp1.getValue(1)); + } else { + Cmp0 = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)); + Cmp1 = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)); + } + Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { - assert(Cond == ISD::SETONE); - CC0 = 7; // ORD - CC1 = 4; // NEQ - CombineOpc = X86ISD::FAND; + if (IsStrict) { + Cmp = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); + Chain = Cmp.getValue(1); + } else + Cmp = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } - - SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getTargetConstant(CC0, dl, MVT::i8)); - SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getTargetConstant(CC1, dl, MVT::i8)); - Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. - Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getTargetConstant(SSECC, dl, MVT::i8)); + if (IsStrict) { + // Make a flip on already signaling CCs before setting bit 4 of AVX CC. + SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4; + Cmp = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); + Chain = Cmp.getValue(1); + } else + Cmp = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the // result type of SETCC. The bitcast is expected to be optimized away // during combining/isel. - if (Opc == X86ISD::CMPP) - Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + + if (IsStrict) + return DAG.getMergeValues({Cmp, Chain}, dl); return Cmp; } + assert(!IsStrict && "Strict SETCC only handles FP operands."); + MVT VTOp0 = Op0.getSimpleValueType(); (void)VTOp0; assert(VTOp0 == Op1.getSimpleValueType() && @@ -20860,6 +21649,30 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { assert(Subtarget.hasSSE2() && "Don't know how to lower!"); + // Special case for sign bit test. 
We can use a v4i32 PCMPGT and shuffle + // the odd elements over the even elements. + if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) { + Op0 = DAG.getConstant(0, dl, MVT::v4i32); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + static const int MaskHi[] = { 1, 1, 3, 3 }; + SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + return DAG.getBitcast(VT, Result); + } + + if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) { + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getConstant(-1, dl, MVT::v4i32); + + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + static const int MaskHi[] = { 1, 1, 3, 3 }; + SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + return DAG.getBitcast(VT, Result); + } + // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. @@ -20999,8 +21812,9 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, /// corresponding X86 condition code constant in X86CC. SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, - SelectionDAG &DAG, - SDValue &X86CC) const { + SelectionDAG &DAG, SDValue &X86CC, + SDValue &Chain, + bool IsSignaling) const { // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). @@ -21043,12 +21857,32 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, } } + // Try to use the carry flag from the add in place of an separate CMP for: + // (seteq (add X, -1), -1). Similar for setne. + if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD && + Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + if (isProfitableToUseFlagOp(Op0)) { + SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); + + SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0), + Op0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New); + X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8); + return SDValue(New.getNode(), 1); + } + } + bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (CondCode == X86::COND_INVALID) return SDValue(); - SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG); + std::pair<SDValue, SDValue> Tmp = + EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling); + SDValue EFLAGS = Tmp.first; + if (Chain) + Chain = Tmp.second; EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; @@ -21056,35 +21890,48 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); + bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || + Op.getOpcode() == ISD::STRICT_FSETCCS; + MVT VT = Op->getSimpleValueType(0); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); + SDValue Op1 = Op.getOperand(IsStrict ? 
2 : 1); SDLoc dl(Op); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + ISD::CondCode CC = + cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get(); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets handled by emitFlagsForSetcc. if (Op0.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1); + softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain, + Op.getOpcode() == ISD::STRICT_FSETCCS); // If softenSetCCOperands returned a scalar, use it. if (!Op1.getNode()) { assert(Op0.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); + if (IsStrict) + return DAG.getMergeValues({Op0, Chain}, dl); return Op0; } } SDValue X86CC; - SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); + SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain, + Op.getOpcode() == ISD::STRICT_FSETCCS); if (!EFLAGS) return SDValue(); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { @@ -21215,8 +22062,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); - unsigned SSECC = translateX86FSETCC( - cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); + bool IsAlwaysSignaling; + unsigned SSECC = + translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(), + CondOp0, CondOp1, IsAlwaysSignaling); if (Subtarget.hasAVX512()) { SDValue Cmp = @@ -21454,8 +22303,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (AddTest) { CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); - Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), - X86::COND_NE, DL, DAG); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); } // a < b ? -1 : 0 -> RES = ~setcc_carry @@ -21711,7 +22559,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); - assert(VT.getVectorNumElements() == VT.getVectorNumElements() && + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || @@ -21765,12 +22613,14 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { "Expecting 256/512-bit op"); // Splitting volatile memory ops is not allowed unless the operation was not - // legal to begin with. We are assuming the input op is legal (this transform - // is only used for targets with AVX). + // legal to begin with. Assume the input store is legal (this transform is + // only used for targets with AVX). Note: It is possible that we have an + // illegal type like v2i128, and so we could allow splitting a volatile store + // in that case if that is important. 
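  // (Switching StoreVT from MVT to EVT below is what makes the v2i128 case
  // even expressible: an illegal type has no simple MVT, and calling
  // getSimpleValueType() on it would assert.)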
if (!Store->isSimple()) return SDValue(); - MVT StoreVT = StoredVal.getSimpleValueType(); + EVT StoreVT = StoredVal.getValueType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; unsigned HalfAlign = (128 == HalfSize ? 16 : 32); @@ -22174,8 +23024,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); - Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), - X86Cond, dl, DAG); + Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -22201,7 +23050,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(2); + MaybeAlign Alignment(Op.getConstantOperandVal(2)); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack @@ -22221,11 +23070,12 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - unsigned StackAlign = TFI.getStackAlignment(); + const Align StackAlign(TFI.getStackAlignment()); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value - if (Align > StackAlign) - Result = DAG.getNode(ISD::AND, dl, VT, Result, - DAG.getConstant(-(uint64_t)Align, dl, VT)); + if (Alignment && Alignment > StackAlign) + Result = + DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -22256,9 +23106,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); - if (Align) { + if (Alignment) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } @@ -22777,6 +23627,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntNo = Op.getConstantOperandVal(0); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: { @@ -22794,7 +23645,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1)); } case INTR_TYPE_1OP_SAE: { SDValue Sae = Op.getOperand(2); @@ -22866,7 +23718,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), - Src1, Src2, Src3); + {Src1, Src2, Src3}); } case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), @@ -22890,8 +23742,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return 
getVectorMaskingNode( + DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, + Subtarget, DAG); } case INTR_TYPE_1OP_MASK_SAE: { SDValue Src = Op.getOperand(1); @@ -22907,8 +23760,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else return SDValue(); - return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru, + Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); @@ -23114,8 +23967,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(); } //default rounding mode - return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2), CC); + return DAG.getNode(IntrData->Opc0, dl, MaskVT, + {Op.getOperand(1), Op.getOperand(2), CC}); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); @@ -23315,8 +24168,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, - Mask); + return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), + {Src, PassThru, Mask}); } case CVTPS2PH_MASK: { SDValue Src = Op.getOperand(1); @@ -23622,9 +24475,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue ShAmt = Op.getOperand(2); // If the argument is a constant, convert it to a target constant. if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) { - ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32); + // Clamp out of bounds shift amounts since they will otherwise be masked + // to 8-bits which may make it no longer out of bounds. + unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), - Op.getOperand(0), Op.getOperand(1), ShAmt); + Op.getOperand(0), Op.getOperand(1), + DAG.getTargetConstant(ShiftAmount, DL, MVT::i32)); } unsigned NewIntrinsic; @@ -23977,7 +24833,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during FinalizeISel in EmitInstrWithCustomInserter. - return SDValue(); + return Op; } case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: @@ -24152,9 +25008,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + SDValue Offset = DAG.getUNDEF(VMask.getValueType()); - return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, - MemIntr->getMemOperand(), true /* truncating */); + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask, + MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED, + true /* truncating */); } case X86ISD::VTRUNCUS: case X86ISD::VTRUNCS: { @@ -24249,7 +25107,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
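// (LLT below is GlobalISel's low-level type: just a scalar/vector/pointer
// shape with sizes. A named-register lookup only needs that much, which is
// presumably why the EVT parameter becomes an LLT here.)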
-Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, +Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); @@ -24538,12 +25396,13 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - unsigned StackAlignment = TFI.getStackAlignment(); + const Align StackAlignment(TFI.getStackAlignment()); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot - int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false); + int SSFI = + MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); @@ -27464,12 +28323,11 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) return Op; - SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(), - N->getBasePtr(), Mask, - getZeroVector(VT, Subtarget, DAG, dl), - N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType(), - N->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), + N->isExpandingLoad()); // Emit a blend. SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, PassThru); @@ -27503,11 +28361,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); - SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), - N->getBasePtr(), Mask, PassThru, - N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType(), - N->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + N->getExtensionType(), N->isExpandingLoad()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), @@ -27553,7 +28410,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), - Mask, N->getMemoryVT(), N->getMemOperand(), + N->getOffset(), Mask, N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore()); } @@ -27607,29 +28465,31 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl); } -SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, - SelectionDAG &DAG) const { - // TODO: Eventually, the lowering of these nodes should be informed by or - // deferred to the GC strategy for the function in which they appear. For - // now, however, they must be lowered to something. Since they are logically - // no-ops in the case of a null GC strategy (or a GC strategy which does not - // require special handling for these nodes), lower them as literal NOOPs for - // the time being. 
- SmallVector<SDValue, 2> Ops; +static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + MVT DstVT = Op.getSimpleValueType(); - Ops.push_back(Op.getOperand(0)); - if (Op->getGluedNode()) - Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode()); + unsigned SrcAS = N->getSrcAddressSpace(); - SDLoc OpDL(Op); - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + assert(SrcAS != N->getDestAddressSpace() && + "addrspacecast must be between different address spaces"); - return NOOP; + if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) { + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); + } else if (DstVT == MVT::i64) { + Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); + } else if (DstVT == MVT::i32) { + Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); + } else { + report_fatal_error("Bad address space in addrspacecast"); + } + return Op; } -SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op, + SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically @@ -27651,9 +28511,21 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + + bool IsStrict = Op->isStrictFPOpcode(); + unsigned Offset = IsStrict ? 1 : 0; + SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end()); + + SDLoc dl(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; + std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops, + CallOptions, dl, Chain); + + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; } /// Provide custom lowering hooks for some operations. 
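For context on the new ADDRSPACECAST lowering above: the X86AS values are the mixed-pointer address spaces (270 = ptr32_sptr, sign-extended; 271 = ptr32_uptr, zero-extended; 272 = ptr64), which is why only a PTR32_UPTR source zero-extends and every other 32-to-64 widening sign-extends. A minimal scalar model of that rule (illustrative only, not the patch's API):

#include <cstdint>

enum class X86PtrAS { Ptr32Signed = 270, Ptr32Unsigned = 271, Ptr64 = 272 };

uint64_t widenPtrTo64(uint32_t P, X86PtrAS SrcAS) {
  if (SrcAS == X86PtrAS::Ptr32Unsigned)
    return (uint64_t)P;                 // ptr32_uptr: zero-extend
  return (uint64_t)(int64_t)(int32_t)P; // ptr32_sptr: sign-extend
}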
@@ -27673,7 +28545,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); - case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); + case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); @@ -27690,7 +28562,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::STRICT_UINT_TO_FP: case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); @@ -27700,21 +28574,24 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); - case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); - case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG); + case ISD::STRICT_FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: case ISD::FSUB: return lowerFaddFsub(Op, DAG); - case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); - case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCC: + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); @@ -27778,8 +28655,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: - return LowerGC_TRANSITION_START(Op, DAG); - case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); + case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); + case ISD::ADDRSPACECAST: + return LowerADDRSPACECAST(Op, DAG); } } @@ -27865,8 +28743,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case X86ISD::VPMADDWD: case X86ISD::AVG: { - // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and - // X86ISD::AVG/VPMADDWD by widening. + // Legalize types for X86ISD::AVG/VPMADDWD by widening. 
assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); @@ -28114,10 +28991,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: { - bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + case ISD::STRICT_FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_UINT: { + bool IsStrict = N->isStrictFPOpcode(); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT || + N->getOpcode() == ISD::STRICT_FP_TO_SINT; EVT VT = N->getValueType(0); - SDValue Src = N->getOperand(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (VT.isVector() && VT.getScalarSizeInBits() < 32) { @@ -28128,13 +29009,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), VT.getVectorNumElements()); - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); + SDValue Res; + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other}, + {N->getOperand(0), Src}); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); // Preserve what we know about the size of the original result. Except // when the result is v2i32 since we can't widen the assert. if (PromoteVT != MVT::v2i32) - Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext - : ISD::AssertSext, + Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl, PromoteVT, Res, DAG.getValueType(VT.getVectorElementType())); @@ -28149,6 +29036,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); return; } @@ -28160,16 +29049,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { + unsigned Opc; + if (IsStrict) + Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else + Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + + // If we have VLX we can emit a target specific FP_TO_UINT node,. if (!IsSigned && !Subtarget.hasVLX()) { - // If we have VLX we can emit a target specific FP_TO_UINT node, - // otherwise we can defer to the generic legalizer which will widen + // Otherwise we can defer to the generic legalizer which will widen // the input as well. This will be further widened during op // legalization to v8i32<-v8f64. - return; + // For strict nodes we'll need to widen ourselves. + // FIXME: Fix the type legalizer to safely widen strict nodes? + if (!IsStrict) + return; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src, + DAG.getConstantFP(0.0, dl, MVT::v2f64)); + Opc = N->getOpcode(); + } + SDValue Res; + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other}, + {N->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); } - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; - SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); + return; + } + + // Custom widen strict v2f32->v2i32 by padding with zeros. + // FIXME: Should generic type legalizer do this? 
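      // Zero is the safe pad value here (not undef): the extra lanes really
      // do go through the conversion, and converting 0.0 is exact, so a
      // strict region sees no spurious FP exceptions from lanes that don't
      // exist in the original v2f32.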
+ if (Src.getValueType() == MVT::v2f32 && IsStrict) { + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getConstantFP(0.0, dl, MVT::v2f32)); + SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other}, + {N->getOperand(0), Src}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); return; } @@ -28183,64 +29105,168 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); - unsigned NumElts = Subtarget.hasVLX() ? 4 : 8; - // Using a 256-bit input here to guarantee 128-bit input for f32 case. - // TODO: Use 128-bit vectors for f64 case? - // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI. + unsigned NumElts = Subtarget.hasVLX() ? 2 : 8; + // If we use a 128-bit result we might need to use a target specific node. + unsigned SrcElts = + std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits()); MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts); - MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts); + MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts); + unsigned Opc = N->getOpcode(); + if (NumElts != SrcElts) { + if (IsStrict) + Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else + Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + } SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx); - Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res); + SDValue Chain; + if (IsStrict) { + SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); + Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx); Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); return; } - if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned)) + SDValue Chain; + if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) { Results.push_back(V); + if (IsStrict) + Results.push_back(Chain); + } return; } - case ISD::SINT_TO_FP: { - assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); - SDValue Src = N->getOperand(0); - if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) - return; - Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); - return; - } - case ISD::UINT_TO_FP: { - assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); + case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: { + bool IsStrict = N->isStrictFPOpcode(); + bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP || + N->getOpcode() == ISD::STRICT_SINT_TO_FP; EVT VT = N->getValueType(0); if (VT != MVT::v2f32) return; - SDValue Src = N->getOperand(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { - Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); + if (IsStrict) { + unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P + : X86ISD::STRICT_CVTUI2P; + SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), Src}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } else { + unsigned Opc = IsSigned ? 
X86ISD::CVTSI2P : X86ISD::CVTUI2P; + Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src)); + } return; } + if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() && + Subtarget.hasSSE41() && !Subtarget.hasAVX512()) { + SDValue Zero = DAG.getConstant(0, dl, SrcVT); + SDValue One = DAG.getConstant(1, dl, SrcVT); + SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT, + DAG.getNode(ISD::SRL, dl, SrcVT, Src, One), + DAG.getNode(ISD::AND, dl, SrcVT, Src, One)); + SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT); + SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src); + SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); + for (int i = 0; i != 2; ++i) { + SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, + SignSrc, DAG.getIntPtrConstant(i, dl)); + if (IsStrict) + SignCvts[i] = + DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, + {N->getOperand(0), Src}); + else + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src); + }; + SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts); + SDValue Slow, Chain; + if (IsStrict) { + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SignCvts[0].getValue(1), SignCvts[1].getValue(1)); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other}, + {Chain, SignCvt, SignCvt}); + Chain = Slow.getValue(1); + } else { + Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt); + } + IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg); + IsNeg = + DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1}); + SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt); + Results.push_back(Cvt); + if (IsStrict) + Results.push_back(Chain); + return; + } + if (SrcVT != MVT::v2i32) return; + + if (IsSigned || Subtarget.hasAVX512()) { + if (!IsStrict) + return; + + // Custom widen strict v2i32->v2f32 to avoid scalarization. + // FIXME: Should generic type legalizer do this? + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getConstant(0, dl, MVT::v2i32)); + SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), Src}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + return; + } + + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); - // TODO: Are there any fast-math-flags to propagate here? - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); - Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + if (IsStrict) { + SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {N->getOperand(0), Or, VBias}); + SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, + {MVT::v4f32, MVT::Other}, + {Sub.getValue(1), Sub}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } else { + // TODO: Are there any fast-math-flags to propagate here? 
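      // The VBias trick in scalar form: 0x4330000000000000 is the double
      // 2^52, whose ulp is exactly 1.0, so OR-ing a 32-bit value x into the
      // low mantissa bits yields the exact double 2^52 + x; subtracting
      // 2^52 then recovers (double)x without any int-to-fp instruction.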
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + } return; } + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: { - if (!isTypeLegal(N->getOperand(0).getValueType())) - return; - SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); + if (!isTypeLegal(Src.getValueType())) + return; + SDValue V; + if (IsStrict) + V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), N->getOperand(1)}); + else + V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); + if (IsStrict) + Results.push_back(V.getValue(1)); return; } case ISD::FP_EXTEND: { @@ -28543,6 +29569,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res.getValue(1)); return; } + case ISD::ADDRSPACECAST: { + SDValue Src = N->getOperand(0); + EVT DstVT = N->getValueType(0); + AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); + unsigned SrcAS = CastN->getSrcAddressSpace(); + + assert(SrcAS != CastN->getDestAddressSpace() && + "addrspacecast must be between different address spaces"); + + SDValue Res; + if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) + Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); + else if (DstVT == MVT::i64) + Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); + else if (DstVT == MVT::i32) + Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); + else + report_fatal_error("Unrecognized addrspacecast type legalization"); + + Results.push_back(Res); + return; + } } } @@ -28566,9 +29614,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; + case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP"; + case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; + case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM"; case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; @@ -28653,10 +29704,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT"; case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; + case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND"; case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; @@ -28676,6 +29729,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::VPPERM: return "X86ISD::VPPERM"; case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; @@ 
-28776,6 +29830,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE"; case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE"; @@ -28837,6 +29892,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; + case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI"; + case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI"; case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; @@ -28847,6 +29904,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; + case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P"; + case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P"; case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; @@ -29099,8 +30158,8 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return true; } -bool -X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { +bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const { if (!Subtarget.hasAnyFMA()) return false; @@ -31518,28 +32577,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case X86ISD::VSRAI: case X86ISD::VSHLI: case X86ISD::VSRLI: { - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { - Known.setAllZero(); - break; - } + unsigned ShAmt = Op.getConstantOperandVal(1); + if (ShAmt >= VT.getScalarSizeInBits()) { + Known.setAllZero(); + break; + } - Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - unsigned ShAmt = ShiftImm->getZExtValue(); - if (Opc == X86ISD::VSHLI) { - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; - // Low bits are known zero. - Known.Zero.setLowBits(ShAmt); - } else if (Opc == X86ISD::VSRLI) { - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); - // High bits are known zero. - Known.Zero.setHighBits(ShAmt); - } else { - Known.Zero.ashrInPlace(ShAmt); - Known.One.ashrInPlace(ShAmt); - } + Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (Opc == X86ISD::VSHLI) { + Known.Zero <<= ShAmt; + Known.One <<= ShAmt; + // Low bits are known zero. + Known.Zero.setLowBits(ShAmt); + } else if (Opc == X86ISD::VSRLI) { + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); + // High bits are known zero. 
+ Known.Zero.setHighBits(ShAmt); + } else { + Known.Zero.ashrInPlace(ShAmt); + Known.One.ashrInPlace(ShAmt); } break; } @@ -32103,8 +33160,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) || ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) || ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) { - if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, - Subtarget)) { + if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, + Subtarget)) { DstVT = MaskVT; return true; } @@ -32116,8 +33173,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { - if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, - DAG, Subtarget)) { + if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, + Subtarget)) { SrcVT = DstVT = MaskVT; if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); @@ -32155,8 +33212,8 @@ static bool matchBinaryPermuteShuffle( uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); - if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, - ForceV2Zero, BlendMask)) { + if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, + ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector<int, 8> RepeatedMask; @@ -32410,10 +33467,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. - APInt Zeroable(NumMaskElts, 0); - for (unsigned i = 0; i != NumMaskElts; ++i) - if (isUndefOrZero(Mask[i])) - Zeroable.setBit(i); + APInt KnownUndef, KnownZero; + resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); + APInt Zeroable = KnownUndef | KnownZero; if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load @@ -32834,7 +33890,8 @@ static SDValue combineX86ShuffleChainWithExtract( Offset += Src.getConstantOperandVal(1); Src = Src.getOperand(0); } - WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits()); + WideSizeInBits = std::max(WideSizeInBits, + (unsigned)Src.getValueSizeInBits()); assert((Offset % BaseVT.getVectorNumElements()) == 0 && "Unexpected subvector extraction"); Offset /= BaseVT.getVectorNumElements(); @@ -33026,6 +34083,10 @@ static SDValue combineX86ShufflesRecursively( ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + assert(RootMask.size() > 0 && + (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && + "Illegal shuffle root mask"); + // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. const unsigned MaxRecursionDepth = 8; @@ -33056,106 +34117,137 @@ static SDValue combineX86ShufflesRecursively( OpZero, DAG, Depth, false)) return SDValue(); - resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero); - - // Add the inputs to the Ops list, avoiding duplicates. 
- SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); - - auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { - // Attempt to find an existing match. - SDValue InputBC = peekThroughBitcasts(Input); - for (int i = 0, e = Ops.size(); i < e; ++i) - if (InputBC == peekThroughBitcasts(Ops[i])) - return i; - // Match failed - should we replace an existing Op? - if (InsertionPoint >= 0) { - Ops[InsertionPoint] = Input; - return InsertionPoint; + SmallVector<int, 64> Mask; + SmallVector<SDValue, 16> Ops; + + // We don't need to merge masks if the root is empty. + bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1); + if (EmptyRoot) { + // Only resolve zeros if it will remove an input, otherwise we might end + // up in an infinite loop. + bool ResolveKnownZeros = true; + if (!OpZero.isNullValue()) { + APInt UsedInputs = APInt::getNullValue(OpInputs.size()); + for (int i = 0, e = OpMask.size(); i != e; ++i) { + int M = OpMask[i]; + if (OpUndef[i] || OpZero[i] || isUndefOrZero(M)) + continue; + UsedInputs.setBit(M / OpMask.size()); + if (UsedInputs.isAllOnesValue()) { + ResolveKnownZeros = false; + break; + } + } } - // Add to the end of the Ops list. - Ops.push_back(Input); - return Ops.size() - 1; - }; + resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero, + ResolveKnownZeros); - SmallVector<int, 2> OpInputIdx; - for (SDValue OpInput : OpInputs) - OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1)); - - assert(((RootMask.size() > OpMask.size() && - RootMask.size() % OpMask.size() == 0) || - (OpMask.size() > RootMask.size() && - OpMask.size() % RootMask.size() == 0) || - OpMask.size() == RootMask.size()) && - "The smaller number of elements must divide the larger."); - - // This function can be performance-critical, so we rely on the power-of-2 - // knowledge that we have about the mask sizes to replace div/rem ops with - // bit-masks and shifts. - assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"); - assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); - unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); - unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); - - unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size()); - unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2); - unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2); - assert((RootRatio == 1 || OpRatio == 1) && - "Must not have a ratio for both incoming and op masks!"); - - assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); - assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); - assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); - unsigned RootRatioLog2 = countTrailingZeros(RootRatio); - unsigned OpRatioLog2 = countTrailingZeros(OpRatio); - - SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef); - - // Merge this shuffle operation's mask into our accumulated mask. Note that - // this shuffle's mask will be the first applied to the input, followed by the - // root mask to get us all the way to the root value arrangement. The reason - // for this order is that we are recursing up the operation chain. - for (unsigned i = 0; i < MaskWidth; ++i) { - unsigned RootIdx = i >> RootRatioLog2; - if (RootMask[RootIdx] < 0) { - // This is a zero or undef lane, we're done. 
- Mask[i] = RootMask[RootIdx]; - continue; - } + Mask = OpMask; + Ops.append(OpInputs.begin(), OpInputs.end()); + } else { + resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero); + + // Add the inputs to the Ops list, avoiding duplicates. + Ops.append(SrcOps.begin(), SrcOps.end()); + + auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { + // Attempt to find an existing match. + SDValue InputBC = peekThroughBitcasts(Input); + for (int i = 0, e = Ops.size(); i < e; ++i) + if (InputBC == peekThroughBitcasts(Ops[i])) + return i; + // Match failed - should we replace an existing Op? + if (InsertionPoint >= 0) { + Ops[InsertionPoint] = Input; + return InsertionPoint; + } + // Add to the end of the Ops list. + Ops.push_back(Input); + return Ops.size() - 1; + }; - unsigned RootMaskedIdx = - RootRatio == 1 - ? RootMask[RootIdx] - : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); + SmallVector<int, 2> OpInputIdx; + for (SDValue OpInput : OpInputs) + OpInputIdx.push_back( + AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1)); + + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + + // This function can be performance-critical, so we rely on the power-of-2 + // knowledge that we have about the mask sizes to replace div/rem ops with + // bit-masks and shifts. + assert(isPowerOf2_32(RootMask.size()) && + "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); + unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); + + unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size()); + unsigned RootRatio = + std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2); + unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2); + assert((RootRatio == 1 || OpRatio == 1) && + "Must not have a ratio for both incoming and op masks!"); + + assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootRatioLog2 = countTrailingZeros(RootRatio); + unsigned OpRatioLog2 = countTrailingZeros(OpRatio); + + Mask.resize(MaskWidth, SM_SentinelUndef); + + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by + // the root mask to get us all the way to the root value arrangement. The + // reason for this order is that we are recursing up the operation chain. + for (unsigned i = 0; i < MaskWidth; ++i) { + unsigned RootIdx = i >> RootRatioLog2; + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. + Mask[i] = RootMask[RootIdx]; + continue; + } - // Just insert the scaled root mask value if it references an input other - // than the SrcOp we're currently inserting. - if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || - (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { - Mask[i] = RootMaskedIdx; - continue; - } + unsigned RootMaskedIdx = + RootRatio == 1 + ? 
RootMask[RootIdx] + : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); - RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); - unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; - if (OpMask[OpIdx] < 0) { - // The incoming lanes are zero or undef, it doesn't matter which ones we - // are using. - Mask[i] = OpMask[OpIdx]; - continue; - } + // Just insert the scaled root mask value if it references an input other + // than the SrcOp we're currently inserting. + if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || + (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { + Mask[i] = RootMaskedIdx; + continue; + } - // Ok, we have non-zero lanes, map them through to one of the Op's inputs. - unsigned OpMaskedIdx = - OpRatio == 1 - ? OpMask[OpIdx] - : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); + RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); + unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. + Mask[i] = OpMask[OpIdx]; + continue; + } - OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); - int InputIdx = OpMask[OpIdx] / (int)OpMask.size(); - assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"); - OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth; + // Ok, we have non-zero lanes, map them through to one of the Op's inputs. + unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx] + : (OpMask[OpIdx] << OpRatioLog2) + + (RootMaskedIdx & (OpRatio - 1)); - Mask[i] = OpMaskedIdx; + OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); + int InputIdx = OpMask[OpIdx] / (int)OpMask.size(); + assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"); + OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth; + + Mask[i] = OpMaskedIdx; + } } // Remove unused/repeated shuffle source ops. @@ -33189,13 +34281,18 @@ static SDValue combineX86ShufflesRecursively( // the remaining recursion depth. if (Ops.size() < (MaxRecursionDepth - Depth)) { for (int i = 0, e = Ops.size(); i < e; ++i) { + // For empty roots, we need to resolve zeroable elements before combining + // them with other shuffles. + SmallVector<int, 64> ResolvedMask = Mask; + if (EmptyRoot) + resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero); bool AllowVar = false; if (Ops[i].getNode()->hasOneUse() || SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) AllowVar = AllowVariableMask; if (SDValue Res = combineX86ShufflesRecursively( - Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask, - AllowVar, DAG, Subtarget)) + Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, + HasVariableMask, AllowVar, DAG, Subtarget)) return Res; } } @@ -34207,6 +35304,15 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, In.getOperand(0).getValueType() == MVT::v2i64) return N->getOperand(0); // return the bitcast break; + case X86ISD::STRICT_CVTTP2SI: + case X86ISD::STRICT_CVTTP2UI: + case X86ISD::STRICT_CVTSI2P: + case X86ISD::STRICT_CVTUI2P: + case X86ISD::STRICT_VFPROUND: + if (In.getOperand(1).getValueType() == MVT::v2f64 || + In.getOperand(1).getValueType() == MVT::v2i64) + return N->getOperand(0); + break; } } @@ -34698,6 +35804,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; } + // If we don't demand all elements, then attempt to combine to a simpler + // shuffle. + // TODO: Handle other depths, but first we need to handle the fact that + // it might combine to the same shuffle. 
+ if (!DemandedElts.isAllOnesValue() && Depth == 0) { + SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef); + for (int i = 0; i != NumElts; ++i) + if (DemandedElts[i]) + DemandedMask[i] = i; + + SDValue NewShuffle = combineX86ShufflesRecursively( + {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false, + /*AllowVarMask*/ true, TLO.DAG, Subtarget); + if (NewShuffle) + return TLO.CombineTo(Op, NewShuffle); + } + return false; } @@ -34739,117 +35862,110 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } case X86ISD::VSHLI: { SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) { - if (ShiftImm->getAPIntValue().uge(BitWidth)) - break; + unsigned ShAmt = Op.getConstantOperandVal(1); + if (ShAmt >= BitWidth) + break; - unsigned ShAmt = ShiftImm->getZExtValue(); - APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); - - // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a - // single shift. We can do this if the bottom bits (which are shifted - // out) are never demanded. - if (Op0.getOpcode() == X86ISD::VSRLI && - OriginalDemandedBits.countTrailingZeros() >= ShAmt) { - if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) { - if (Shift2Imm->getAPIntValue().ult(BitWidth)) { - int Diff = ShAmt - Shift2Imm->getZExtValue(); - if (Diff == 0) - return TLO.CombineTo(Op, Op0.getOperand(0)); - - unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; - SDValue NewShift = TLO.DAG.getNode( - NewOpc, SDLoc(Op), VT, Op0.getOperand(0), - TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); - return TLO.CombineTo(Op, NewShift); - } - } + APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); + + // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a + // single shift. We can do this if the bottom bits (which are shifted + // out) are never demanded. + if (Op0.getOpcode() == X86ISD::VSRLI && + OriginalDemandedBits.countTrailingZeros() >= ShAmt) { + unsigned Shift2Amt = Op0.getConstantOperandVal(1); + if (Shift2Amt < BitWidth) { + int Diff = ShAmt - Shift2Amt; + if (Diff == 0) + return TLO.CombineTo(Op, Op0.getOperand(0)); + + unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; + SDValue NewShift = TLO.DAG.getNode( + NewOpc, SDLoc(Op), VT, Op0.getOperand(0), + TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); + return TLO.CombineTo(Op, NewShift); } + } - if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, - TLO, Depth + 1)) - return true; + if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, + TLO, Depth + 1)) + return true; - assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero <<= ShAmt; + Known.One <<= ShAmt; - // Low bits known zero. - Known.Zero.setLowBits(ShAmt); - } + // Low bits known zero. 
+ Known.Zero.setLowBits(ShAmt); break; } case X86ISD::VSRLI: { - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - if (ShiftImm->getAPIntValue().uge(BitWidth)) - break; + unsigned ShAmt = Op.getConstantOperandVal(1); + if (ShAmt >= BitWidth) + break; - unsigned ShAmt = ShiftImm->getZExtValue(); - APInt DemandedMask = OriginalDemandedBits << ShAmt; + APInt DemandedMask = OriginalDemandedBits << ShAmt; - if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, - OriginalDemandedElts, Known, TLO, Depth + 1)) - return true; + if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, + OriginalDemandedElts, Known, TLO, Depth + 1)) + return true; - assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); - // High bits known zero. - Known.Zero.setHighBits(ShAmt); - } + // High bits known zero. + Known.Zero.setHighBits(ShAmt); break; } case X86ISD::VSRAI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) { - if (ShiftImm->getAPIntValue().uge(BitWidth)) - break; + unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue(); + if (ShAmt >= BitWidth) + break; - unsigned ShAmt = ShiftImm->getZExtValue(); - APInt DemandedMask = OriginalDemandedBits << ShAmt; + APInt DemandedMask = OriginalDemandedBits << ShAmt; - // If we just want the sign bit then we don't need to shift it. - if (OriginalDemandedBits.isSignMask()) - return TLO.CombineTo(Op, Op0); + // If we just want the sign bit then we don't need to shift it. + if (OriginalDemandedBits.isSignMask()) + return TLO.CombineTo(Op, Op0); - // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 - if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) { - SDValue Op00 = Op0.getOperand(0); - unsigned NumSignBits = - TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); - if (ShAmt < NumSignBits) - return TLO.CombineTo(Op, Op00); - } + // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 + if (Op0.getOpcode() == X86ISD::VSHLI && + Op.getOperand(1) == Op0.getOperand(1)) { + SDValue Op00 = Op0.getOperand(0); + unsigned NumSignBits = + TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); + if (ShAmt < NumSignBits) + return TLO.CombineTo(Op, Op00); + } - // If any of the demanded bits are produced by the sign extension, we also - // demand the input sign bit. - if (OriginalDemandedBits.countLeadingZeros() < ShAmt) - DemandedMask.setSignBit(); + // If any of the demanded bits are produced by the sign extension, we also + // demand the input sign bit. + if (OriginalDemandedBits.countLeadingZeros() < ShAmt) + DemandedMask.setSignBit(); - if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, - TLO, Depth + 1)) - return true; + if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, + TLO, Depth + 1)) + return true; - assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); - // If the input sign bit is known to be zero, or if none of the top bits - // are demanded, turn this into an unsigned shift right. 
- if (Known.Zero[BitWidth - ShAmt - 1] || - OriginalDemandedBits.countLeadingZeros() >= ShAmt) - return TLO.CombineTo( - Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); + // If the input sign bit is known to be zero, or if none of the top bits + // are demanded, turn this into an unsigned shift right. + if (Known.Zero[BitWidth - ShAmt - 1] || + OriginalDemandedBits.countLeadingZeros() >= ShAmt) + return TLO.CombineTo( + Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); - // High bits are known one. - if (Known.One[BitWidth - ShAmt - 1]) - Known.One.setHighBits(ShAmt); - } + // High bits are known one. + if (Known.One[BitWidth - ShAmt - 1]) + Known.One.setHighBits(ShAmt); break; } case X86ISD::PEXTRB: @@ -35005,6 +36121,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( return Vec; break; } + case X86ISD::PCMPGT: + // icmp sgt(0, R) == ashr(R, BitWidth-1). + // iff we only need the sign bit then we can use R directly. + if (DemandedBits.isSignMask() && + ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) + return Op.getOperand(1); + break; } APInt ShuffleUndef, ShuffleZero; @@ -35053,123 +36176,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( Op, DemandedBits, DemandedElts, DAG, Depth); } -/// Check if a vector extract from a target-specific shuffle of a load can be -/// folded into a single element load. -/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but -/// shuffles have been custom lowered so we need to handle those here. -static SDValue -XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDValue InVec = N->getOperand(0); - SDValue EltNo = N->getOperand(1); - EVT EltVT = N->getValueType(0); - - if (!isa<ConstantSDNode>(EltNo)) - return SDValue(); - - EVT OriginalVT = InVec.getValueType(); - unsigned NumOriginalElts = OriginalVT.getVectorNumElements(); - - // Peek through bitcasts, don't duplicate a load with other uses. - InVec = peekThroughOneUseBitcasts(InVec); - - EVT CurrentVT = InVec.getValueType(); - if (!CurrentVT.isVector()) - return SDValue(); - - unsigned NumCurrentElts = CurrentVT.getVectorNumElements(); - if ((NumOriginalElts % NumCurrentElts) != 0) - return SDValue(); - - if (!isTargetShuffle(InVec.getOpcode())) - return SDValue(); - - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - - SmallVector<int, 16> ShuffleMask; - SmallVector<SDValue, 2> ShuffleOps; - bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, - ShuffleOps, ShuffleMask, UnaryShuffle)) - return SDValue(); - - unsigned Scale = NumOriginalElts / NumCurrentElts; - if (Scale > 1) { - SmallVector<int, 16> ScaledMask; - scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask); - ShuffleMask = std::move(ScaledMask); - } - assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch"); - - // Select the input vector, guarding against out of range extract vector. - int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt]; - - if (Idx == SM_SentinelZero) - return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) - : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); - if (Idx == SM_SentinelUndef) - return DAG.getUNDEF(EltVT); - - // Bail if any mask element is SM_SentinelZero - getVectorShuffle below - // won't handle it. 
-  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
-    return SDValue();
-
-  assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
-         "Shuffle index out of range");
-  SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
-
-  // If inputs to shuffle are the same for both ops, then allow 2 uses
-  unsigned AllowedUses =
-      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
-
-  if (LdNode.getOpcode() == ISD::BITCAST) {
-    // Don't duplicate a load with other uses.
-    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
-      return SDValue();
-
-    AllowedUses = 1; // only allow 1 load use if we have a bitcast
-    LdNode = LdNode.getOperand(0);
-  }
-
-  if (!ISD::isNormalLoad(LdNode.getNode()))
-    return SDValue();
-
-  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
-
-  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
-    return SDValue();
-
-  // If there's a bitcast before the shuffle, check if the load type and
-  // alignment is valid.
-  unsigned Align = LN0->getAlignment();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
-      EltVT.getTypeForEVT(*DAG.getContext()));
-
-  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
-    return SDValue();
-
-  // All checks match so transform back to vector_shuffle so that DAG combiner
-  // can finish the job
-  SDLoc dl(N);
-
-  // Create shuffle node taking into account the case that its a unary shuffle
-  SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
-                                 : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
-  Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
-                                 DAG.getBitcast(OriginalVT, ShuffleOps[0]),
-                                 Shuffle, ShuffleMask);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
-                     EltNo);
-}
-
 // Helper to peek through bitops/setcc to determine size of source vector.
 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
@@ -35714,7 +36720,7 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                             const X86Subtarget &Subtarget) {
   // Find the appropriate width for the PSADBW.
   EVT InVT = Zext0.getOperand(0).getValueType();
-  unsigned RegSize = std::max(128u, InVT.getSizeInBits());
+  unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
 
   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
   // fill in the missing vector elements with 0.
@@ -36263,6 +37269,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
 
+  // We need at least SSE2 to do anything here.
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
   ISD::NodeType Opc;
   SDValue Rdx =
       DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
@@ -36382,8 +37392,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   SDLoc dl(InputVector);
   bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+  unsigned NumSrcElts = SrcVT.getVectorNumElements();
 
-  if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+  if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
     return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
 
   // Integer Constant Folding.
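A note on the createPSADBW hunk above: the combine works because PSADBW, with one operand zeroed, produces in each 64-bit lane the horizontal sum of the other operand's eight unsigned bytes. A minimal scalar sketch of that per-lane semantic (psadbwLane is an illustrative name, not part of this patch):

  #include <cstdint>
  #include <cstdlib>

  // One 64-bit PSADBW lane: sum of absolute differences of eight u8 pairs.
  // With B all zero -- as createPSADBW arranges -- this is a horizontal sum
  // of A's bytes; the result is at most 8 * 255 and fits the lane easily.
  uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
    uint64_t Sum = 0;
    for (int I = 0; I != 8; ++I)
      Sum += std::abs(int(A[I]) - int(B[I]));
    return Sum;
  }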
@@ -36419,14 +37430,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } // TODO - Remove this once we can handle the implicit zero-extension of - // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad, - // combineHorizontalPredicateResult and combineBasicSADPattern. + // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and + // combineBasicSADPattern. return SDValue(); } - if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) - return NewOp; - // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { @@ -36482,7 +37490,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, }; if (all_of(InputVector->uses(), IsBoolExtract) && BoolExtracts.size() > 1) { - unsigned NumSrcElts = SrcVT.getVectorNumElements(); EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); if (SDValue BC = combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { @@ -36568,9 +37575,8 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); - ISD::CondCode NewCC = - ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), - Cond.getOperand(0).getValueType().isInteger()); + ISD::CondCode NewCC = ISD::getSetCCInverse( + cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); @@ -36761,37 +37767,117 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, if (VT.is512BitVector()) return SDValue(); - // TODO: Add other opcodes eventually lowered into BLEND. - for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); - UI != UE; ++UI) - if ((UI->getOpcode() != ISD::VSELECT && - UI->getOpcode() != X86ISD::BLENDV) || - UI.getOperandNo() != 0) + auto OnlyUsedAsSelectCond = [](SDValue Cond) { + for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); + UI != UE; ++UI) + if ((UI->getOpcode() != ISD::VSELECT && + UI->getOpcode() != X86ISD::BLENDV) || + UI.getOperandNo() != 0) + return false; + + return true; + }; + + if (OnlyUsedAsSelectCond(Cond)) { + APInt DemandedMask(APInt::getSignMask(BitWidth)); + KnownBits Known; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true)) return SDValue(); + // If we changed the computation somewhere in the DAG, this change will + // affect all users of Cond. Update all the nodes so that we do not use + // the generic VSELECT anymore. Otherwise, we may perform wrong + // optimizations as we messed with the actual expectation for the vector + // boolean values. + for (SDNode *U : Cond->uses()) { + if (U->getOpcode() == X86ISD::BLENDV) + continue; + + SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0), + Cond, U->getOperand(1), U->getOperand(2)); + DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); + DCI.AddToWorklist(U); + } + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(N, 0); + } + + // Otherwise we can still at least try to simplify multiple use bits. 
APInt DemandedMask(APInt::getSignMask(BitWidth)); + APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements())); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); - if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true)) + if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask, + DemandedElts, DAG, 0)) + return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), + V, N->getOperand(1), N->getOperand(2)); + + return SDValue(); +} + +// Try to match: +// (or (and (M, (sub 0, X)), (pandn M, X))) +// which is a special case of: +// (select M, (sub 0, X), X) +// Per: +// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate +// We know that, if fNegate is 0 or 1: +// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) +// +// Here, we have a mask, M (all 1s or 0), and, similarly, we know that: +// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) +// ( M ? -X : X) == ((X ^ M ) + (M & 1)) +// This lets us transform our vselect to: +// (add (xor X, M), (and M, 1)) +// And further to: +// (sub (xor X, M), M) +static SDValue combineLogicBlendIntoConditionalNegate( + EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { + EVT MaskVT = Mask.getValueType(); + assert(MaskVT.isInteger() && + DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && + "Mask must be zero/all-bits"); + + if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) + return SDValue(); + if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) return SDValue(); - // If we changed the computation somewhere in the DAG, this change will - // affect all users of Cond. Update all the nodes so that we do not use - // the generic VSELECT anymore. Otherwise, we may perform wrong - // optimizations as we messed with the actual expectation for the vector - // boolean values. - for (SDNode *U : Cond->uses()) { - if (U->getOpcode() == X86ISD::BLENDV) - continue; + auto IsNegV = [](SDNode *N, SDValue V) { + return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && + ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); + }; - SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0), - Cond, U->getOperand(1), U->getOperand(2)); - DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); - DCI.AddToWorklist(U); - } - DCI.CommitTargetLoweringOpt(TLO); - return SDValue(N, 0); + SDValue V; + if (IsNegV(Y.getNode(), X)) + V = X; + else if (IsNegV(X.getNode(), Y)) + V = Y; + else + return SDValue(); + + SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); + SDValue SubOp2 = Mask; + + // If the negate was on the false side of the select, then + // the operands of the SUB need to be swapped. PR 27251. + // This is because the pattern being matched above is + // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) + // but if the pattern matched was + // (vselect M, X, (sub (0, X))), that is really negation of the pattern + // above, -(vselect M, (sub 0, X), X), and therefore the replacement + // pattern also needs to be a negation of the replacement pattern above. + // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the + // sub accomplishes the negation of the replacement pattern. + if (V == Y) + std::swap(SubOp1, SubOp2); + + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); } /// Do target-specific dag combines on SELECT and VSELECT nodes. 
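The conditional-negate fold moved in above leans on the bithack identity spelled out in its comment: when each lane of M is all-ones or all-zeros, (M ? -X : X) == (sub (xor X, M), M). A minimal scalar check of the identity, assuming 32-bit lanes (condNegate is an illustrative name only):

  #include <cassert>
  #include <cstdint>

  // (M ? -X : X) == (X ^ M) - M for M == 0 or M == -1:
  //  * M == 0:  the xor and the sub are both no-ops.
  //  * M == -1: X ^ M is ~X, and subtracting -1 adds 1, so ~X + 1 == -X.
  int32_t condNegate(int32_t X, int32_t M) { return (X ^ M) - M; }

  int main() {
    assert(condNegate(7, 0) == 7);   // mask clear: value unchanged
    assert(condNegate(7, -1) == -7); // mask set: value negated
    assert(condNegate(-5, -1) == 5);
    return 0;
  }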
@@ -36811,10 +37897,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()); + + // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M). + // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT + // can't catch, plus vXi8 cases where we'd likely end up with BLENDV. + if (CondVT.isVector() && CondVT.isInteger() && + CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() && + (!CondConstantVector || CondVT.getScalarType() == MVT::i8) && + DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits()) + if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS, + DL, DAG, Subtarget)) + return V; // Convert vselects with constant condition into shuffles. - if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && - DCI.isBeforeLegalizeOps()) { + if (CondConstantVector && DCI.isBeforeLegalizeOps()) { SmallVector<int, 64> Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); @@ -36843,7 +37940,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; @@ -36854,7 +37951,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMIN; @@ -36873,7 +37970,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMAX; @@ -36883,7 +37980,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; @@ -36911,7 +38008,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) @@ -36922,8 +38019,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. 
- if (!DAG.getTarget().Options.UnsafeFPMath && - (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMIN; break; @@ -36948,7 +38044,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) @@ -37093,7 +38189,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, VT.getVectorElementType()); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } @@ -37165,7 +38261,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, SDValue Other; if (ISD::isBuildVectorAllOnes(LHS.getNode())) { Other = RHS; - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, VT.getVectorElementType()); } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) { Other = LHS; } @@ -37788,7 +38884,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, } /// Different mul shrinking modes. -enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; +enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { EVT VT = N->getOperand(0).getValueType(); @@ -37809,16 +38905,16 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); // When ranges are from -128 ~ 127, use MULS8 mode. if (MinSignBits >= 25) - Mode = MULS8; + Mode = ShrinkMode::MULS8; // When ranges are from 0 ~ 255, use MULU8 mode. else if (AllPositive && MinSignBits >= 24) - Mode = MULU8; + Mode = ShrinkMode::MULU8; // When ranges are from -32768 ~ 32767, use MULS16 mode. else if (MinSignBits >= 17) - Mode = MULS16; + Mode = ShrinkMode::MULS16; // When ranges are from 0 ~ 65535, use MULU16 mode. else if (AllPositive && MinSignBits >= 16) - Mode = MULU16; + Mode = ShrinkMode::MULU16; else return false; return true; @@ -37888,15 +38984,17 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); - if (Mode == MULU8 || Mode == MULS8) - return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8) + return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND + : ISD::SIGN_EXTEND, DL, VT, MulLo); MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. - SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - ReducedVT, NewN0, NewN1); + SDValue MulHi = + DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL, + ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. @@ -38294,7 +39392,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { // We shift all of the values by one. 
In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. - if (N1SplatC->getAPIntValue() == 1) + if (N1SplatC->isOne()) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } @@ -38546,15 +39644,15 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); - assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type"); + assert(N->getOperand(1).getValueType() == MVT::i8 && + "Unexpected shift amount type"); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. - unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue(); + unsigned ShiftVal = N->getConstantOperandVal(1); if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); @@ -39094,6 +40192,71 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } + +// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C) +// Where C is a mask containing the same number of bits as the setcc and +// where the setcc will freely 0 upper bits of k-register. We can replace the +// undef in the concat with 0s and remove the AND. This mainly helps with +// v2i1/v4i1 setcc being casted to scalar. +static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); + + EVT VT = N->getValueType(0); + + // Make sure this is an AND with constant. We will check the value of the + // constant later. + if (!isa<ConstantSDNode>(N->getOperand(1))) + return SDValue(); + + // This is implied by the ConstantSDNode. + assert(!VT.isVector() && "Expected scalar VT!"); + + if (N->getOperand(0).getOpcode() != ISD::BITCAST || + !N->getOperand(0).hasOneUse() || + !N->getOperand(0).getOperand(0).hasOneUse()) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Src = N->getOperand(0).getOperand(0); + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 || + !TLI.isTypeLegal(SrcVT)) + return SDValue(); + + if (Src.getOpcode() != ISD::CONCAT_VECTORS) + return SDValue(); + + // We only care about the first subvector of the concat, we expect the + // other subvectors to be ignored due to the AND if we make the change. + SDValue SubVec = Src.getOperand(0); + EVT SubVecVT = SubVec.getValueType(); + + // First subvector should be a setcc with a legal result type. The RHS of the + // AND should be a mask with this many bits. + if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) || + !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements())) + return SDValue(); + + EVT SetccVT = SubVec.getOperand(0).getValueType(); + if (!TLI.isTypeLegal(SetccVT) || + !(Subtarget.hasVLX() || SetccVT.is512BitVector())) + return SDValue(); + + if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32)) + return SDValue(); + + // We passed all the checks. Rebuild the concat_vectors with zeroes + // and cast it back to VT. 
+ SDLoc dl(N); + SmallVector<SDValue, 4> Ops(Src.getNumOperands(), + DAG.getConstant(0, dl, SubVecVT)); + Ops[0] = SubVec; + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, + Ops); + return DAG.getBitcast(VT, Concat); +} + static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -39132,9 +40295,12 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) && SrcOps.size() == 1) { SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) + Mask = DAG.getBitcast(MaskVT, SrcOps[0]); if (Mask) { APInt AllBits = APInt::getAllOnesValue(NumElts); return DAG.getSetCC(dl, MVT::i1, Mask, @@ -39143,6 +40309,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, } } + if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget)) + return V; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -39290,68 +40459,6 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { return true; } -// Try to match: -// (or (and (M, (sub 0, X)), (pandn M, X))) -// which is a special case of vselect: -// (vselect M, (sub 0, X), X) -// Per: -// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate -// We know that, if fNegate is 0 or 1: -// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) -// -// Here, we have a mask, M (all 1s or 0), and, similarly, we know that: -// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) -// ( M ? -X : X) == ((X ^ M ) + (M & 1)) -// This lets us transform our vselect to: -// (add (xor X, M), (and M, 1)) -// And further to: -// (sub (xor X, M), M) -static SDValue combineLogicBlendIntoConditionalNegate( - EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, - SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT MaskVT = Mask.getValueType(); - assert(MaskVT.isInteger() && - DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && - "Mask must be zero/all-bits"); - - if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) - return SDValue(); - if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) - return SDValue(); - - auto IsNegV = [](SDNode *N, SDValue V) { - return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && - ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); - }; - - SDValue V; - if (IsNegV(Y.getNode(), X)) - V = X; - else if (IsNegV(X.getNode(), Y)) - V = Y; - else - return SDValue(); - - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); - SDValue SubOp2 = Mask; - - // If the negate was on the false side of the select, then - // the operands of the SUB need to be swapped. PR 27251. - // This is because the pattern being matched above is - // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) - // but if the pattern matched was - // (vselect M, X, (sub (0, X))), that is really negation of the pattern - // above, -(vselect M, (sub 0, X), X), and therefore the replacement - // pattern also needs to be a negation of the replacement pattern above. - // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the - // sub accomplishes the negation of the replacement pattern. 
- if (V == Y) - std::swap(SubOp1, SubOp2); - - SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); - return DAG.getBitcast(VT, Res); -} - // Try to fold: // (or (and (m, y), (pandn m, x))) // into: @@ -39512,66 +40619,20 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return Ret; } -static SDValue combineOr(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node"); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If this is SSE1 only convert to FOR to avoid scalarization. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { - return DAG.getBitcast(MVT::v4i32, - DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, - DAG.getBitcast(MVT::v4f32, N0), - DAG.getBitcast(MVT::v4f32, N1))); - } - - // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. - // TODO: Support multiple SrcOps. - if (VT == MVT::i1) { - SmallVector<SDValue, 2> SrcOps; - if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && - SrcOps.size() == 1) { - SDLoc dl(N); - unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); - EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); - SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); - if (Mask) { - APInt AllBits = APInt::getNullValue(NumElts); - return DAG.getSetCC(dl, MVT::i1, Mask, - DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); - } - } - } - - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) - return R; - - if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) - return FPLogic; - - if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) - return R; - - if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) - return R; - - // Attempt to recursively combine an OR of shuffles. - if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { - SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) - return Res; - } - - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) || + !TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some @@ -39589,11 +40650,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); + EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + SDValue ShAmt0 = N0.getOperand(1); - if (ShAmt0.getValueType() != MVT::i8) + if (ShAmt0.getValueType() != ShiftVT) return SDValue(); SDValue ShAmt1 = N1.getOperand(1); - if (ShAmt1.getValueType() != MVT::i8) + if (ShAmt1.getValueType() != ShiftVT) return SDValue(); // Peek through any modulo shift masks. 
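For reference, the (or (shl X, C), (srl Y, Width - C)) pattern that the new combineOrShiftToFunnelShift helper matches is exactly the scalar semantics of ISD::FSHL. A minimal 64-bit sketch (fshl64 is an illustrative name): the up-front mask mirrors the modulo shift masks the code peeks through, and the C == 0 case sidesteps the undefined Y >> 64.

  #include <cstdint>

  // FSHL: shift the 128-bit concatenation X:Y left by C, keep the high half.
  uint64_t fshl64(uint64_t X, uint64_t Y, unsigned C) {
    C &= 63; // shift amounts are taken modulo the bit width
    return C == 0 ? X : (X << C) | (Y >> (64 - C));
  }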
@@ -39628,12 +40691,12 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, std::swap(ShMsk0, ShMsk1); } - auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1, - SDValue Amt) { + auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1, + SDValue Amt) { if (Opc == ISD::FSHR) std::swap(Op0, Op1); return DAG.getNode(Opc, DL, VT, Op0, Op1, - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt)); + DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt)); }; // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) @@ -39674,7 +40737,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { if (Op1.getOpcode() == InnerShift && isa<ConstantSDNode>(Op1.getOperand(1)) && - Op1.getConstantOperandAPInt(1) == 1) { + Op1.getConstantOperandAPInt(1).isOneValue()) { return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). @@ -39689,6 +40752,70 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineOr(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // If this is SSE1 only convert to FOR to avoid scalarization. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { + return DAG.getBitcast(MVT::v4i32, + DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, + DAG.getBitcast(MVT::v4f32, N0), + DAG.getBitcast(MVT::v4f32, N1))); + } + + // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. + // TODO: Support multiple SrcOps. + if (VT == MVT::i1) { + SmallVector<SDValue, 2> SrcOps; + if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && + SrcOps.size() == 1) { + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); + EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) + Mask = DAG.getBitcast(MaskVT, SrcOps[0]); + if (Mask) { + APInt AllBits = APInt::getNullValue(NumElts); + return DAG.getSetCC(dl, MVT::i1, Mask, + DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); + } + } + } + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) + return R; + + if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) + return R; + + if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget)) + return R; + + // Attempt to recursively combine an OR of shuffles. 
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + SDValue Op(N, 0); + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } + + return SDValue(); +} + /// Try to turn tests against the signbit in the form of: /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) /// into: @@ -39758,8 +40885,8 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, default: return SDValue(); case MVT::v16i8: case MVT::v8i16: - case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; - case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; + case MVT::v4i32: + case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: @@ -39783,7 +40910,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, // Create a greater-than comparison against -1. We don't use the more obvious // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. - return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); + return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT); } /// Detect patterns of truncation with unsigned saturation: @@ -39950,7 +41077,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 && Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) { - unsigned TruncOpc; + unsigned TruncOpc = 0; SDValue SatVal; if (auto SSatVal = detectSSatPattern(In, VT)) { SatVal = SSatVal; @@ -40252,6 +41379,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + assert(ML->isUnindexed() && "Unexpected indexed masked load!"); // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. @@ -40279,6 +41407,7 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + assert(ML->isUnindexed() && "Unexpected indexed masked load!"); if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) return SDValue(); @@ -40314,10 +41443,10 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. 
- SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), - ML->getMask(), DAG.getUNDEF(VT), - ML->getMemoryVT(), ML->getMemOperand(), - ML->getExtensionType()); + SDValue NewML = DAG.getMaskedLoad( + VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(), + DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), + ML->getAddressingMode(), ML->getExtensionType()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru()); @@ -40403,8 +41532,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), Mst->getMemoryVT())) { return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); + Mst->getBasePtr(), Mst->getOffset(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), + Mst->getAddressingMode(), true); } return SDValue(); @@ -40593,59 +41723,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, cast<LoadSDNode>(St->getValue())->isSimple() && St->getChain().hasOneUse() && St->isSimple()) { LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode()); - SmallVector<SDValue, 8> Ops; if (!ISD::isNormalLoad(Ld)) return SDValue(); - // If this is not the MMX case, i.e. we are just turning i64 load/store - // into f64 load/store, avoid the transformation if there are multiple - // uses of the loaded value. - if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) + // Avoid the transformation if there are multiple uses of the loaded value. + if (!Ld->hasNUsesOfValue(1, 0)) return SDValue(); SDLoc LdDL(Ld); SDLoc StDL(N); - // If we are a 64-bit capable x86, lower to a single movq load/store pair. - // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store - // pair instead. - if (Subtarget.is64Bit() || F64IsLegal) { - MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; - SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), - Ld->getMemOperand()); - - // Make sure new load is placed in same chain order. - DAG.makeEquivalentMemoryOrdering(Ld, NewLd); - return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), - St->getMemOperand()); - } - - // Otherwise, lower to two pairs of 32-bit loads / stores. - SDValue LoAddr = Ld->getBasePtr(); - SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); - - SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, - Ld->getPointerInfo(), Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, - Ld->getPointerInfo().getWithOffset(4), - MinAlign(Ld->getAlignment(), 4), - Ld->getMemOperand()->getFlags()); - // Make sure new loads are placed in same chain order. - DAG.makeEquivalentMemoryOrdering(Ld, LoLd); - DAG.makeEquivalentMemoryOrdering(Ld, HiLd); - - LoAddr = St->getBasePtr(); - HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); - - SDValue LoSt = - DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); - SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr, - St->getPointerInfo().getWithOffset(4), - MinAlign(St->getAlignment(), 4), - St->getMemOperand()->getFlags()); - return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); + // Lower to a single movq load/store pair. 
+    SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
+                                Ld->getBasePtr(), Ld->getMemOperand());
+
+    // Make sure new load is placed in same chain order.
+    DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+    return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
+                        St->getMemOperand());
   }
 
   // This is similar to the above case, but here we handle a scalar 64-bit
@@ -41351,23 +42446,25 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
   EVT VT = Op->getValueType(0);
-  // Make sure the element size does't change.
+
+  // Make sure the element size doesn't change.
   if (VT.getScalarSizeInBits() != ScalarSize)
     return SDValue();
 
-  if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  case ISD::VECTOR_SHUFFLE: {
     // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
     // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
-    if (!SVOp->getOperand(1).isUndef())
+    if (!Op.getOperand(1).isUndef())
       return SDValue();
-    if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1))
+    if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
       if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
-        return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
-                                    SVOp->getMask());
-    return SDValue();
+        return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
+                                    cast<ShuffleVectorSDNode>(Op)->getMask());
+    break;
   }
-  unsigned Opc = Op.getOpcode();
-  if (Opc == ISD::INSERT_VECTOR_ELT) {
+  case ISD::INSERT_VECTOR_ELT: {
     // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
     // -V, INDEX).
     SDValue InsVector = Op.getOperand(0);
@@ -41378,34 +42475,35 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
     if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
       return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
                          NegInsVal, Op.getOperand(2));
-    return SDValue();
+    break;
   }
+  case ISD::FSUB:
+  case ISD::XOR:
+  case X86ISD::FXOR: {
+    SDValue Op1 = Op.getOperand(1);
+    SDValue Op0 = Op.getOperand(0);
-  if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
-    return SDValue();
-
-  SDValue Op1 = Op.getOperand(1);
-  SDValue Op0 = Op.getOperand(0);
-
-  // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
-  // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
-  // masks and hence we swap the operands.
-  if (Opc == ISD::FSUB)
-    std::swap(Op0, Op1);
+    // For XOR and FXOR, we want to check if constant
+    // bits of Op1 are sign bit masks. For FSUB, we
+    // have to check if constant bits of Op0 are sign
+    // bit masks and hence we swap the operands.
+    if (Opc == ISD::FSUB)
+      std::swap(Op0, Op1);
-  APInt UndefElts;
-  SmallVector<APInt, 16> EltBits;
-  // Extract constant bits and see if they are all sign bit masks. Ignore the
-  // undef elements.
-  if (getTargetConstantBitsFromNode(Op1, ScalarSize,
-                                    UndefElts, EltBits,
-                                    /* AllowWholeUndefs */ true,
-                                    /* AllowPartialUndefs */ false)) {
-    for (unsigned I = 0, E = EltBits.size(); I < E; I++)
-      if (!UndefElts[I] && !EltBits[I].isSignMask())
-        return SDValue();
+    APInt UndefElts;
+    SmallVector<APInt, 16> EltBits;
+    // Extract constant bits and see if they are all
+    // sign bit masks. Ignore the undef elements.
+    if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
+                                      /* AllowWholeUndefs */ true,
+                                      /* AllowPartialUndefs */ false)) {
+      for (unsigned I = 0, E = EltBits.size(); I < E; I++)
+        if (!UndefElts[I] && !EltBits[I].isSignMask())
+          return SDValue();
 
-    return peekThroughBitcasts(Op0);
+      return peekThroughBitcasts(Op0);
+    }
+  }
   }
 
   return SDValue();
@@ -41642,8 +42740,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
     return SDValue();
 
   SDValue LHS = N->getOperand(0);
-  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
+  if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
     return SDValue();
 
   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
@@ -41817,8 +42914,9 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
 
-  // Only perform optimizations if UnsafeMath is used.
-  if (!DAG.getTarget().Options.UnsafeFPMath)
+  // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
+  if (!DAG.getTarget().Options.NoNaNsFPMath ||
+      !DAG.getTarget().Options.NoSignedZerosFPMath)
     return SDValue();
 
   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
@@ -41943,6 +43041,7 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI) {
+  // FIXME: Handle strict fp nodes.
   EVT VT = N->getValueType(0);
 
   // Convert a full vector load into vzload when not all bits are needed.
@@ -41951,7 +43050,7 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
-    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    LoadSDNode *LN = cast<LoadSDNode>(In);
     // Unless the load is volatile or atomic.
     if (LN->isSimple()) {
       SDLoc dl(N);
@@ -42569,6 +43668,44 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+  if (X.getOpcode() == ISD::OR)
+    return isOrXorXorTree(X.getOperand(0), false) &&
+           isOrXorXorTree(X.getOperand(1), false);
+  if (Root)
+    return false;
+  return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
+template<typename F>
+static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
+                                EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
+  SDValue Op0 = X.getOperand(0);
+  SDValue Op1 = X.getOperand(1);
+  if (X.getOpcode() == ISD::OR) {
+    SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+    SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+    if (VecVT != CmpVT)
+      return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+    if (HasPT)
+      return DAG.getNode(ISD::OR, DL, VecVT, A, B);
+    return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+  } else if (X.getOpcode() == ISD::XOR) {
+    SDValue A = SToV(Op0);
+    SDValue B = SToV(Op1);
+    if (VecVT != CmpVT)
+      return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+    if (HasPT)
+      return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+    return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+  }
+  llvm_unreachable("Impossible");
+}
+
 /// Try to map a 128-bit or larger integer comparison to vector instructions
 /// before type legalization splits it up into chunks.
 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
@@ -42589,10 +43726,8 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
   // logically-combined vector-sized operands compared to zero. This pattern may
   // be generated by the memcmp expansion pass with oversized integer compares
   // (see PR33325).
-  bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
-                          X.getOperand(0).getOpcode() == ISD::XOR &&
-                          X.getOperand(1).getOpcode() == ISD::XOR;
-  if (isNullConstant(Y) && !IsOrXorXorCCZero)
+  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
     return SDValue();
 
   // Don't perform this combine if constructing the vector will be expensive.
@@ -42602,66 +43737,102 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
            X.getOpcode() == ISD::LOAD;
   };
   if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
-      !IsOrXorXorCCZero)
+      !IsOrXorXorTreeCCZero)
     return SDValue();
 
   EVT VT = SetCC->getValueType(0);
   SDLoc DL(SetCC);
   bool HasAVX = Subtarget.hasAVX();
 
-  // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512.
+  // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
+  // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
   // Otherwise use PCMPEQ (plus AND) and mask testing.
   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
       (OpSize == 256 && HasAVX) ||
       (OpSize == 512 && Subtarget.useAVX512Regs())) {
     bool HasPT = Subtarget.hasSSE41();
+
+    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
+    // vector registers are essentially free. (Technically, widening registers
+    // prevents load folding, but the tradeoff is worth it.)
+    bool PreferKOT = Subtarget.preferMaskRegisters();
+    bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
+
     EVT VecVT = MVT::v16i8;
-    EVT CmpVT = MVT::v16i8;
-    if (OpSize == 256)
-      VecVT = CmpVT = MVT::v32i8;
-    if (OpSize == 512) {
+    EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
+    if (OpSize == 256) {
+      VecVT = MVT::v32i8;
+      CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
+    }
+    EVT CastVT = VecVT;
+    bool NeedsAVX512FCast = false;
+    if (OpSize == 512 || NeedZExt) {
       if (Subtarget.hasBWI()) {
         VecVT = MVT::v64i8;
         CmpVT = MVT::v64i1;
+        if (OpSize == 512)
+          CastVT = VecVT;
       } else {
         VecVT = MVT::v16i32;
        CmpVT = MVT::v16i1;
+        CastVT = OpSize == 512 ? VecVT :
+                 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
+        NeedsAVX512FCast = true;
+      }
+    }
+
+    auto ScalarToVector = [&](SDValue X) -> SDValue {
+      bool TmpZext = false;
+      EVT TmpCastVT = CastVT;
+      if (X.getOpcode() == ISD::ZERO_EXTEND) {
+        SDValue OrigX = X.getOperand(0);
+        unsigned OrigSize = OrigX.getScalarValueSizeInBits();
+        if (OrigSize < OpSize) {
+          if (OrigSize == 128) {
+            TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
+            X = OrigX;
+            TmpZext = true;
+          } else if (OrigSize == 256) {
+            TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
+            X = OrigX;
+            TmpZext = true;
+          }
+        }
       }
-    }
+      X = DAG.getBitcast(TmpCastVT, X);
+      if (!NeedZExt && !TmpZext)
+        return X;
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
+                         DAG.getConstant(0, DL, VecVT), X,
+                         DAG.getConstant(0, DL, VecIdxVT));
+    };
 
     SDValue Cmp;
-    if (IsOrXorXorCCZero) {
+    if (IsOrXorXorTreeCCZero) {
       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
       // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
       // Use 2 vector equality compares and 'and' the results before doing a
       // MOVMSK.
-      SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
-      SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
-      SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
-      SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
-      if (VecVT == CmpVT && HasPT) {
-        SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B);
-        SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D);
-        Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2);
-      } else {
-        SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
-        SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
-        Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
-      }
+      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
     } else {
-      SDValue VecX = DAG.getBitcast(VecVT, X);
-      SDValue VecY = DAG.getBitcast(VecVT, Y);
-      if (VecVT == CmpVT && HasPT) {
+      SDValue VecX = ScalarToVector(X);
+      SDValue VecY = ScalarToVector(Y);
+      if (VecVT != CmpVT) {
+        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
+      } else if (HasPT) {
        Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
      } else {
        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
      }
    }
-    // For 512-bits we want to emit a setcc that will lower to kortest.
+    // AVX512 should emit a setcc that will lower to kortest.
    if (VecVT != CmpVT) {
-      EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16;
-      SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT);
-      return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC);
+      EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
+                   CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
+      return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
+                          DAG.getConstant(0, DL, KRegVT), CC);
    }
    if (HasPT) {
      SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
@@ -42687,9 +43858,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
 
 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
-  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
+  const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  const SDValue LHS = N->getOperand(0);
+  const SDValue RHS = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT OpVT = LHS.getValueType();
   SDLoc DL(N);
@@ -42716,30 +43887,35 @@
   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
-    // Put build_vectors on the right.
-    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
-      std::swap(LHS, RHS);
-      CC = ISD::getSetCCSwappedOperands(CC);
+    // Using temporaries to avoid messing up operand ordering for later
+    // transformations if this doesn't work.
+    SDValue Op0 = LHS;
+    SDValue Op1 = RHS;
+    ISD::CondCode TmpCC = CC;
+    // Put build_vector on the right.
+    if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
+      std::swap(Op0, Op1);
+      TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
     }
 
     bool IsSEXT0 =
-        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
-        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
-    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+        (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
+        (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
+    bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
 
     if (IsSEXT0 && IsVZero1) {
-      assert(VT == LHS.getOperand(0).getValueType() &&
+      assert(VT == Op0.getOperand(0).getValueType() &&
              "Uexpected operand type");
-      if (CC == ISD::SETGT)
+      if (TmpCC == ISD::SETGT)
         return DAG.getConstant(0, DL, VT);
-      if (CC == ISD::SETLE)
+      if (TmpCC == ISD::SETLE)
         return DAG.getConstant(1, DL, VT);
-      if (CC == ISD::SETEQ || CC == ISD::SETGE)
-        return DAG.getNOT(DL, LHS.getOperand(0), VT);
+      if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
+        return DAG.getNOT(DL, Op0.getOperand(0), VT);
 
-      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
              "Unexpected condition code!");
-      return LHS.getOperand(0);
+      return Op0.getOperand(0);
    }
  }
 
@@ -42752,8 +43928,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
       VT.getVectorElementType() == MVT::i1 &&
       (OpVT.getVectorElementType() == MVT::i8 ||
        OpVT.getVectorElementType() == MVT::i16)) {
-    SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
-                                N->getOperand(2));
+    SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
     return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
   }
 
@@ -42985,16 +44160,18 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
   // unary operation isn't a bitwise AND, or if the sizes of the operations
   // aren't the same.
   EVT VT = N->getValueType(0);
-  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
-      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
-      VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
+  bool IsStrict = N->isStrictFPOpcode();
+  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+  if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
+      Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != Op0.getValueSizeInBits())
     return SDValue();
 
   // Now check that the other operand of the AND is a constant. We could
   // make the transformation for non-constant splats as well, but it's unclear
   // that would be a benefit as it would not eliminate any operations, just
   // perform one more step in scalar code before moving to the vector unit.
-  if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
+  if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
     // Bail out if the vector isn't a constant.
     if (!BV->isConstant())
       return SDValue();
@@ -43004,12 +44181,19 @@
     EVT IntVT = BV->getValueType(0);
     // Create a new constant of the appropriate type for the transformed
     // DAG.
-    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    SDValue SourceConst;
+    if (IsStrict)
+      SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
+                                {N->getOperand(0), SDValue(BV, 0)});
+    else
+      SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
     // The AND node needs bitcasts to/from an integer vector type around it.
     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
-    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
-                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
+                                 MaskConst);
     SDValue Res = DAG.getBitcast(VT, NewAnd);
+    if (IsStrict)
+      return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
     return Res;
   }
 
@@ -43053,7 +44237,8 @@ static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
 
 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
-  SDValue Op0 = N->getOperand(0);
+  bool IsStrict = N->isStrictFPOpcode();
+  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
   EVT VT = N->getValueType(0);
   EVT InVT = Op0.getValueType();
 
@@ -43067,14 +44252,21 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
 
     // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
+    if (IsStrict)
+      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }
 
   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   // the optimization here.
-  if (DAG.SignBitIsZero(Op0))
+  if (DAG.SignBitIsZero(Op0)) {
+    if (IsStrict)
+      return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
+                         {N->getOperand(0), Op0});
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+  }
 
  return SDValue();
 }
@@ -43084,11 +44276,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
   // First try to optimize away the conversion entirely when it's
   // conditionally from a constant. Vectors only.
+  bool IsStrict = N->isStrictFPOpcode();
   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
     return Res;
 
   // Now move on to more general possibilities.
-  SDValue Op0 = N->getOperand(0);
+  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
   EVT VT = N->getValueType(0);
   EVT InVT = Op0.getValueType();
 
@@ -43100,6 +44293,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+    if (IsStrict)
+      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }
 
@@ -43117,6 +44313,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
     SDLoc dl(N);
     if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+      if (IsStrict)
+        return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+                           {N->getOperand(0), Trunc});
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
    // If we're after legalize and the type is v2i32 we need to shuffle and
@@ -43125,6 +44324,9 @@
       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
                                           { 0, 2, -1, -1 });
+      if (IsStrict)
+        return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+                           {N->getOperand(0), Shuf});
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
    }
  }
@@ -43148,13 +44350,16 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
     if (Ld->isSimple() && !VT.isVector() &&
         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
         !Subtarget.is64Bit() && LdVT == MVT::i64) {
-      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
+      std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
-      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
-      return FILDChain;
+      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
+      return Tmp.first;
    }
  }
 
+  if (IsStrict)
+    return SDValue();
+
  if (SDValue V = combineToFPTruncExtElt(N, DAG))
    return V;
 
@@ -43579,7 +44784,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
   auto UsePMADDWD = [&](SDValue Op) {
     ShrinkMode Mode;
     return Op.getOpcode() == ISD::MUL &&
-           canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 &&
+           canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
+           Mode != ShrinkMode::MULU16 &&
            (!Subtarget.hasSSE41() ||
             (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
              Op->isOnlyUserOf(Op.getOperand(1).getNode())));
@@ -43784,7 +44990,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
 
   // Check if the Mul source can be safely shrunk.
   ShrinkMode Mode;
-  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
+  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
+      Mode == ShrinkMode::MULU16)
     return SDValue();
 
   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
@@ -44468,7 +45675,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
   SDValue InVec = N->getOperand(0);
   SDValue InVecBC = peekThroughBitcasts(InVec);
   EVT InVecVT = InVec.getValueType();
-  EVT InVecBCVT = InVecBC.getValueType();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
@@ -44512,31 +45718,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                          VT, SDLoc(N),
                          InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
 
-  // Try to move vector bitcast after extract_subv by scaling extraction index:
-  // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
-  // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
-  if (InVec != InVecBC && InVecBCVT.isVector()) {
-    unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
-    unsigned DestNumElts = InVecVT.getVectorNumElements();
-    if ((DestNumElts % SrcNumElts) == 0) {
-      unsigned DestSrcRatio = DestNumElts / SrcNumElts;
-      if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
-        unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
-        EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
-                                        InVecBCVT.getScalarType(), NewExtNumElts);
-        if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
-            TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
-          unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
-          SDLoc DL(N);
-          SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
-          SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
-                                           InVecBC, NewIndex);
-          return DAG.getBitcast(VT, NewExtract);
-        }
-      }
-    }
-  }
-
   // If we are extracting from an insert into a zero vector, replace with a
   // smaller insert into zero if we don't access less than the original
   // subvector. Don't do this for i1 vectors.
@@ -44583,7 +45764,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
     }
     // v2f64 CVTUDQ2PD(v4i32).
-    if (InOpcode == ISD::UINT_TO_FP &&
+    if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
        InVec.getOperand(0).getValueType() == MVT::v4i32) {
      return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
    }
@@ -44751,6 +45932,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
 
+  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+    return DAG.getConstant(0, SDLoc(N), VT);
+
   APInt KnownUndef, KnownZero;
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -44802,8 +45986,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
   case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
-  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, DCI, Subtarget);
-  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::STRICT_SINT_TO_FP:
+    return combineSIntToFP(N, DAG, DCI, Subtarget);
+  case ISD::UINT_TO_FP:
+  case ISD::STRICT_UINT_TO_FP:
+    return combineUIntToFP(N, DAG, Subtarget);
   case ISD::FADD:
   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
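
Note on the isOrXorXorTree()/emitOrXorXorTree() change above: the old code only recognized the two-XOR special case setcc (or (xor A, B), (xor C, D)), 0, eq|ne, while the new helpers walk any OR-tree whose leaves are XORs, which is the shape the memcmp expansion pass produces for oversized integer compares (PR33325). A minimal standalone sketch of the scalar identity being exploited follows — plain C++, not taken from this patch; equal32 and the buffer names are made up for illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

// memcmp(a, b, 32) == 0 is equivalent to OR-ing the XORs of the 64-bit
// chunks and comparing the combined value against zero; the DAG combine
// maps that whole OR/XOR tree onto a single wide vector compare.
static bool equal32(const unsigned char *a, const unsigned char *b) {
  uint64_t x[4], y[4];
  std::memcpy(x, a, 32);
  std::memcpy(y, b, 32);
  uint64_t t = (x[0] ^ y[0]) | (x[1] ^ y[1]) |
               (x[2] ^ y[2]) | (x[3] ^ y[3]);
  return t == 0; // setcc (or (xor ...), (xor ...)), 0, eq
}

int main() {
  unsigned char a[32] = {1, 2, 3}, b[32] = {1, 2, 3};
  assert(equal32(a, b) && std::memcmp(a, b, 32) == 0);
  b[31] = 9;
  assert(!equal32(a, b) && std::memcmp(a, b, 32) != 0);
  return 0;
}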
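Note on the isFNEG() switch above: the FSUB/XOR/FXOR arm only accepts a node as a negation when every non-undef constant element is a sign-bit mask (the EltBits[I].isSignMask() check). The underlying bit trick, sketched on a scalar f32 in plain C++ — illustrative only, fnegViaXor is not an LLVM function:

#include <cassert>
#include <cstdint>
#include <cstring>

// XOR with 0x80000000 flips exactly the IEEE-754 sign bit of a float,
// which is fneg; a vector XOR with a per-lane sign mask is therefore a
// vector negation, and that is what the combine detects.
static float fnegViaXor(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits ^= UINT32_C(0x80000000);
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  assert(fnegViaXor(1.5f) == -1.5f);
  assert(fnegViaXor(-0.25f) == 0.25f);
  return 0;
}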
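Note on the PreferKOT path above: when Subtarget.preferMaskRegisters() is true (Knights Landing/Knights Mill), the compare result is kept in a k-register and the whole mask is tested against zero (lowering to KORTEST) instead of going through PTEST or MOVMSK. Roughly the same idea at the intrinsics level — a sketch assuming an AVX-512F host and compilation with -mavx512f; this is not what the backend emits verbatim:

#include <immintrin.h>

// Compare 64 bytes as sixteen i32 lanes; an all-zero inequality mask
// means the buffers are equal. Testing the whole k-mask at once is the
// job KORTEST performs in the lowered code.
static bool equal64(const void *a, const void *b) {
  __m512i va = _mm512_loadu_si512(a);
  __m512i vb = _mm512_loadu_si512(b);
  __mmask16 ne = _mm512_cmpneq_epi32_mask(va, vb);
  return ne == 0;
}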