path: root/llvm/lib/Target/X86/X86ISelLowering.cpp
author    Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
commit    706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree      4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target/X86/X86ISelLowering.cpp
parent    7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Vendor import of llvm-project master e26a78e70, the last commit before
the llvmorg-11-init tag, from which release/10.x was branched.
Notes:
svn path=/vendor/llvm-project/master/; revision=356843
svn path=/vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085/; revision=356844; tag=vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  | 4018
1 file changed, 2603 insertions(+), 1415 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ed975e9248a8..0f152968ddfd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25,7 +25,9 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -154,17 +156,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
- if (Subtarget.isTargetDarwin()) {
- // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
- setUseUnderscoreSetJmp(false);
- setUseUnderscoreLongJmp(false);
- } else if (Subtarget.isTargetWindowsGNU()) {
- // MS runtime is weird: it exports _setjmp, but longjmp!
- setUseUnderscoreSetJmp(true);
- setUseUnderscoreLongJmp(false);
- } else {
- setUseUnderscoreSetJmp(true);
- setUseUnderscoreLongJmp(true);
+ if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
}
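[Editor's note] A hedged aside on the mechanism: clearing a libcall name tells the legalizer that no __powisf2/__powidf2 runtime routine exists, so FPOWI cannot be lowered to that call. A minimal standalone sketch (plain C++, not the LLVM lowering itself) of the fallback the comment names:

    #include <cmath>
    // Sketch only: MSVCRT ships pow but not the powi compiler-rt routines,
    // so x^n for integral n can still be routed through the pow libcall.
    double powi_via_pow(double x, int n) {
      return std::pow(x, static_cast<double>(n));
    }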
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
@@ -217,72 +212,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
- // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
- // operation.
- setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
-
if (!Subtarget.useSoftFloat()) {
- // We have an algorithm for SSE2->double, and we turn this into a
- // 64-bit FILD followed by conditional FADD for other targets.
- setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
- }
-
- // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
- // this operation.
- setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
- // SSE has no i16 to fp conversion, only i32.
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
- }
- } else {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
- }
-
- // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
- // this operation.
- setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
- setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
+ // SSE has no i16 to fp conversion, only i32. We promote in the handler
+ // to allow f80 to use i16 and f64 to use i16 with sse1 only
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
+ // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
-
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
- }
-
- // Handle FP_TO_UINT by promoting the destination to a larger signed
- // conversion.
- setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
- }
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+ }
+
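[Editor's note] The Promote entries above rely on a standard trick worth spelling out: a narrow unsigned value zero-extended into a wider signed type stays non-negative, so the signed conversion of the wider type produces exactly the unsigned result. A standalone sketch in plain C++ (not the SelectionDAG code):

    #include <cstdint>
    // u8/u16 -> f32 has no direct x86 instruction. Zero-extending into i32
    // keeps the value non-negative, so the *signed* convert (cvtsi2ss)
    // yields exactly the unsigned result -- which is what Promote arranges.
    float u16_to_float(uint16_t v) {
      int32_t widened = v;                 // zero-extend; range [0, 65535]
      return static_cast<float>(widened);  // signed convert == unsigned here
    }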
+ // Handle address space casts between mixed sized pointers.
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
@@ -409,12 +401,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
- // These should be promoted to a larger select which is supported.
- setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
@@ -619,6 +611,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
+ // Handle constrained floating-point operations of scalar.
+ setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
@@ -659,6 +665,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Expand);
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
+
+ // Handle constrained floating-point operations of scalar.
+ setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
+ // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
+ // as Custom.
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
}
// f128 uses xmm registers, but most operations require libcalls.
@@ -668,22 +685,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
- setOperationAction(ISD::FADD, MVT::f128, Custom);
- setOperationAction(ISD::FSUB, MVT::f128, Custom);
- setOperationAction(ISD::FDIV, MVT::f128, Custom);
- setOperationAction(ISD::FMUL, MVT::f128, Custom);
- setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::FMA, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
- setOperationAction(ISD::FSIN, MVT::f128, Expand);
- setOperationAction(ISD::FCOS, MVT::f128, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
- setOperationAction(ISD::FSQRT, MVT::f128, Expand);
-
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
+ // No STRICT_FSINCOS
+ setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
// So we have to list all legal floating point result types here.
@@ -820,12 +847,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -895,6 +925,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
@@ -933,37 +965,38 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
-
- // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into
- // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
- // split again based on the input type, this will cause an AssertSExt i16 to
- // be emitted instead of an AssertZExt. This will allow packssdw followed by
- // packuswb to be used to truncate to v8i8. This is necessary since packusdw
- // isn't available until sse4.1.
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+ for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
+ }
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
@@ -1008,6 +1041,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// With AVX512, expanding (and promoting the shifts) is better.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -1029,11 +1068,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
- setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
- setOperationAction(ISD::FCEIL, RoundedTy, Legal);
- setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
- setOperationAction(ISD::FRINT, RoundedTy, Legal);
- setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1072,6 +1116,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+
+ if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
+ // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
+ // do the pre and post work in the vector domain.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
+ // We need to mark SINT_TO_FP as Custom even though we want to expand it
+ // so that DAG combine doesn't try to turn it into uint_to_fp.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
@@ -1105,25 +1160,45 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
- setOperationAction(ISD::FNEG, VT, Custom);
- setOperationAction(ISD::FABS, VT, Custom);
- setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
-
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
@@ -1169,6 +1244,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
@@ -1180,8 +1257,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
- MVT::v2f64, MVT::v4f64 })
+ MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ }
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1233,6 +1312,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
@@ -1299,12 +1379,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
@@ -1331,6 +1417,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
@@ -1372,21 +1460,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
-
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom);
+ for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
+ setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
+ }
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
@@ -1420,11 +1524,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
}
@@ -1459,6 +1568,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
@@ -1470,8 +1581,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
@@ -1532,13 +1647,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
- // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
@@ -1563,12 +1690,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::SINT_TO_FP, VT, Legal);
- setOperationAction(ISD::UINT_TO_FP, VT, Legal);
- setOperationAction(ISD::FP_TO_SINT, VT, Legal);
- setOperationAction(ISD::FP_TO_UINT, VT, Legal);
-
- setOperationAction(ISD::MUL, VT, Legal);
+ setOperationAction(ISD::SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MUL, VT, Legal);
}
}
@@ -1739,12 +1877,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
- setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
@@ -1828,8 +1968,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
- {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
- ISD::FLOG10, ISD::FPOW, ISD::FSIN})
+ {ISD::FCEIL, ISD::STRICT_FCEIL,
+ ISD::FCOS, ISD::STRICT_FCOS,
+ ISD::FEXP, ISD::STRICT_FEXP,
+ ISD::FFLOOR, ISD::STRICT_FFLOOR,
+ ISD::FREM, ISD::STRICT_FREM,
+ ISD::FLOG, ISD::STRICT_FLOG,
+ ISD::FLOG10, ISD::STRICT_FLOG10,
+ ISD::FPOW, ISD::STRICT_FPOW,
+ ISD::FSIN, ISD::STRICT_FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
@@ -1870,6 +2017,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1901,6 +2050,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
+
+ // Default to having -disable-strictnode-mutation on
+ IsStrictFPEnabled = true;
}
// This has so far only been implemented for 64-bit MachO.
@@ -1910,7 +2062,7 @@ bool X86TargetLowering::useLoadStackGuardNode() const {
bool X86TargetLowering::useStackGuardXorFP() const {
// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
- return Subtarget.getTargetTriple().isOSMSVCRT();
+ return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
}
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
@@ -1946,9 +2098,13 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return MVT::i8;
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall)
+ return MVT::v32i1;
// FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
- Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
+ Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return MVT::v16i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -1966,9 +2122,13 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return VT.getVectorNumElements();
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall)
+ return 2;
// FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
- Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
+ Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
@@ -1988,6 +2148,15 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
return NumIntermediates;
}
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall) {
+ RegisterVT = MVT::v32i1;
+ IntermediateVT = MVT::v32i1;
+ NumIntermediates = 2;
+ return 2;
+ }
+
return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
NumIntermediates, RegisterVT);
}
@@ -2383,6 +2552,10 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
+ const TargetMachine &TM = getTargetMachine();
+ if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
+ return false;
+
return SrcAS < 256 && DestAS < 256;
}
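[Editor's note] A sketch of the combined rule after this change: both sides must use the same pointer width, and both must be "flat" address spaces (on x86, address spaces 256/257/258 select the gs/fs/ss segments and are never no-op castable). Standalone model, with pointer widths passed in rather than queried from the TargetMachine:

    bool isNoopCast(unsigned SrcAS, unsigned DstAS,
                    unsigned SrcPtrBits, unsigned DstPtrBits) {
      if (SrcPtrBits != DstPtrBits) // mixed-size pointers need a real ext/trunc
        return false;
      return SrcAS < 256 && DstAS < 256;
    }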
@@ -2520,18 +2693,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
- // If this is x86-64, and we disabled SSE, we can't return FP values,
- // or SSE or MMX vectors.
- if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
- VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ // Report an error if we have attempted to return a value via an XMM
+ // register and SSE was disabled.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
- } else if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ ValVT == MVT::f64) {
+ // When returning a double via an XMM register, report an error if SSE2 is
+ // not enabled.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
@@ -2826,7 +2997,6 @@ SDValue X86TargetLowering::LowerCallResult(
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
@@ -2845,15 +3015,22 @@ SDValue X86TargetLowering::LowerCallResult(
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
- // If this is x86-64, and we disabled SSE, we can't return FP values
- if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
- ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
+ // Report an error if there was an attempt to return FP values via XMM
+ // registers.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
- VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
- } else if (CopyVT == MVT::f64 &&
- (Is64Bit && !Subtarget.hasSSE2())) {
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ CopyVT == MVT::f64) {
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
- VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -2895,6 +3072,9 @@ SDValue X86TargetLowering::LowerCallResult(
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ Val = DAG.getBitcast(VA.getValVT(), Val);
+
InVals.push_back(Val);
}
@@ -2993,9 +3173,7 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
- auto Attr =
- CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
- if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+ if (!CI->isTailCall())
return false;
ImmutableCallSite CS(CI);
@@ -3464,8 +3642,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
- // Conservatively forward AL on x86_64, since it might be used for varargs.
- if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ // Forward AL for SysV x86_64 targets, since it is used for varargs.
+ if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
@@ -3618,7 +3796,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
@@ -3634,9 +3811,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
- if (Attr.getValueAsString() == "true")
- isTailCall = false;
-
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
@@ -3728,7 +3902,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"the only memory argument");
}
- if (!IsSibcall)
+ if (!IsSibcall && !IsMustTail)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
@@ -4013,7 +4187,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
- if (!IsSibcall && isTailCall) {
+ if (!IsSibcall && isTailCall && !IsMustTail) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
@@ -4183,23 +4357,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// Make the stack size aligned, e.g. to 16n + 12, for a 16-byte alignment
/// requirement.
unsigned
-X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
- SelectionDAG& DAG) const {
- const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlignment = TFI.getStackAlignment();
- uint64_t AlignMask = StackAlignment - 1;
- int64_t Offset = StackSize;
- unsigned SlotSize = RegInfo->getSlotSize();
- if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
- // Number smaller than 12 so just add the difference.
- Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
- } else {
- // Mask out lower bits, add stackalignment once plus the 12 bytes.
- Offset = ((~AlignMask) & Offset) + StackAlignment +
- (StackAlignment-SlotSize);
- }
- return Offset;
+X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
+ SelectionDAG &DAG) const {
+ const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
+ const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
+ assert(StackSize % SlotSize == 0 &&
+ "StackSize must be a multiple of SlotSize");
+ return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}
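[Editor's note] The rewritten helper deserves a worked example. Same arithmetic in standalone form: round StackSize + SlotSize up to the stack alignment, then take the slot back off, producing the smallest size >= StackSize of the form Align*n + (Align - SlotSize).

    #include <cstdint>
    uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                 uint64_t Align) {
      uint64_t Rounded = (StackSize + SlotSize + Align - 1) / Align * Align;
      return Rounded - SlotSize;
    }
    // e.g. alignedArgStackSize(32, 4, 16) == 44 (16*2 + 12), and
    //      alignedArgStackSize(44, 4, 16) == 44 (already in the right form).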
/// Return true if the given stack call argument is already available in the
@@ -4643,8 +4807,8 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
}
}
-/// Return true if the condition is an unsigned comparison operation.
-static bool isX86CCUnsigned(unsigned X86CC) {
+/// Return true if the condition is a signed comparison operation.
+static bool isX86CCSigned(unsigned X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
@@ -4654,12 +4818,12 @@ static bool isX86CCUnsigned(unsigned X86CC) {
case X86::COND_A:
case X86::COND_BE:
case X86::COND_AE:
- return true;
+ return false;
case X86::COND_G:
case X86::COND_GE:
case X86::COND_L:
case X86::COND_LE:
- return false;
+ return true;
}
}
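[Editor's note] A compact mirror of the renamed predicate's table, as a sketch: G/GE/L/LE come from signed compares, B/A/BE/AE from unsigned ones, and E/NE carry no signedness at all.

    enum CC { E, NE, B, A, BE, AE, G, GE, L, LE };
    bool isSignedCC(CC C) {
      switch (C) {
      case G: case GE: case L: case LE: return true;  // signed compares
      default:                          return false; // equality or unsigned
      }
    }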
@@ -4700,7 +4864,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
// X >= 0 -> X == 0, jump on !sign.
return X86::COND_NS;
}
- if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) {
+ if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
@@ -4949,12 +5113,6 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
-bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
- bool IsSigned) const {
- // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available.
- return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
-}
-
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
@@ -5334,15 +5492,18 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
+ bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
- SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
- if (TargetMask[i] == SM_SentinelUndef)
- continue;
- if (Zeroable[i])
- TargetMask[i] = SM_SentinelZero;
+ // Create an alternative mask with info about zeroable elements.
+ // Here we do not set undef elements as zeroable.
+ SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ if (V2IsZero) {
+ assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ for (int i = 0, Size = Mask.size(); i != Size; ++i)
+ if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ ZeroableMask[i] = SM_SentinelZero;
}
- return canWidenShuffleElements(TargetMask, WidenedMask);
+ return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
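[Editor's note] A small model of the folding loop in this new overload: when V2 is known all-zero, zeroable lanes become explicit zero sentinels before pair-widening is attempted. Sketch with the x86 sentinel values inlined:

    #include <vector>
    constexpr int SentinelUndef = -1, SentinelZero = -2;
    std::vector<int> foldZeroable(std::vector<int> Mask,
                                  const std::vector<bool> &Zeroable) {
      for (size_t i = 0; i < Mask.size(); ++i)
        if (Mask[i] != SentinelUndef && Zeroable[i])
          Mask[i] = SentinelZero; // matches the V2IsZero loop above
      return Mask;
    }
    // e.g. Mask {0,1,6,7} with Zeroable {0,0,1,1} -> {0,1,-2,-2}, which the
    // caller can then widen to the 2-element mask {0,-2}.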
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
@@ -5764,11 +5925,29 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- // Clear the upper bits of the subvector and move it to its insert position.
unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+
+ // Do an optimization for the most frequently used types.
+ if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
+ APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
+ Mask0.flipAllBits();
+ SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
+ SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
+ Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+
+ // Reduce to original width if needed.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ // Clear the upper bits of the subvector and move it to its insert position.
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
- unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
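[Editor's note] The fast path added above is easiest to see on plain integers, since k-registers are just bit vectors. A scalar model, with the KSHIFTL/KSHIFTR pair replaced by an explicit mask of the subvector:

    #include <cstdint>
    // Insert SubBits bits of SubVec into Vec at IdxVal:
    // (Vec & ~FieldMask) | (SubVec placed at IdxVal).
    uint64_t insertBits(uint64_t Vec, uint64_t SubVec, unsigned IdxVal,
                        unsigned SubBits) {
      uint64_t Low = SubBits >= 64 ? ~0ULL : ((1ULL << SubBits) - 1);
      uint64_t Field = Low << IdxVal; // bits [IdxVal, IdxVal + SubBits)
      return (Vec & ~Field) | ((SubVec & Low) << IdxVal);
    }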
@@ -5850,7 +6029,7 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
"Expected VTs to be the same size!");
unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
- std::max(128U, VT.getSizeInBits() / Scale));
+ std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
InVT = In.getValueType();
}
@@ -6719,9 +6898,97 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return true;
}
+/// Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static void computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ APInt &KnownUndef, APInt &KnownZero) {
+ int Size = Mask.size();
+ KnownUndef = KnownZero = APInt::getNullValue(Size);
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ int VectorSizeInBits = V1.getValueSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Size;
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+ for (int i = 0; i < Size; ++i) {
+ int M = Mask[i];
+ // Handle the easy cases.
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ continue;
+ }
+ if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ KnownZero.setBit(i);
+ continue;
+ }
+
+ // Determine shuffle input and normalize the mask.
+ SDValue V = M < Size ? V1 : V2;
+ M %= Size;
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+ // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
+ // the (larger) source element must be UNDEF/ZERO.
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef())
+ KnownUndef.setBit(i);
+ if (X86::isZeroNode(Op))
+ KnownZero.setBit(i);
+ else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = Cst->getAPIntValue();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt Val = Cst->getValueAPF().bitcastToAPInt();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ }
+ continue;
+ }
+
+ // If the BUILD_VECTOR has more elements, then all the (smaller) source
+ // elements must be UNDEF or ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllUndef = true;
+ bool AllZero = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllUndef &= Op.isUndef();
+ AllZero &= X86::isZeroNode(Op);
+ }
+ if (AllUndef)
+ KnownUndef.setBit(i);
+ if (AllZero)
+ KnownZero.setBit(i);
+ continue;
+ }
+ }
+}
+
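[Editor's note] A simplified standalone model of the easy cases in the routine above (the BUILD_VECTOR constant scans are omitted): undef mask entries and lanes that read a known all-zero input.

    #include <vector>
    void computeZeroable(const std::vector<int> &Mask, bool V1IsZero,
                         bool V2IsZero, std::vector<bool> &Undef,
                         std::vector<bool> &Zero) {
      int Size = (int)Mask.size();
      Undef.assign(Size, false);
      Zero.assign(Size, false);
      for (int i = 0; i < Size; ++i) {
        int M = Mask[i];
        if (M < 0) { Undef[i] = true; continue; } // undef mask entry
        if ((M < Size && V1IsZero) || (M >= Size && V2IsZero))
          Zero[i] = true;                         // reads an all-zero input
      }
    }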
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
+/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
APInt &KnownUndef, APInt &KnownZero) {
@@ -6741,7 +7008,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
- assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+ assert((VT.getSizeInBits() % Size) == 0 &&
"Illegal split of shuffle value type");
unsigned EltSizeInBits = VT.getSizeInBits() / Size;
@@ -6810,7 +7077,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
const APInt &KnownUndef,
- const APInt &KnownZero) {
+ const APInt &KnownZero,
+ bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
assert(KnownUndef.getBitWidth() == NumElts &&
KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6818,7 +7086,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
for (unsigned i = 0; i != NumElts; ++i) {
if (KnownUndef[i])
Mask[i] = SM_SentinelUndef;
- else if (KnownZero[i])
+ else if (ResolveKnownZeros && KnownZero[i])
Mask[i] = SM_SentinelZero;
}
}
@@ -8306,7 +8574,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -8552,7 +8820,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
} else {
- MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U));
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
DstVec = DAG.getBitcast(VecVT, Imm);
@@ -10130,13 +10398,18 @@ static bool isNoopShuffleMask(ArrayRef<int> Mask) {
return true;
}
-/// Test whether there are elements crossing 128-bit lanes in this
+/// Test whether there are elements crossing LaneSizeInBits lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
-static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
- int LaneSize = 128 / VT.getScalarSizeInBits();
+static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask) {
+ assert(LaneSizeInBits && ScalarSizeInBits &&
+ (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+ "Illegal shuffle lane size");
+ int LaneSize = LaneSizeInBits / ScalarSizeInBits;
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
@@ -10144,6 +10417,12 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return false;
}
+/// Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
+}
+
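// A minimal standalone check in the same shape as isLaneCrossingShuffleMask
// (hedged sketch; the v8i32-style mask below is a made-up example that
// reverses elements within each 128-bit lane, so it does not cross lanes).
#include <cstdio>

int main() {
  const int LaneSize = 128 / 32; // four 32-bit elements per 128-bit lane
  const int Size = 8;
  const int Mask[8] = {0, 1, 2, 3, 7, 6, 5, 4};
  bool Crossing = false;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      Crossing = true;
  std::printf("lane crossing: %s\n", Crossing ? "yes" : "no"); // "no"
  return 0;
}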
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
@@ -10424,84 +10703,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
-/// Compute whether each element of a shuffle is zeroable.
-///
-/// A "zeroable" vector shuffle element is one which can be lowered to zero.
-/// Either it is an undef element in the shuffle mask, the element of the input
-/// referenced is undef, or the element of the input referenced is known to be
-/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
-/// as many lanes with this technique as possible to simplify the remaining
-/// shuffle.
-static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- APInt Zeroable(Mask.size(), 0);
- V1 = peekThroughBitcasts(V1);
- V2 = peekThroughBitcasts(V2);
-
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
-
- int VectorSizeInBits = V1.getValueSizeInBits();
- int ScalarSizeInBits = VectorSizeInBits / Mask.size();
- assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
-
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- // Handle the easy cases.
- if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable.setBit(i);
- continue;
- }
-
- // Determine shuffle input and normalize the mask.
- SDValue V = M < Size ? V1 : V2;
- M %= Size;
-
- // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
- if (V.getOpcode() != ISD::BUILD_VECTOR)
- continue;
-
- // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
- // the (larger) source element must be UNDEF/ZERO.
- if ((Size % V.getNumOperands()) == 0) {
- int Scale = Size / V->getNumOperands();
- SDValue Op = V.getOperand(M / Scale);
- if (Op.isUndef() || X86::isZeroNode(Op))
- Zeroable.setBit(i);
- else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
- APInt Val = Cst->getAPIntValue();
- Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
- Val = Val.getLoBits(ScalarSizeInBits);
- if (Val == 0)
- Zeroable.setBit(i);
- } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
- APInt Val = Cst->getValueAPF().bitcastToAPInt();
- Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
- Val = Val.getLoBits(ScalarSizeInBits);
- if (Val == 0)
- Zeroable.setBit(i);
- }
- continue;
- }
-
- // If the BUILD_VECTOR has more elements then all the (smaller) source
- // elements must be UNDEF or ZERO.
- if ((V.getNumOperands() % Size) == 0) {
- int Scale = V->getNumOperands() / Size;
- bool AllZeroable = true;
- for (int j = 0; j < Scale; ++j) {
- SDValue Op = V.getOperand((M * Scale) + j);
- AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
- }
- if (AllZeroable)
- Zeroable.setBit(i);
- continue;
- }
- }
-
- return Zeroable;
-}
-
// The shuffle result is as follows:
// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
// Each Zeroable element corresponds to a particular Mask element.
@@ -10616,11 +10817,11 @@ static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
-static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &UnpackOpcode, bool IsUnary,
- ArrayRef<int> TargetMask,
- const SDLoc &DL, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
@@ -10728,8 +10929,8 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
-static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
- int Delta) {
+static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+ int Delta) {
int Size = (int)Mask.size();
int Split = Size / Delta;
int TruncatedVectorStart = SwappedOps ? Size : 0;
@@ -10814,8 +11015,8 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
// The first half/quarter of the mask should refer to every second/fourth
// element of the vector truncated and bitcasted.
- if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
- !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+ if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+ !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
return SDValue();
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
@@ -10823,11 +11024,10 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
-static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
- SDValue &V2, unsigned &PackOpcode,
- ArrayRef<int> TargetMask,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
+ unsigned &PackOpcode, ArrayRef<int> TargetMask,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
@@ -10880,8 +11080,8 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
- if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
- Subtarget))
+ if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget))
return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
DAG.getBitcast(PackVT, V2));
@@ -10972,10 +11172,10 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
-static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
- MutableArrayRef<int> Mask,
- const APInt &Zeroable, bool &ForceV1Zero,
- bool &ForceV2Zero, uint64_t &BlendMask) {
+static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> Mask,
+ const APInt &Zeroable, bool &ForceV1Zero,
+ bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
@@ -11038,8 +11238,8 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 64> Mask(Original.begin(), Original.end());
- if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
- BlendMask))
+ if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
+ BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
@@ -11161,7 +11361,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
@@ -11609,9 +11809,11 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
}
/// Try to lower a vector shuffle as a byte shift sequence.
-static SDValue lowerVectorShuffleAsByteShiftMask(
- const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(VT.is128BitVector() && "Only 128-bit vectors supported");
@@ -14056,8 +14258,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return BitBlend;
// Try to use byte shift instructions to mask.
- if (SDValue V = lowerVectorShuffleAsByteShiftMask(
- DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// Try to lower by permuting the inputs into an unpack instruction.
@@ -14318,8 +14520,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Try to use byte shift instructions to mask.
- if (SDValue V = lowerVectorShuffleAsByteShiftMask(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
@@ -14686,6 +14888,36 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
DAG);
}
+// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+// TODO: Extend to support v8f32 (+ 512-bit shuffles).
+static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
+
+ int LHSMask[4] = {-1, -1, -1, -1};
+ int RHSMask[4] = {-1, -1, -1, -1};
+ unsigned SHUFPMask = 0;
+
+ // As SHUFPD uses a single LHS/RHS element per lane, we can always
+ // perform the shuffle once the lanes have been shuffled in place.
+ for (int i = 0; i != 4; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ int LaneBase = i & ~1;
+ auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
+ LaneMask[LaneBase + (M & 1)] = M;
+ SHUFPMask |= (M & 1) << i;
+ }
+
+ SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
+ SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
+ DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
+}
+
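// Worked example of the immediate computation above (hedged sketch; the mask
// is a hypothetical v4f64 two-input shuffle). Result element i takes bit
// (M & 1) of its source lane element, which is the SHUFPD control encoding.
#include <cstdio>

int main() {
  const int Mask[4] = {0, 5, 2, 7};
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;
    int *LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (M & 1) << i;
  }
  std::printf("SHUFPD imm = 0x%X\n", SHUFPMask); // 0xA for this mask
  return 0;
}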
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
@@ -14764,13 +14996,22 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
int Size = Mask.size();
int LaneSize = Size / 2;
+ // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+ // Only do this if the elements aren't all from the lower lane,
+ // otherwise we're (probably) better off doing a split.
+ if (VT == MVT::v4f64 &&
+ !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
+ if (SDValue V =
+ lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
@@ -14778,7 +15019,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
- LaneUsed[(Mask[i] / LaneSize)] = true;
+ LaneUsed[(Mask[i] % Size) / LaneSize] = true;
if (!LaneUsed[0] || !LaneUsed[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
@@ -14817,8 +15058,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
+ bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
+
SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
+ if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
return SDValue();
bool IsLowZero = (Zeroable & 0x3) == 0x3;
@@ -15637,6 +15880,18 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Op;
+ // If we have lane crossing shuffles AND they don't all come from the lower
+ // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+ // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
+ // canonicalize to a blend of splat which isn't necessary for this combine.
+ if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
+ !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
+ (V1.getOpcode() != ISD::BUILD_VECTOR) &&
+ (V2.getOpcode() != ISD::BUILD_VECTOR))
+ if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
+ Mask, DAG))
+ return Op;
+
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
@@ -16950,6 +17205,10 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
+ // Fall back to scalarization. FIXME: We can do better if the shuffle
+ // can be partitioned cleanly.
+ if (!Subtarget.useBWIRegs())
+ return SDValue();
ExtVT = MVT::v64i8;
break;
}
@@ -17039,8 +17298,8 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
-static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
@@ -17086,29 +17345,22 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
+ APInt KnownUndef, KnownZero;
+ computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
+
+ APInt Zeroable = KnownUndef | KnownZero;
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
- // Create an alternative mask with info about zeroable elements.
- // Here we do not set undef elements as zeroable.
- SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
- if (V2IsZero) {
- assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
- for (int i = 0; i != NumElements; ++i)
- if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
- ZeroableMask[i] = SM_SentinelZero;
- }
-
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
- canWidenShuffleElements(ZeroableMask, WidenedMask)) {
+ canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
@@ -18307,7 +18559,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
@@ -18328,8 +18580,13 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Op.getOpcode() == ISD::SINT_TO_FP ||
- Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
- SDValue Src = Op.getOperand(0);
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
+ Op.getOpcode() == ISD::UINT_TO_FP) &&
+ "Unexpected opcode!");
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
@@ -18346,7 +18603,17 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
+ if (IsStrict) {
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
+ {Op.getOperand(0), InVec});
+ SDValue Chain = CvtVec.getValue(1);
+ SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Value, Chain}, dl);
+ }
+
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
}
@@ -18415,44 +18682,157 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
+static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
+
+ if (Subtarget.hasDQI()) {
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+
+ assert((Src.getSimpleValueType() == MVT::v2i64 ||
+ Src.getSimpleValueType() == MVT::v4i64) &&
+ "Unsupported custom type");
+
+ // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
+ assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
+ : DAG.getUNDEF(MVT::v8i64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
+ Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ if (VT != MVT::v4f32 || IsSigned)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
+ SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
+ SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
+ DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
+ DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
+ SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4);
+ SmallVector<SDValue, 4> Chains(4);
+ for (int i = 0; i != 4; ++i) {
+ SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+ DAG.getIntPtrConstant(i, DL));
+ if (IsStrict) {
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), Src});
+ Chains[i] = SignCvts[i].getValue(1);
+ } else {
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
+ }
+ }
+ SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
+
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
+ }
+
+ IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
+ SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cvt, Chain}, DL);
+
+ return Cvt;
+}
+
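// Hedged scalar model of the unsigned v4i64 -> v4f32 path above: when the
// sign bit is set, halve the value (keeping the low bit sticky so rounding
// is preserved), convert signed, then double the result with one add.
#include <cstdint>
#include <cstdio>

static float u64_to_f32(uint64_t V) {
  if ((int64_t)V >= 0)
    return (float)(int64_t)V;         // plain signed conversion suffices
  uint64_t Half = (V >> 1) | (V & 1); // Sign = (Src >> 1) | (Src & 1)
  float C = (float)(int64_t)Half;     // Half is now non-negative
  return C + C;                       // the "Slow" doubled result
}

int main() {
  uint64_t V = 0xFFFFFFFFFFFFFFFFull;
  std::printf("%f\n", u64_to_f32(V)); // ~18446744073709551616.0
  return 0;
}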
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
+ SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- if (VT == MVT::f128)
- return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
-
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+      // Note: Since v2f64 is a legal type, we don't need to zero extend the
+ // source for strict FP.
+ if (IsStrict)
+ return DAG.getNode(
+ X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(SrcVT))});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
+ if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
+
return SDValue();
}
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
+ bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
+
// These are really Legal; return the operand so the caller accepts it as
// Legal.
- if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
+ if (SrcVT == MVT::i32 && UseSSEReg)
return Op;
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit())
+ if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
return Op;
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
- SDValue ValueToStore = Op.getOperand(0);
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
- !Subtarget.is64Bit())
+ // SSE doesn't have an i16 conversion so we need to promote.
+ if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {Chain, Ext});
+
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
+ }
+
+ if (VT == MVT::f128)
+ return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+
+ SDValue ValueToStore = Src;
+ if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
@@ -18463,13 +18843,18 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Chain = DAG.getStore(
- DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+ Chain = DAG.getStore(
+ Chain, dl, ValueToStore, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
- return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+  std::pair<SDValue, SDValue> Tmp =
+      BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
}
-SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+std::pair<SDValue, SDValue>
+X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
@@ -18498,9 +18883,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue Result =
DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
Tys, FILDOps, SrcVT, LoadMMO);
+ Chain = Result.getValue(1);
if (useSSE) {
- Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// FIXME: Currently the FST is glued to the FILD_FLAG. This
@@ -18522,9 +18907,10 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
+ Chain = Result.getValue(1);
}
- return Result;
+ return { Result, Chain };
}
/// Horizontal vector math instructions may be slower than normal math with
@@ -18532,7 +18918,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool IsOptimizingSize = DAG.shouldOptForSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
@@ -18553,6 +18939,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
#endif
*/
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
@@ -18573,8 +18961,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
- SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
- Op.getOperand(0));
+ SDValue XR1 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
@@ -18587,51 +18975,81 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+ SDValue Sub;
+ SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ if (IsStrict) {
+ Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), XR2F, CLod1});
+ Chain = Sub.getValue(1);
+ } else
+ Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ if (!IsStrict && Subtarget.hasSSE3() &&
+ shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ // FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
- Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
+ if (IsStrict) {
+ Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
+ {Chain, Shuffle, Sub});
+ Chain = Result.getValue(1);
+ } else
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
+ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Result, Chain}, dl);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
- DAG.getIntPtrConstant(0, dl));
+ return Result;
}
/// 32-bit unsigned integer to float expansion.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
// Load the 32-bit value into an XMM register.
- SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- Op.getOperand(0));
+ SDValue Load =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
- Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
- DAG.getBitcast(MVT::v2f64, Load),
- DAG.getIntPtrConstant(0, dl));
-
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
- DAG.getBitcast(MVT::v2i64,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+ DAG.getBitcast(MVT::v2i64, Load),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
+ if (Op.getNode()->isStrictFPOpcode()) {
+ // Subtract the bias.
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Chain = Op.getOperand(0);
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
+ {Chain, Or, Bias});
+
+ if (Op.getValueType() == Sub.getValueType())
+ return Sub;
+
+ // Handle final rounding.
+ std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
+ Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
+
+ return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
+ }
+
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
@@ -18646,38 +19064,123 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
- SDValue N0 = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
- // Legalize to v4i32 type.
- N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
- DAG.getUNDEF(MVT::v2i32));
+ if (Subtarget.hasAVX512()) {
+ if (!Subtarget.hasVLX()) {
+ // Let generic type legalization widen this.
+ if (!IsStrict)
+ return SDValue();
+ // Otherwise pad the integer input with 0s and widen the operation.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getConstant(0, DL, MVT::v2i32));
+ SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), N0});
+ SDValue Chain = Res.getValue(1);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getMergeValues({Res, Chain}, DL);
+ }
- if (Subtarget.hasAVX512())
+ // Legalize to v4i32 type.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), N0});
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+ }
- // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
- // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
- SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
- SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
-
- // Two to the power of half-word-size.
- SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
-
- // Clear upper part of LO, lower HI.
- SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
- SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
-
- SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
- fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
- SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
+  // Zero extend to v2i64, OR with the floating point representation of 2^52.
+ // This gives us the floating point equivalent of 2^52 + the i32 integer
+ // since double has 52-bits of mantissa. Then subtract 2^52 in floating
+ // point leaving just our i32 integers in double format.
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
- // Add the two halves.
- return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
}
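// Scalar version of the 2^52 trick above (assumes IEEE-754 doubles): OR the
// 32-bit integer into the mantissa of 2^52, reinterpret the bits as a double,
// and subtract 2^52 so only the integer value remains. Exact for all uint32_t.
#include <cstdint>
#include <cstdio>
#include <cstring>

static double u32_to_f64(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ull | X; // 2^52 with X in the low mantissa
  double D;
  std::memcpy(&D, &Bits, sizeof(D)); // bit-for-bit reinterpretation
  return D - 4503599627370496.0;     // minus 2^52
}

int main() {
  std::printf("%.1f\n", u32_to_f64(0xFFFFFFFFu)); // 4294967295.0
  return 0;
}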
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue V = Op->getOperand(IsStrict ? 1 : 0);
+ MVT VecIntVT = V.getSimpleValueType();
+ assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+ "Unsupported custom type");
+
+ if (Subtarget.hasAVX512()) {
+ // With AVX512, but not VLX we need to widen to get a 512-bit result type.
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+ MVT VT = Op->getSimpleValueType(0);
+
+ // v8i32->v8f64 is legal with AVX512 so just return it.
+ if (VT == MVT::v8f64)
+ return Op;
+
+ assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp =
+ IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
+ V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), V});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
+ Op->getSimpleValueType(0) == MVT::v4f64) {
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
+ Constant *Bias = ConstantFP::get(
+ *DAG.getContext(),
+ APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
+ auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
+ SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
+ SDValue VBias = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ /*Alignment*/ 8, MachineMemOperand::MOLoad);
+
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
+ DAG.getBitcast(MVT::v4i64, VBias));
+ Or = DAG.getBitcast(MVT::v4f64, Or);
+
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
+ }
+
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
@@ -18690,18 +19193,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
- // We shouldn't use it when unsafe-fp-math is enabled though: we might later
- // reassociate the two FADDs, and if we do that, the algorithm fails
- // spectacularly (PR24512).
- // FIXME: If we ever have some kind of Machine FMF, this should be marked
- // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
- // there's also the MachineCombiner reassociations happening on Machine IR.
- if (DAG.getTarget().Options.UnsafeFPMath)
- return SDValue();
-
- SDLoc DL(Op);
- SDValue V = Op->getOperand(0);
- MVT VecIntVT = V.getSimpleValueType();
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something else than the supported type, e.g., to v4f64,
@@ -18709,9 +19200,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
- assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
- "Unsupported custom type");
-
// In the #idef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
@@ -18756,23 +19244,35 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
}
- // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
- SDValue VecCstFAdd = DAG.getConstantFP(
- APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
+ // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
+ SDValue VecCstFSub = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // NOTE: By using fsub of a positive constant instead of fadd of a negative
+ // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+ // enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
- SDValue FHigh =
- DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
- // return (float4) lo + fhi;
+ // (float4) lo;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
+ // return (float4) lo + fhi;
+ if (IsStrict) {
+ SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
+ {Op.getOperand(0), HighBitcast, VecCstFSub});
+ return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
+ {FHigh.getValue(1), LowBitcast, FHigh});
+ }
+
+ SDValue FHigh =
+ DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
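// Scalar model of the lo/hi split above (assumes IEEE-754 floats): the low
// 16 bits ride in the mantissa of 2^23, the high 16 bits in the mantissa of
// 2^39, and one fsub of the positive constant (2^39 + 2^23) cancels both
// biases, matching the PR24512 note above.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float FromBits(uint32_t B) {
  float F;
  std::memcpy(&F, &B, sizeof(F));
  return F;
}

static float u32_to_f32(uint32_t V) {
  float Lo = FromBits(0x4B000000u | (V & 0xFFFFu)); // 2^23 + lo16
  float Hi = FromBits(0x53000000u | (V >> 16));     // 2^39 + hi16 * 2^16
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);         // hi16 * 2^16 - 2^23
  return Lo + FHi;                                  // rounds once at the end
}

int main() {
  std::printf("%.1f\n", u32_to_f32(0x80000001u)); // 2147483648.0 after rounding
  return 0;
}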
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDValue N0 = Op.getOperand(0);
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
+ SDValue N0 = Op.getOperand(OpNo);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
@@ -18783,18 +19283,23 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
- assert(!Subtarget.hasAVX512());
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
+ case MVT::v2i64:
+ case MVT::v4i64:
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
}
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue N0 = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- MVT SrcVT = N0.getSimpleValueType();
- MVT DstVT = Op.getSimpleValueType();
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstVT = Op->getSimpleValueType(0);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
@@ -18814,8 +19319,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Promote i32 to i64 and use a signed conversion on 64-bit targets.
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
- N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0);
- return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0);
+ Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
+ {Chain, Src});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
}
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
@@ -18823,7 +19331,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
- if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
@@ -18832,23 +19340,28 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
- SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot, MachinePointerInfo());
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MachinePointerInfo());
- SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
- return Fild;
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
- SDValue ValueToStore = Op.getOperand(0);
- if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
+ SDValue ValueToStore = Src;
+ if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
- MachinePointerInfo());
+ }
+ SDValue Store =
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
@@ -18863,32 +19376,42 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
+ Chain = Fild.getValue(1);
- APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
- Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+ Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
- // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
+ APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
- ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
+ ConstantInt::get(*DAG.getContext(), FF), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
- // FIXME: Avoid the extend by constructing the right constant pool?
SDValue Fudge = DAG.getExtLoad(
- ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
+ ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
/* Alignment = */ 4);
+ Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
+ if (IsStrict) {
+ SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
+ {Chain, Fild, Fudge});
+ // STRICT_FP_ROUND can't handle equal types.
+ if (DstVT == MVT::f80)
+ return Add;
+ return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
+ {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
+ }
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
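// Hedged scalar model of the fudge-factor path above: FILD converts the i64
// as signed, and when the sign bit was set the extra f32 constant 0x5F800000
// (exactly 2^64) is added to recover the unsigned value.
#include <cstdint>
#include <cstdio>

static long double u64_to_f80(uint64_t V) {
  long double R = (long double)(int64_t)V; // signed FILD-style conversion
  if ((int64_t)V < 0)
    R += 18446744073709551616.0L;          // + 2^64 fudge
  return R;
}

int main() {
  std::printf("%.1Lf\n", u64_to_f80(0x8000000000000000ull)); // 2^63
  return 0;
}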
@@ -18902,11 +19425,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// result.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned) const {
+ bool IsSigned, SDValue &Chain) const {
+ bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
- EVT TheVT = Op.getOperand(0).getValueType();
+ SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
+ EVT TheVT = Value.getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
@@ -18920,6 +19445,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
@@ -18938,8 +19465,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Chain = DAG.getEntryNode();
- SDValue Value = Op.getOperand(0);
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
@@ -18949,8 +19476,9 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
- // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
- // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+ // Adjust = (Value < Thresh) ? 0 : 0x80000000;
+    // FltOfs = (Value < Thresh) ? 0 : Thresh;
+ // FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
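// Minimal scalar sketch of the fixup described above: values at or above
// 2^63 are offset into signed range before the FIST-style convert, and the
// 2^63 is re-applied by XOR'ing the sign bit into the integer result.
#include <cstdint>
#include <cstdio>

static uint64_t f64_to_u64(double Value) {
  const double Thresh = 9223372036854775808.0; // 2^63
  uint64_t Adjust = (Value < Thresh) ? 0 : 0x8000000000000000ull;
  double FltOfs = (Value < Thresh) ? 0.0 : Thresh;
  int64_t Fist = (int64_t)(Value - FltOfs); // signed conversion
  return (uint64_t)Fist ^ Adjust;           // re-apply the high bit
}

int main() {
  std::printf("%llu\n", (unsigned long long)f64_to_u64(1.0e19));
  return 0;
}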
@@ -18975,19 +19503,31 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
- SDValue Cmp = DAG.getSetCC(DL,
- getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT),
- Value, ThreshVal, ISD::SETLT);
+ EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT);
+ SDValue Cmp;
+ if (IsStrict) {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
+ Chain, /*IsSignaling*/ true);
+ Chain = Cmp.getValue(1);
+ } else {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
+ }
+
Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
DAG.getConstant(0, DL, MVT::i64),
DAG.getConstant(APInt::getSignMask(64),
DL, MVT::i64));
- SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
- Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT),
- Value, ThreshVal, ISD::SETLT);
- Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
+ DAG.getConstantFP(0.0, DL, TheVT),
+ ThreshVal);
+
+ if (IsStrict) {
+ Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
+ { Chain, Value, FltOfs });
+ Chain = Value.getValue(1);
+ } else
+ Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
}
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
@@ -19017,6 +19557,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Ops, DstTy, MMO);
SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
+ Chain = Res.getValue(1);
// If we need an unsigned fixup, XOR the result with adjust.
if (UnsignedFixup)
@@ -19036,7 +19577,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
"Unexpected extension opcode");
- assert(VT.getVectorNumElements() == VT.getVectorNumElements() &&
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
@@ -19512,48 +20053,137 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
- bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
- MVT VT = Op.getSimpleValueType();
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
- if (SrcVT == MVT::f128) {
- RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(SrcVT, VT);
- else
- LC = RTLIB::getFPTOUINT(SrcVT, VT);
-
- MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first;
- }
-
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
if (!IsSigned && !Subtarget.hasVLX()) {
+ assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
TruncVT = MVT::v8i1;
- Opc = ISD::FP_TO_UINT;
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
- DAG.getUNDEF(MVT::v8f64),
- Src, DAG.getIntPtrConstant(0, dl));
+ Opc = Op.getOpcode();
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
+ : DAG.getUNDEF(MVT::v8f64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res =
+ DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, ResVT, Src);
}
- SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
+
Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
- DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
+ if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && "Requires avx512f");
+ return Op;
+ }
+
+ // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
+ if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
+ (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
+ "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
+ if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
+ assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
+ !Subtarget.hasVLX() && "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
}
- assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
- return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
- DAG.getUNDEF(MVT::v2f32)));
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
+ : X86ISD::STRICT_CVTTP2UI;
+ return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
+ }
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ return DAG.getNode(Opc, dl, VT, Tmp);
}
return SDValue();
@@ -19575,9 +20205,21 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
assert(VT == MVT::i32 && "Unexpected VT!");
// Promote i32 to i64 and use a signed operation on 64-bit targets.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
if (Subtarget.is64Bit()) {
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
}
// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
@@ -19586,28 +20228,65 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
- // Promote i16 to i32 if we can use a SSE operation.
- if (VT == MVT::i16 && UseSSEReg) {
+ // Promote i16 to i32 if we can use a SSE operation or the type is f128.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i16. PR44019
+ if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
}
- // If this is a SINT_TO_FP using SSEReg we're done.
+  // If this is an FP_TO_SINT using SSEReg we're done.
if (UseSSEReg && IsSigned)
return Op;
+ // fp128 needs to use a libcall.
+ if (SrcVT == MVT::f128) {
+ RTLIB::Libcall LC;
+ if (IsSigned)
+ LC = RTLIB::getFPTOSINT(SrcVT, VT);
+ else
+ LC = RTLIB::getFPTOUINT(SrcVT, VT);
+
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
+ SDLoc(Op), Chain);
+
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
+ }
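For illustration, a source-level sketch of what these conversions become, assuming a GCC/Clang-style __float128 extension on x86; the libcall names are the conventional compiler-rt/libgcc ones (e.g. __fixtfsi for signed f128 to i32):

#include <cstdint>
#include <cstdio>

int main() {
  __float128 Q = 123.5;   // x86 extension type backing f128
  int32_t S = (int32_t)Q;   // lowers to a __fixtfsi libcall
  uint64_t U = (uint64_t)Q; // lowers to a __fixunstfdi libcall
  printf("%d %llu\n", S, (unsigned long long)U);
  return 0;
}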
+
// Fall back to X87.
- if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
+ if (IsStrict)
+ return DAG.getMergeValues({V, Chain}, dl);
return V;
+ }
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
- SDValue In = Op.getOperand(0);
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
if (VT == MVT::f128) {
@@ -19617,14 +20296,19 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
- return DAG.getNode(X86ISD::VFPEXT, DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
- In, DAG.getUNDEF(SVT)));
+ SDValue Res =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
+ {Op->getOperand(0), Res});
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
}
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+
MVT VT = Op.getSimpleValueType();
- SDValue In = Op.getOperand(0);
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
// It's legal except when f128 is involved
@@ -19636,17 +20320,17 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
+
+ SDLoc dl(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first;
-}
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
+ dl, Chain);
-// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking
-// the default expansion of STRICT_FP_ROUND.
-static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
- // FIXME: Need to form a libcall with an input chain for f128.
- assert(Op.getOperand(0).getValueType() != MVT::f128 &&
- "Don't know how to handle f128 yet!");
- return Op;
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
}
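A source-level sketch of the rounding libcall, under the same __float128 assumption; f128 to f64 conventionally lowers to __trunctfdf2 (and f128 to f32 to __trunctfsf2):

#include <cstdio>

int main() {
  __float128 Q = 1.0;
  Q /= 3.0;              // keep low bits that must be rounded away
  double D = (double)Q;  // FP_ROUND f128 -> f64: a __trunctfdf2 libcall
  printf("%.17g\n", D);
  return 0;
}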
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -19724,12 +20408,6 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::f128) {
- RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? RTLIB::ADD_F128
- : RTLIB::SUB_F128;
- return LowerF128Call(Op, DAG, LC);
- }
-
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
@@ -20013,6 +20691,19 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
+// Transform to an x86-specific ALU node with flags if there is a chance of
+// using an RMW op, or if only the flags are used. Otherwise, leave
+// the node alone and emit a 'cmp' or 'test' instruction.
+static bool isProfitableToUseFlagOp(SDValue Op) {
+ for (SDNode *U : Op->uses())
+ if (U->getOpcode() != ISD::CopyToReg &&
+ U->getOpcode() != ISD::SETCC &&
+ U->getOpcode() != ISD::STORE)
+ return false;
+
+ return true;
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -20076,15 +20767,8 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
- // Transform to an x86-specific ALU node with flags if there is a chance of
- // using an RMW op or only the flags are used. Otherwise, leave
- // the node alone and emit a 'test' instruction.
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI)
- if (UI->getOpcode() != ISD::CopyToReg &&
- UI->getOpcode() != ISD::SETCC &&
- UI->getOpcode() != ISD::STORE)
- goto default_case;
+ if (!isProfitableToUseFlagOp(Op))
+ break;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
@@ -20112,7 +20796,6 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
Op->getOperand(1)).getValue(1);
}
default:
- default_case:
break;
}
@@ -20131,15 +20814,26 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- const SDLoc &dl, SelectionDAG &DAG) const {
+static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
+ unsigned X86CC, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue Chain, bool IsSignaling) {
if (isNullConstant(Op1))
- return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
+ return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain);
EVT CmpVT = Op0.getValueType();
- if (CmpVT.isFloatingPoint())
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+ if (CmpVT.isFloatingPoint()) {
+ if (Chain) {
+ SDValue Res =
+ DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
+ dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
+ return std::make_pair(Res, Res.getValue(1));
+ }
+ return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1),
+ SDValue());
+ }
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
@@ -20154,7 +20848,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
- isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncate from something with enough sign bits.
@@ -20178,10 +20872,22 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
}
+
+ // Try to shrink i64 compares if the input has enough zero bits.
+  // FIXME: Do this for non-constant compares, or when the constant is on the LHS?
+ if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
+ Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
+ cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
+ DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
+ }
+
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return Sub.getValue(1);
+ return std::make_pair(Sub.getValue(1), SDValue());
}
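The shrink is sound because, when the high 32 bits of the left side are known zero, the constant fits in 32 bits, and the condition is unsigned (or equality), the 64-bit and 32-bit compares agree. A minimal self-check:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Op0 = 0x00000000DEADBEEFULL; // MaskedValueIsZero on the top 32 bits
  uint64_t Op1 = 1234;                  // constant with getActiveBits() <= 32
  bool Wide = Op0 < Op1;                       // 64-bit unsigned compare
  bool Narrow = (uint32_t)Op0 < (uint32_t)Op1; // truncated 32-bit compare
  assert(Wide == Narrow);
  return 0;
}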
/// Convert a comparison if required by the subtarget.
@@ -20189,16 +20895,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
- if (Subtarget.hasCMov() ||
- Cmp.getOpcode() != X86ISD::CMP ||
- !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
- !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ bool IsCmp = Cmp.getOpcode() == X86ISD::CMP;
+ bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP ||
+ Cmp.getOpcode() == X86ISD::STRICT_FCMPS;
+
+ if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) ||
+ !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint())
return Cmp;
// The instruction selector will select an FUCOM instruction instead of
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
- // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8))))
SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
@@ -20399,7 +21108,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immediate won't fit in a byte.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
@@ -20442,7 +21151,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
- SDValue &Op1) {
+ SDValue &Op1, bool &IsAlwaysSignaling) {
unsigned SSECC;
bool Swap = false;
@@ -20481,6 +21190,22 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
if (Swap)
std::swap(Op0, Op1);
+ switch (SetCCOpcode) {
+ default:
+ IsAlwaysSignaling = true;
+ break;
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ case ISD::SETNE:
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETO:
+ case ISD::SETUO:
+ IsAlwaysSignaling = false;
+ break;
+ }
+
return SSECC;
}
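The split matches IEEE-754: equality- and unordered-class predicates (EQ, NE, ORD, UNO and their variants) are quiet on quiet NaNs, while ordering predicates always signal. A sketch that probes this with <cfenv>; whether the compiler preserves the exception semantics here (FENV_ACCESS) is target- and flag-dependent:

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  double QNaN = std::nan("");
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile bool Eq = (QNaN == 1.0);                 // quiet predicate
  bool EqRaised = std::fetestexcept(FE_INVALID) != 0;
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile bool Lt = (QNaN < 1.0);                  // always-signaling predicate
  bool LtRaised = std::fetestexcept(FE_INVALID) != 0;
  printf("eq=%d raised=%d  lt=%d raised=%d\n", (int)Eq, (int)EqRaised,
         (int)Lt, (int)LtRaised);                   // expect raised: 0 then 1
  return 0;
}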
@@ -20625,12 +21350,14 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- SDValue CC = Op.getOperand(2);
- MVT VT = Op.getSimpleValueType();
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
+ SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
+ MVT VT = Op->getSimpleValueType(0);
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
- bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
+ bool isFP = Op1.getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
@@ -20639,57 +21366,119 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
- Opc = X86ISD::CMPM;
+ Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
- Opc = X86ISD::CMPP;
+ Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
- // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
- // emit two comparisons and a logic op to tie them together.
SDValue Cmp;
- unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
- if (SSECC >= 8 && !Subtarget.hasAVX()) {
- // LLVM predicate is SETUEQ or SETONE.
- unsigned CC0, CC1;
- unsigned CombineOpc;
- if (Cond == ISD::SETUEQ) {
- CC0 = 3; // UNORD
- CC1 = 0; // EQ
- CombineOpc = X86ISD::FOR;
+ bool IsAlwaysSignaling;
+ unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
+ if (!Subtarget.hasAVX()) {
+ // TODO: We could use following steps to handle a quiet compare with
+ // signaling encodings.
+ // 1. Get ordered masks from a quiet ISD::SETO
+ // 2. Use the masks to mask potential unordered elements in operand A, B
+ // 3. Get the compare results of masked A, B
+      // 4. Calculate the final result using the mask and the result from 3
+ // But currently, we just fall back to scalar operations.
+ if (IsStrict && IsAlwaysSignaling && !IsSignaling)
+ return SDValue();
+
+ // Insert an extra signaling instruction to raise exception.
+ if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
+ SDValue SignalCmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
+ // FIXME: It seems we need to update the flags of all new strict nodes.
+        // Otherwise, mayRaiseFPException in MI will return false because
+        // NoFPExcept is false by default. However, other patches do not
+        // appear to do this.
+ SignalCmp->setFlags(Op->getFlags());
+ Chain = SignalCmp.getValue(1);
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ if (SSECC >= 8) {
+ // LLVM predicate is SETUEQ or SETONE.
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
+ if (Cond == ISD::SETUEQ) {
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = X86ISD::FOR;
+ } else {
+ assert(Cond == ISD::SETONE);
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = X86ISD::FAND;
+ }
+
+ SDValue Cmp0, Cmp1;
+ if (IsStrict) {
+ Cmp0 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
+ Cmp1 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
+ Cmp1.getValue(1));
+ } else {
+ Cmp0 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
+ Cmp1 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
+ }
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
- assert(Cond == ISD::SETONE);
- CC0 = 7; // ORD
- CC1 = 4; // NEQ
- CombineOpc = X86ISD::FAND;
+ if (IsStrict) {
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
-
- SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(CC0, dl, MVT::i8));
- SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(CC1, dl, MVT::i8));
- Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
- Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(SSECC, dl, MVT::i8));
+ if (IsStrict) {
+ // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
+ SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
- if (Opc == X86ISD::CMPP)
- Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cmp, Chain}, dl);
return Cmp;
}
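Bit 4 of the 5-bit AVX compare immediate selects the opposite quiet/signaling variant of the base predicate, so the XOR above flips it only when the request disagrees with the default. A sketch using the standard VCMP encodings (EQ_OQ = 0x00, LT_OS = 0x01, EQ_OS = 0x10, LT_OQ = 0x11); the helper name is illustrative:

// Mirror of the immediate computation; values follow the VCMP encoding table.
unsigned encodeAVXCmpImm(unsigned SSECC, bool IsAlwaysSignaling,
                         bool IsSignaling) {
  return SSECC | ((IsAlwaysSignaling ^ IsSignaling) << 4);
}
// encodeAVXCmpImm(0x00, false, true) == 0x10 (quiet EQ asked to signal)
// encodeAVXCmpImm(0x01, true,  true) == 0x01 (LT already signals; no flip)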
+ assert(!IsStrict && "Strict SETCC only handles FP operands.");
+
MVT VTOp0 = Op0.getSimpleValueType();
(void)VTOp0;
assert(VTOp0 == Op1.getSimpleValueType() &&
@@ -20860,6 +21649,30 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
+ // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
+ // the odd elements over the even elements.
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
+ Op0 = DAG.getConstant(0, dl, MVT::v4i32);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
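The shortcut works because only the high 32-bit half of each 64-bit lane carries the sign. A little-endian scalar model of the v4i32 compare plus the <1,1,3,3> shuffle:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  int64_t X[2] = {-5, 7};
  int32_t L[4];
  std::memcpy(L, X, sizeof(L)); // little-endian: L[1], L[3] are the high halves
  int32_t GT[4];
  for (int I = 0; I != 4; ++I)
    GT[I] = (0 > L[I]) ? -1 : 0;                  // PCMPGTD against zero
  int32_t Shuf[4] = {GT[1], GT[1], GT[3], GT[3]}; // MaskHi = { 1, 1, 3, 3 }
  int64_t Res[2];
  std::memcpy(Res, Shuf, sizeof(Res));
  printf("%lld %lld\n", (long long)Res[0], (long long)Res[1]); // -1 0
  return 0;
}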
+
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
@@ -20999,8 +21812,9 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG,
- SDValue &X86CC) const {
+ SelectionDAG &DAG, SDValue &X86CC,
+ SDValue &Chain,
+ bool IsSignaling) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
@@ -21043,12 +21857,32 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
}
}
+  // Try to use the carry flag from the add in place of a separate CMP for:
+ // (seteq (add X, -1), -1). Similar for setne.
+ if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
+ Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isProfitableToUseFlagOp(Op0)) {
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+
+ SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
+ Op0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
+ X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
+ return SDValue(New.getNode(), 1);
+ }
+ }
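The identity behind this: adding -1 to X carries out exactly when X != 0, so seteq(add(X, -1), -1), i.e. X == 0, is the carry-clear condition (COND_AE), and setne is the carry-set one (COND_B). A quick scalar check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 42u, 0xFFFFFFFFu}) {
    uint64_t Sum = (uint64_t)X + 0xFFFFFFFFu;      // ADD X, -1 (32-bit)
    bool Carry = Sum > 0xFFFFFFFFu;                // CF after the add
    bool EqAllOnes = (uint32_t)Sum == 0xFFFFFFFFu; // seteq(add(X, -1), -1)
    assert(EqAllOnes == !Carry);                   // SETEQ <=> COND_AE
  }
  return 0;
}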
+
bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
return SDValue();
- SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
+ std::pair<SDValue, SDValue> Tmp =
+ EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling);
+ SDValue EFLAGS = Tmp.first;
+ if (Chain)
+ Chain = Tmp.second;
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
@@ -21056,35 +21890,48 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getSimpleValueType();
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ MVT VT = Op->getSimpleValueType(0);
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDLoc dl(Op);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1);
+ softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
+ Op.getOpcode() == ISD::STRICT_FSETCCS);
// If softenSetCCOperands returned a scalar, use it.
if (!Op1.getNode()) {
assert(Op0.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
+ if (IsStrict)
+ return DAG.getMergeValues({Op0, Chain}, dl);
return Op0;
}
}
SDValue X86CC;
- SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
+ SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain,
+ Op.getOpcode() == ISD::STRICT_FSETCCS);
if (!EFLAGS)
return SDValue();
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
@@ -21215,8 +22062,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
- unsigned SSECC = translateX86FSETCC(
- cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+ bool IsAlwaysSignaling;
+ unsigned SSECC =
+ translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
+ CondOp0, CondOp1, IsAlwaysSignaling);
if (Subtarget.hasAVX512()) {
SDValue Cmp =
@@ -21454,8 +22303,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (AddTest) {
CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
- Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
- X86::COND_NE, DL, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
@@ -21711,7 +22559,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
- assert(VT.getVectorNumElements() == VT.getVectorNumElements() &&
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
@@ -21765,12 +22613,14 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
"Expecting 256/512-bit op");
// Splitting volatile memory ops is not allowed unless the operation was not
- // legal to begin with. We are assuming the input op is legal (this transform
- // is only used for targets with AVX).
+ // legal to begin with. Assume the input store is legal (this transform is
+ // only used for targets with AVX). Note: It is possible that we have an
+ // illegal type like v2i128, and so we could allow splitting a volatile store
+ // in that case if that is important.
if (!Store->isSimple())
return SDValue();
- MVT StoreVT = StoredVal.getSimpleValueType();
+ EVT StoreVT = StoredVal.getValueType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
@@ -22174,8 +23024,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
- X86Cond, dl, DAG);
+ Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -22201,7 +23050,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = Op.getConstantOperandVal(2);
+ MaybeAlign Alignment(Op.getConstantOperandVal(2));
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
@@ -22221,11 +23070,12 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlign = TFI.getStackAlignment();
+ const Align StackAlign(TFI.getStackAlignment());
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
- if (Align > StackAlign)
- Result = DAG.getNode(ISD::AND, dl, VT, Result,
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ if (Alignment && Alignment > StackAlign)
+ Result =
+ DAG.getNode(ISD::AND, dl, VT, Result,
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -22256,9 +23106,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
- if (Align) {
+ if (Alignment) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
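Both branches round the stack pointer down with the usual power-of-two mask; ~(Align - 1) clears the low bits. A sketch with hypothetical values:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t SP = 0x7ffffffd123;    // hypothetical post-SUB stack pointer
  uint64_t AlignVal = 64;         // requested over-alignment (power of two)
  uint64_t Aligned = SP & ~(AlignVal - 1ULL);     // the AND the lowering emits
  printf("%#llx\n", (unsigned long long)Aligned); // 0x7ffffffd100
  return 0;
}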
@@ -22777,6 +23627,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
@@ -22794,7 +23645,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1));
}
case INTR_TYPE_1OP_SAE: {
SDValue Sae = Op.getOperand(2);
@@ -22866,7 +23718,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
- Src1, Src2, Src3);
+ {Src1, Src2, Src3});
}
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
@@ -22890,8 +23742,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
- Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(
+ DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
@@ -22907,8 +23760,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
return SDValue();
- return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
- Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
@@ -23114,8 +23967,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
      // Default rounding mode
- return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC);
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ {Op.getOperand(1), Op.getOperand(2), CC});
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -23315,8 +24168,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
- Mask);
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
+ {Src, PassThru, Mask});
}
case CVTPS2PH_MASK: {
SDValue Src = Op.getOperand(1);
@@ -23622,9 +24475,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue ShAmt = Op.getOperand(2);
// If the argument is a constant, convert it to a target constant.
if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
- ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+      // Clamp out-of-bounds shift amounts, since they will otherwise be masked
+      // to 8 bits, which may make them no longer out of bounds.
+ unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1), ShAmt);
+ Op.getOperand(0), Op.getOperand(1),
+ DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
}
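Why the clamp matters: masking a shift amount to 8 bits can wrap an out-of-bounds value back in bounds (256 & 0xFF == 0), changing the semantics, whereas getLimitedValue(255) saturates. In miniature:

#include <cstdio>

int main() {
  unsigned ShAmt = 256;                         // out of bounds for any element
  unsigned Masked = ShAmt & 0xFF;               // 0: wrongly back in bounds
  unsigned Clamped = ShAmt > 255 ? 255 : ShAmt; // getLimitedValue(255)
  printf("masked=%u clamped=%u\n", Masked, Clamped); // masked=0 clamped=255
  return 0;
}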
unsigned NewIntrinsic;
@@ -23977,7 +24833,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
- return SDValue();
+ return Op;
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
@@ -24152,9 +25008,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ SDValue Offset = DAG.getUNDEF(VMask.getValueType());
- return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
- MemIntr->getMemOperand(), true /* truncating */);
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
+ MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
+ true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
@@ -24249,7 +25107,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
@@ -24538,12 +25396,13 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlignment = TFI.getStackAlignment();
+ const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
- int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
@@ -27464,12 +28323,11 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
- SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
- N->getBasePtr(), Mask,
- getZeroVector(VT, Subtarget, DAG, dl),
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
+ N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
PassThru);
@@ -27503,11 +28361,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
- N->getBasePtr(), Mask, PassThru,
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
@@ -27553,7 +28410,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
- Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->getOffset(), Mask, N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(),
N->isTruncatingStore(), N->isCompressingStore());
}
@@ -27607,29 +28465,31 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
-SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
- SelectionDAG &DAG) const {
- // TODO: Eventually, the lowering of these nodes should be informed by or
- // deferred to the GC strategy for the function in which they appear. For
- // now, however, they must be lowered to something. Since they are logically
- // no-ops in the case of a null GC strategy (or a GC strategy which does not
- // require special handling for these nodes), lower them as literal NOOPs for
- // the time being.
- SmallVector<SDValue, 2> Ops;
+static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
+ MVT DstVT = Op.getSimpleValueType();
- Ops.push_back(Op.getOperand(0));
- if (Op->getGluedNode())
- Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+ AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
+ unsigned SrcAS = N->getSrcAddressSpace();
- SDLoc OpDL(Op);
- SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+ assert(SrcAS != N->getDestAddressSpace() &&
+ "addrspacecast must be between different address spaces");
- return NOOP;
+ if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i32) {
+ Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
+ } else {
+ report_fatal_error("Bad address space in addrspacecast");
+ }
+ return Op;
}
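The extension choice is the point of the ptr32 address spaces: an unsigned 32-bit pointer (PTR32_UPTR) zero-extends to 64 bits, while the default flavor sign-extends. A scalar model with a made-up pointer value:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t P = 0x80001000u;             // 32-bit pointer with the top bit set
  uint64_t UPtr = (uint64_t)P;          // PTR32_UPTR: ZERO_EXTEND
  uint64_t SPtr = (uint64_t)(int32_t)P; // sign-extended flavor: SIGN_EXTEND
  printf("%#llx %#llx\n", (unsigned long long)UPtr,
         (unsigned long long)SPtr);     // 0x80001000 vs 0xffffffff80001000
  return 0;
}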
-SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
+ SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
@@ -27651,9 +28511,21 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
+
+ SDLoc dl(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
+ CallOptions, dl, Chain);
+
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
}
/// Provide custom lowering hooks for some operations.
@@ -27673,7 +28545,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
- case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
@@ -27690,7 +28562,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
+ case ISD::STRICT_SINT_TO_FP:
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::STRICT_UINT_TO_FP:
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
@@ -27700,21 +28574,24 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
- case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
- case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG);
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
- case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
- case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
- case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
@@ -27778,8 +28655,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
- return LowerGC_TRANSITION_START(Op, DAG);
- case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
+ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
+ case ISD::ADDRSPACECAST:
+ return LowerADDRSPACECAST(Op, DAG);
}
}
@@ -27865,8 +28743,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
- // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
- // X86ISD::AVG/VPMADDWD by widening.
+ // Legalize types for X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
@@ -28114,10 +28991,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: {
- bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
EVT VT = N->getValueType(0);
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
@@ -28128,13 +29009,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
- Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
- : ISD::AssertSext,
+ Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
@@ -28149,6 +29036,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
return;
}
@@ -28160,16 +29049,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
+      // If we have VLX we can emit a target specific FP_TO_UINT node.
if (!IsSigned && !Subtarget.hasVLX()) {
- // If we have VLX we can emit a target specific FP_TO_UINT node,
- // otherwise we can defer to the generic legalizer which will widen
+ // Otherwise we can defer to the generic legalizer which will widen
// the input as well. This will be further widened during op
// legalization to v8i32<-v8f64.
- return;
+ // For strict nodes we'll need to widen ourselves.
+ // FIXME: Fix the type legalizer to safely widen strict nodes?
+ if (!IsStrict)
+ return;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f64));
+ Opc = N->getOpcode();
+ }
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
}
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
- SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Custom widen strict v2f32->v2i32 by padding with zeros.
+ // FIXME: Should generic type legalizer do this?
+ if (Src.getValueType() == MVT::v2f32 && IsStrict) {
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
return;
}
@@ -28183,64 +29105,168 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
- unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
- // Using a 256-bit input here to guarantee 128-bit input for f32 case.
- // TODO: Use 128-bit vectors for f64 case?
- // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
+ unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
+ // If we use a 128-bit result we might need to use a target specific node.
+ unsigned SrcElts =
+ std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
- MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
+ MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
+ unsigned Opc = N->getOpcode();
+ if (NumElts != SrcElts) {
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ }
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
ZeroIdx);
- Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
+ SDValue Chain;
+ if (IsStrict) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
return;
}
- if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(Chain);
+ }
return;
}
- case ISD::SINT_TO_FP: {
- assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
- SDValue Src = N->getOperand(0);
- if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
- return;
- Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
- return;
- }
- case ISD::UINT_TO_FP: {
- assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::STRICT_SINT_TO_FP;
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
- Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
+ : X86ISD::STRICT_CVTUI2P;
+ SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
+ Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
+ }
return;
}
+ if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
+ Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
+ SDValue Zero = DAG.getConstant(0, dl, SrcVT);
+ SDValue One = DAG.getConstant(1, dl, SrcVT);
+ SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
+ DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
+ DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
+ SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
+ for (int i = 0; i != 2; ++i) {
+ SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ SignSrc, DAG.getIntPtrConstant(i, dl));
+ if (IsStrict)
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
+ {N->getOperand(0), Src});
+ else
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
+      }
+ SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SignCvts[0].getValue(1), SignCvts[1].getValue(1));
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
+ }
+ IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
+ IsNeg =
+ DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
+ SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
+ Results.push_back(Cvt);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
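This is the classic unsigned-to-float halving trick (as in compiler-rt's __floatundisf): when the sign bit is set, shift right by one while ORing the dropped bit back in so rounding stays correct, convert signed, then double with an FADD. A scalar check:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t X = 0xFFFFFFFFFFFFFFFFULL;   // sign bit set: signed convert is wrong
  uint64_t Halved = (X >> 1) | (X & 1); // keep the low bit sticky for rounding
  float Slow = (float)(int64_t)Halved;
  Slow += Slow;                         // the FADD doubles it back
  printf("%a %a\n", Slow, (float)X);    // identical results
  return 0;
}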
+
if (SrcVT != MVT::v2i32)
return;
+
+ if (IsSigned || Subtarget.hasAVX512()) {
+ if (!IsStrict)
+ return;
+
+ // Custom widen strict v2i32->v2f32 to avoid scalarization.
+ // FIXME: Should generic type legalizer do this?
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getConstant(0, dl, MVT::v2i32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
- // TODO: Are there any fast-math-flags to propagate here?
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
- Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ if (IsStrict) {
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
+ {N->getOperand(0), Or, VBias});
+ SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
+ {MVT::v4f32, MVT::Other},
+ {Sub.getValue(1), Sub});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ }
return;
}
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
- if (!isTypeLegal(N->getOperand(0).getValueType()))
- return;
- SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ if (!isTypeLegal(Src.getValueType()))
+ return;
+ SDValue V;
+ if (IsStrict)
+ V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ else
+ V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
return;
}
case ISD::FP_EXTEND: {
@@ -28543,6 +29569,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res.getValue(1));
return;
}
+ case ISD::ADDRSPACECAST: {
+ SDValue Src = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+ unsigned SrcAS = CastN->getSrcAddressSpace();
+
+ assert(SrcAS != CastN->getDestAddressSpace() &&
+ "addrspacecast must be between different address spaces");
+
+ SDValue Res;
+ if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
+ Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+ else if (DstVT == MVT::i64)
+ Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+ else if (DstVT == MVT::i32)
+ Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
+ else
+ report_fatal_error("Unrecognized addrspacecast type legalization");
+
+ Results.push_back(Res);
+ return;
+ }
}
}
@@ -28566,9 +29614,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP";
+ case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
+ case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM";
case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
@@ -28653,10 +29704,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT";
case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
+ case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
@@ -28676,6 +29729,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VROTRI: return "X86ISD::VROTRI";
case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
@@ -28776,6 +29830,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE";
case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
@@ -28837,6 +29892,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
+ case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI";
+ case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
@@ -28847,6 +29904,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
+ case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P";
+ case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P";
case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
@@ -29099,8 +30158,8 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return true;
}
-bool
-X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
@@ -31518,28 +32577,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
- Known.setAllZero();
- break;
- }
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= VT.getScalarSizeInBits()) {
+ Known.setAllZero();
+ break;
+ }
- Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- unsigned ShAmt = ShiftImm->getZExtValue();
- if (Opc == X86ISD::VSHLI) {
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
- // Low bits are known zero.
- Known.Zero.setLowBits(ShAmt);
- } else if (Opc == X86ISD::VSRLI) {
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
- // High bits are known zero.
- Known.Zero.setHighBits(ShAmt);
- } else {
- Known.Zero.ashrInPlace(ShAmt);
- Known.One.ashrInPlace(ShAmt);
- }
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Opc == X86ISD::VSHLI) {
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+ // Low bits are known zero.
+ Known.Zero.setLowBits(ShAmt);
+ } else if (Opc == X86ISD::VSRLI) {
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+ // High bits are known zero.
+ Known.Zero.setHighBits(ShAmt);
+ } else {
+ Known.Zero.ashrInPlace(ShAmt);
+ Known.One.ashrInPlace(ShAmt);
}
break;
}
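Annotation (not part of the patch): the refactored known-bits case above leans on VSHLI/VSRLI/VSRAI always carrying a constant shift immediate, so Op.getConstantOperandVal(1) is safe. As a sanity check of how the Known.Zero/Known.One pair moves through the three shift kinds, here is a minimal standalone C++ model of one 32-bit lane, using plain integers instead of llvm::KnownBits; all names are illustrative.

    #include <cassert>
    #include <cstdint>

    // Minimal model of the Known.Zero/Known.One updates above for one
    // 32-bit lane. Zero has a 1 for every bit known to be 0; One has a 1
    // for every bit known to be 1.
    struct Known32 { uint32_t Zero = 0, One = 0; };

    static Known32 shiftKnown(Known32 K, unsigned ShAmt, char Kind) {
      if (Kind == 'L') {            // VSHLI: shift left, low bits become 0.
        K.Zero <<= ShAmt;
        K.One <<= ShAmt;
        K.Zero |= (1u << ShAmt) - 1;
      } else if (Kind == 'R') {     // VSRLI: logical right, high bits become 0.
        K.Zero >>= ShAmt;
        K.One >>= ShAmt;
        K.Zero |= ~(~0u >> ShAmt);
      } else {                      // VSRAI: arithmetic right replicates bit 31
                                    // into both masks (C++20 semantics).
        K.Zero = (uint32_t)((int32_t)K.Zero >> ShAmt);
        K.One = (uint32_t)((int32_t)K.One >> ShAmt);
      }
      return K;
    }

    int main() {
      Known32 K;                    // value known to be 0x000000FF exactly
      K.One = 0x000000FFu;
      K.Zero = ~K.One;
      Known32 L = shiftKnown(K, 4, 'L');
      assert(L.One == 0x00000FF0u && L.Zero == ~L.One); // 0xFF << 4, low 4 zero
      Known32 R = shiftKnown(K, 4, 'R');
      assert(R.One == 0x0000000Fu && R.Zero == ~R.One); // high 4 forced zero
      return 0;
    }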
@@ -32103,8 +33160,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
- if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
- Subtarget)) {
+ if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
+ Subtarget)) {
DstVT = MaskVT;
return true;
}
@@ -32116,8 +33173,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
- if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
- DAG, Subtarget)) {
+ if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
+ Subtarget)) {
SrcVT = DstVT = MaskVT;
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
@@ -32155,8 +33212,8 @@ static bool matchBinaryPermuteShuffle(
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
- if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
- ForceV2Zero, BlendMask)) {
+ if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
+ ForceV2Zero, BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
@@ -32410,10 +33467,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
- APInt Zeroable(NumMaskElts, 0);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (isUndefOrZero(Mask[i]))
- Zeroable.setBit(i);
+ APInt KnownUndef, KnownZero;
+ resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
+ APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
@@ -32834,7 +33890,8 @@ static SDValue combineX86ShuffleChainWithExtract(
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
- WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
+ WideSizeInBits = std::max(WideSizeInBits,
+ (unsigned)Src.getValueSizeInBits());
assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
"Unexpected subvector extraction");
Offset /= BaseVT.getVectorNumElements();
@@ -33026,6 +34083,10 @@ static SDValue combineX86ShufflesRecursively(
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ assert(RootMask.size() > 0 &&
+ (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
+ "Illegal shuffle root mask");
+
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
@@ -33056,106 +34117,137 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
- resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
-
- // Add the inputs to the Ops list, avoiding duplicates.
- SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
-
- auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
- // Attempt to find an existing match.
- SDValue InputBC = peekThroughBitcasts(Input);
- for (int i = 0, e = Ops.size(); i < e; ++i)
- if (InputBC == peekThroughBitcasts(Ops[i]))
- return i;
- // Match failed - should we replace an existing Op?
- if (InsertionPoint >= 0) {
- Ops[InsertionPoint] = Input;
- return InsertionPoint;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 16> Ops;
+
+ // We don't need to merge masks if the root is empty.
+ bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
+ if (EmptyRoot) {
+ // Only resolve zeros if it will remove an input, otherwise we might end
+ // up in an infinite loop.
+ bool ResolveKnownZeros = true;
+ if (!OpZero.isNullValue()) {
+ APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+ for (int i = 0, e = OpMask.size(); i != e; ++i) {
+ int M = OpMask[i];
+ if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
+ continue;
+ UsedInputs.setBit(M / OpMask.size());
+ if (UsedInputs.isAllOnesValue()) {
+ ResolveKnownZeros = false;
+ break;
+ }
+ }
}
- // Add to the end of the Ops list.
- Ops.push_back(Input);
- return Ops.size() - 1;
- };
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
+ ResolveKnownZeros);
- SmallVector<int, 2> OpInputIdx;
- for (SDValue OpInput : OpInputs)
- OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
-
- assert(((RootMask.size() > OpMask.size() &&
- RootMask.size() % OpMask.size() == 0) ||
- (OpMask.size() > RootMask.size() &&
- OpMask.size() % RootMask.size() == 0) ||
- OpMask.size() == RootMask.size()) &&
- "The smaller number of elements must divide the larger.");
-
- // This function can be performance-critical, so we rely on the power-of-2
- // knowledge that we have about the mask sizes to replace div/rem ops with
- // bit-masks and shifts.
- assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
- unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
- unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
-
- unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
- unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
- unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
- assert((RootRatio == 1 || OpRatio == 1) &&
- "Must not have a ratio for both incoming and op masks!");
-
- assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
- unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
- unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
-
- SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
-
- // Merge this shuffle operation's mask into our accumulated mask. Note that
- // this shuffle's mask will be the first applied to the input, followed by the
- // root mask to get us all the way to the root value arrangement. The reason
- // for this order is that we are recursing up the operation chain.
- for (unsigned i = 0; i < MaskWidth; ++i) {
- unsigned RootIdx = i >> RootRatioLog2;
- if (RootMask[RootIdx] < 0) {
- // This is a zero or undef lane, we're done.
- Mask[i] = RootMask[RootIdx];
- continue;
- }
+ Mask = OpMask;
+ Ops.append(OpInputs.begin(), OpInputs.end());
+ } else {
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
+ // Add the inputs to the Ops list, avoiding duplicates.
+ Ops.append(SrcOps.begin(), SrcOps.end());
+
+ auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
+ // Attempt to find an existing match.
+ SDValue InputBC = peekThroughBitcasts(Input);
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (InputBC == peekThroughBitcasts(Ops[i]))
+ return i;
+ // Match failed - should we replace an existing Op?
+ if (InsertionPoint >= 0) {
+ Ops[InsertionPoint] = Input;
+ return InsertionPoint;
+ }
+ // Add to the end of the Ops list.
+ Ops.push_back(Input);
+ return Ops.size() - 1;
+ };
- unsigned RootMaskedIdx =
- RootRatio == 1
- ? RootMask[RootIdx]
- : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
+ SmallVector<int, 2> OpInputIdx;
+ for (SDValue OpInput : OpInputs)
+ OpInputIdx.push_back(
+ AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
+
+ assert(((RootMask.size() > OpMask.size() &&
+ RootMask.size() % OpMask.size() == 0) ||
+ (OpMask.size() > RootMask.size() &&
+ OpMask.size() % RootMask.size() == 0) ||
+ OpMask.size() == RootMask.size()) &&
+ "The smaller number of elements must divide the larger.");
+
+ // This function can be performance-critical, so we rely on the power-of-2
+ // knowledge that we have about the mask sizes to replace div/rem ops with
+ // bit-masks and shifts.
+ assert(isPowerOf2_32(RootMask.size()) &&
+ "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
+
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
+ unsigned RootRatio =
+ std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
+ assert((RootRatio == 1 || OpRatio == 1) &&
+ "Must not have a ratio for both incoming and op masks!");
+
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
+
+ Mask.resize(MaskWidth, SM_SentinelUndef);
+
+ // Merge this shuffle operation's mask into our accumulated mask. Note that
+ // this shuffle's mask will be the first applied to the input, followed by
+ // the root mask to get us all the way to the root value arrangement. The
+ // reason for this order is that we are recursing up the operation chain.
+ for (unsigned i = 0; i < MaskWidth; ++i) {
+ unsigned RootIdx = i >> RootRatioLog2;
+ if (RootMask[RootIdx] < 0) {
+ // This is a zero or undef lane, we're done.
+ Mask[i] = RootMask[RootIdx];
+ continue;
+ }
- // Just insert the scaled root mask value if it references an input other
- // than the SrcOp we're currently inserting.
- if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
- (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
- Mask[i] = RootMaskedIdx;
- continue;
- }
+ unsigned RootMaskedIdx =
+ RootRatio == 1
+ ? RootMask[RootIdx]
+ : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
- RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
- unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
- if (OpMask[OpIdx] < 0) {
- // The incoming lanes are zero or undef, it doesn't matter which ones we
- // are using.
- Mask[i] = OpMask[OpIdx];
- continue;
- }
+ // Just insert the scaled root mask value if it references an input other
+ // than the SrcOp we're currently inserting.
+ if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+ (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+ Mask[i] = RootMaskedIdx;
+ continue;
+ }
- // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
- unsigned OpMaskedIdx =
- OpRatio == 1
- ? OpMask[OpIdx]
- : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
+ RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
+ if (OpMask[OpIdx] < 0) {
+ // The incoming lanes are zero or undef, it doesn't matter which ones we
+ // are using.
+ Mask[i] = OpMask[OpIdx];
+ continue;
+ }
- OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
- int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
- assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
- OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
+ // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+ unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
+ : (OpMask[OpIdx] << OpRatioLog2) +
+ (RootMaskedIdx & (OpRatio - 1));
- Mask[i] = OpMaskedIdx;
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
+ int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+ assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+ OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
+
+ Mask[i] = OpMaskedIdx;
+ }
}
// Remove unused/repeated shuffle source ops.
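Annotation (not part of the patch): the div/rem-free index math above is easier to follow with concrete sizes. Below is a minimal standalone sketch, assuming a 4-element root mask layered over an 8-element op mask (so RootRatio = 2, OpRatio = 1, MaskWidth = 8); it models only the index arithmetic of the merge loop, not the Ops bookkeeping or the zero/undef sentinels.

    #include <cassert>
    #include <vector>

    int main() {
      // RootMask has 4 elements, OpMask has 8: each root lane covers 2 op lanes.
      std::vector<int> RootMask = {0, 1, 2, 3};             // identity, 4 wide lanes
      std::vector<int> OpMask   = {7, 6, 5, 4, 3, 2, 1, 0}; // reverse of 8 lanes
      unsigned MaskWidth = 8;          // max(RootMask.size(), OpMask.size())
      unsigned RootRatio = 2;          // OpMask.size() / RootMask.size()
      unsigned RootRatioLog2 = 1, OpRatioLog2 = 0;

      std::vector<int> Mask(MaskWidth);
      for (unsigned i = 0; i < MaskWidth; ++i) {
        unsigned RootIdx = i >> RootRatioLog2;               // i / RootRatio
        unsigned RootMaskedIdx = (RootMask[RootIdx] << RootRatioLog2) +
                                 (i & (RootRatio - 1));      // scale + sublane
        unsigned OpIdx = (RootMaskedIdx & (MaskWidth - 1)) >> OpRatioLog2;
        Mask[i] = OpMask[OpIdx];                             // OpRatio == 1 here
      }
      // An identity root over a reversed op yields the reversed mask unchanged.
      for (unsigned i = 0; i < MaskWidth; ++i)
        assert(Mask[i] == (int)(7 - i));
      return 0;
    }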
@@ -33189,13 +34281,18 @@ static SDValue combineX86ShufflesRecursively(
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
+ // For empty roots, we need to resolve zeroable elements before combining
+ // them with other shuffles.
+ SmallVector<int, 64> ResolvedMask = Mask;
+ if (EmptyRoot)
+ resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
- AllowVar, DAG, Subtarget))
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+ HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
}
@@ -34207,6 +35304,15 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
In.getOperand(0).getValueType() == MVT::v2i64)
return N->getOperand(0); // return the bitcast
break;
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::STRICT_CVTSI2P:
+ case X86ISD::STRICT_CVTUI2P:
+ case X86ISD::STRICT_VFPROUND:
+ if (In.getOperand(1).getValueType() == MVT::v2f64 ||
+ In.getOperand(1).getValueType() == MVT::v2i64)
+ return N->getOperand(0);
+ break;
}
}
@@ -34698,6 +35804,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
}
+ // If we don't demand all elements, then attempt to combine to a simpler
+ // shuffle.
+ // TODO: Handle other depths, but first we need to handle the fact that
+ // it might combine to the same shuffle.
+ if (!DemandedElts.isAllOnesValue() && Depth == 0) {
+ SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i)
+ if (DemandedElts[i])
+ DemandedMask[i] = i;
+
+ SDValue NewShuffle = combineX86ShufflesRecursively(
+ {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, TLO.DAG, Subtarget);
+ if (NewShuffle)
+ return TLO.CombineTo(Op, NewShuffle);
+ }
+
return false;
}
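Annotation (not part of the patch): the new demanded-elements fall-through above builds an identity shuffle mask with undef in every lane the caller does not need, then hands it to the recursive combiner. A minimal standalone model of that mask construction, with SM_SentinelUndef standing for -1 as in the sentinel convention used throughout:

    #include <cassert>
    #include <vector>

    int main() {
      const int SM_SentinelUndef = -1;
      unsigned NumElts = 4;
      bool DemandedElts[4] = {true, false, true, false};

      std::vector<int> DemandedMask(NumElts, SM_SentinelUndef);
      for (unsigned i = 0; i != NumElts; ++i)
        if (DemandedElts[i])
          DemandedMask[i] = i; // identity on demanded lanes, undef elsewhere

      assert((DemandedMask == std::vector<int>{0, -1, 2, -1}));
      return 0;
    }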
@@ -34739,117 +35862,110 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
-
- // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
- // single shift. We can do this if the bottom bits (which are shifted
- // out) are never demanded.
- if (Op0.getOpcode() == X86ISD::VSRLI &&
- OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
- if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
- if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
- int Diff = ShAmt - Shift2Imm->getZExtValue();
- if (Diff == 0)
- return TLO.CombineTo(Op, Op0.getOperand(0));
-
- unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
- SDValue NewShift = TLO.DAG.getNode(
- NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
- TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
- return TLO.CombineTo(Op, NewShift);
- }
- }
+ APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
+ if (Op0.getOpcode() == X86ISD::VSRLI &&
+ OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
+ unsigned Shift2Amt = Op0.getConstantOperandVal(1);
+ if (Shift2Amt < BitWidth) {
+ int Diff = ShAmt - Shift2Amt;
+ if (Diff == 0)
+ return TLO.CombineTo(Op, Op0.getOperand(0));
+
+ unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
+ SDValue NewShift = TLO.DAG.getNode(
+ NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
+ TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
+ return TLO.CombineTo(Op, NewShift);
}
+ }
- if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
- TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
- // Low bits known zero.
- Known.Zero.setLowBits(ShAmt);
- }
+ // Low bits known zero.
+ Known.Zero.setLowBits(ShAmt);
break;
}
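Annotation (not part of the patch): the single-shift simplification above computes Diff = ShAmt - Shift2Amt and picks VSHLI or VSRLI by its sign. A standalone check of the three cases on a plain 32-bit lane; the stated precondition (no demanded bit below ShAmt) is modeled by masking both sides.

    #include <cassert>
    #include <cstdint>

    // Models ((X >>u C1) << C2) when no bit below C2 is demanded.
    static uint32_t foldShrShl(uint32_t X, unsigned C1, unsigned C2) {
      int Diff = (int)C2 - (int)C1;
      if (Diff == 0) return X;           // shifts cancel
      if (Diff > 0) return X << Diff;    // net left shift
      return X >> -Diff;                 // net logical right shift
    }

    int main() {
      uint32_t X = 0xDEADBEEFu;
      const uint32_t DemandHi = ~0u << 8;  // only bits >= 8 demanded
      // Diff == 0: (X >> 8) << 8 == X on the demanded bits.
      assert((foldShrShl(X, 8, 8) & DemandHi) == (((X >> 8) << 8) & DemandHi));
      // Diff > 0: (X >> 3) << 8 == X << 5 on the demanded bits.
      assert((foldShrShl(X, 3, 8) & DemandHi) == (((X >> 3) << 8) & DemandHi));
      // Diff < 0: (X >> 12) << 8 == X >> 4 on the demanded bits.
      assert((foldShrShl(X, 12, 8) & DemandHi) == (((X >> 12) << 8) & DemandHi));
      return 0;
    }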
case X86ISD::VSRLI: {
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits << ShAmt;
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
- if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
- OriginalDemandedElts, Known, TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
+ OriginalDemandedElts, Known, TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
- // High bits known zero.
- Known.Zero.setHighBits(ShAmt);
- }
+ // High bits known zero.
+ Known.Zero.setHighBits(ShAmt);
break;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits << ShAmt;
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
- // If we just want the sign bit then we don't need to shift it.
- if (OriginalDemandedBits.isSignMask())
- return TLO.CombineTo(Op, Op0);
+ // If we just want the sign bit then we don't need to shift it.
+ if (OriginalDemandedBits.isSignMask())
+ return TLO.CombineTo(Op, Op0);
- // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
- if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
- SDValue Op00 = Op0.getOperand(0);
- unsigned NumSignBits =
- TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
- if (ShAmt < NumSignBits)
- return TLO.CombineTo(Op, Op00);
- }
+ // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
+ if (Op0.getOpcode() == X86ISD::VSHLI &&
+ Op.getOperand(1) == Op0.getOperand(1)) {
+ SDValue Op00 = Op0.getOperand(0);
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
+ if (ShAmt < NumSignBits)
+ return TLO.CombineTo(Op, Op00);
+ }
- // If any of the demanded bits are produced by the sign extension, we also
- // demand the input sign bit.
- if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
- DemandedMask.setSignBit();
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
+ DemandedMask.setSignBit();
- if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
- TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
- // If the input sign bit is known to be zero, or if none of the top bits
- // are demanded, turn this into an unsigned shift right.
- if (Known.Zero[BitWidth - ShAmt - 1] ||
- OriginalDemandedBits.countLeadingZeros() >= ShAmt)
- return TLO.CombineTo(
- Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (Known.Zero[BitWidth - ShAmt - 1] ||
+ OriginalDemandedBits.countLeadingZeros() >= ShAmt)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
- // High bits are known one.
- if (Known.One[BitWidth - ShAmt - 1])
- Known.One.setHighBits(ShAmt);
- }
+ // High bits are known one.
+ if (Known.One[BitWidth - ShAmt - 1])
+ Known.One.setHighBits(ShAmt);
break;
}
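Annotation (not part of the patch): the closing VSRAI-to-VSRLI rewrite is sound because arithmetic and logical right shifts differ only in the bits copied down from the sign position. A quick standalone check of both justifications used above (sign bit known zero, or no demanded bit high enough to see the copies); note that >> on a negative signed value is implementation-defined before C++20 but arithmetic on all mainstream compilers.

    #include <cassert>
    #include <cstdint>

    int main() {
      // If the sign bit of X is known zero, ashr(X, s) == lshr(X, s).
      uint32_t X = 0x7F123456u;                  // bit 31 clear
      unsigned s = 7;
      uint32_t Ashr = (uint32_t)((int32_t)X >> s); // arithmetic shift
      uint32_t Lshr = X >> s;                      // logical shift
      assert(Ashr == Lshr);

      // If only bits below (BitWidth - s) are demanded, the copies of the
      // sign bit are never observed either, so the rewrite is still safe.
      uint32_t Neg = 0x80001234u;                // bit 31 set
      uint32_t DemandLo = (1u << (32 - s)) - 1;
      assert((((uint32_t)((int32_t)Neg >> s)) & DemandLo) ==
             ((Neg >> s) & DemandLo));
      return 0;
    }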
case X86ISD::PEXTRB:
@@ -35005,6 +36121,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
return Vec;
break;
}
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // If we only need the sign bit, we can use R directly.
+ if (DemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return Op.getOperand(1);
+ break;
}
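Annotation (not part of the patch): the new PCMPGT case uses the identity from its comment: pcmpgt(0, R) yields all-ones exactly when R is negative, so its sign bit always equals R's sign bit, and R itself suffices when only the sign bit is demanded. A one-lane standalone check, with pcmpgt modeled as a signed compare producing 0 or -1:

    #include <cassert>
    #include <cstdint>

    // One lane of PCMPGT: all-ones (i.e. -1) if a >s b, else 0.
    static int32_t pcmpgtLane(int32_t a, int32_t b) { return a > b ? -1 : 0; }

    int main() {
      for (int32_t R : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
        uint32_t CmpSign = (uint32_t)pcmpgtLane(0, R) & 0x80000000u;
        uint32_t RSign = (uint32_t)R & 0x80000000u;
        assert(CmpSign == RSign); // sign bit of pcmpgt(0, R) == sign bit of R
      }
      return 0;
    }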
APInt ShuffleUndef, ShuffleZero;
@@ -35053,123 +36176,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}
-/// Check if a vector extract from a target-specific shuffle of a load can be
-/// folded into a single element load.
-/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been custom lowered so we need to handle those here.
-static SDValue
-XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- SDValue InVec = N->getOperand(0);
- SDValue EltNo = N->getOperand(1);
- EVT EltVT = N->getValueType(0);
-
- if (!isa<ConstantSDNode>(EltNo))
- return SDValue();
-
- EVT OriginalVT = InVec.getValueType();
- unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
-
- // Peek through bitcasts, don't duplicate a load with other uses.
- InVec = peekThroughOneUseBitcasts(InVec);
-
- EVT CurrentVT = InVec.getValueType();
- if (!CurrentVT.isVector())
- return SDValue();
-
- unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
- if ((NumOriginalElts % NumCurrentElts) != 0)
- return SDValue();
-
- if (!isTargetShuffle(InVec.getOpcode()))
- return SDValue();
-
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
-
- SmallVector<int, 16> ShuffleMask;
- SmallVector<SDValue, 2> ShuffleOps;
- bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
- ShuffleOps, ShuffleMask, UnaryShuffle))
- return SDValue();
-
- unsigned Scale = NumOriginalElts / NumCurrentElts;
- if (Scale > 1) {
- SmallVector<int, 16> ScaledMask;
- scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
- ShuffleMask = std::move(ScaledMask);
- }
- assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
-
- // Select the input vector, guarding against out of range extract vector.
- int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
-
- if (Idx == SM_SentinelZero)
- return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
- : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
- if (Idx == SM_SentinelUndef)
- return DAG.getUNDEF(EltVT);
-
- // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
- // won't handle it.
- if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
- return SDValue();
-
- assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
- "Shuffle index out of range");
- SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
-
- // If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses =
- (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
-
- if (LdNode.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
- return SDValue();
-
- AllowedUses = 1; // only allow 1 load use if we have a bitcast
- LdNode = LdNode.getOperand(0);
- }
-
- if (!ISD::isNormalLoad(LdNode.getNode()))
- return SDValue();
-
- LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
-
- if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
- return SDValue();
-
- // If there's a bitcast before the shuffle, check if the load type and
- // alignment is valid.
- unsigned Align = LN0->getAlignment();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
- EltVT.getTypeForEVT(*DAG.getContext()));
-
- if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
- return SDValue();
-
- // All checks match so transform back to vector_shuffle so that DAG combiner
- // can finish the job
- SDLoc dl(N);
-
- // Create shuffle node taking into account the case that its a unary shuffle
- SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
- : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
- Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
- DAG.getBitcast(OriginalVT, ShuffleOps[0]),
- Shuffle, ShuffleMask);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
- EltNo);
-}
-
// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
@@ -35714,7 +36720,7 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
- unsigned RegSize = std::max(128u, InVT.getSizeInBits());
+ unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
// fill in the missing vector elements with 0.
@@ -36263,6 +37269,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ // We need at least SSE2 to do anything here.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
ISD::NodeType Opc;
SDValue Rdx =
DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
@@ -36382,8 +37392,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
- if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+ if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Integer Constant Folding.
@@ -36419,14 +37430,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
// TODO - Remove this once we can handle the implicit zero-extension of
- // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
- // combineHorizontalPredicateResult and combineBasicSADPattern.
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
+ // combineBasicSADPattern.
return SDValue();
}
- if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
- return NewOp;
-
// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -36482,7 +37490,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
};
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
@@ -36568,9 +37575,8 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
- ISD::CondCode NewCC =
- ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
- Cond.getOperand(0).getValueType().isInteger());
+ ISD::CondCode NewCC = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
NewCC);
std::swap(LHS, RHS);
@@ -36761,37 +37767,117 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
if (VT.is512BitVector())
return SDValue();
- // TODO: Add other opcodes eventually lowered into BLEND.
- for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
- UI != UE; ++UI)
- if ((UI->getOpcode() != ISD::VSELECT &&
- UI->getOpcode() != X86ISD::BLENDV) ||
- UI.getOperandNo() != 0)
+ auto OnlyUsedAsSelectCond = [](SDValue Cond) {
+ for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+ UI != UE; ++UI)
+ if ((UI->getOpcode() != ISD::VSELECT &&
+ UI->getOpcode() != X86ISD::BLENDV) ||
+ UI.getOperandNo() != 0)
+ return false;
+
+ return true;
+ };
+
+ if (OnlyUsedAsSelectCond(Cond)) {
+ APInt DemandedMask(APInt::getSignMask(BitWidth));
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
return SDValue();
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Update all the nodes so that we do not use
+ // the generic VSELECT anymore. Otherwise, we may perform wrong
+ // optimizations as we messed with the actual expectation for the vector
+ // boolean values.
+ for (SDNode *U : Cond->uses()) {
+ if (U->getOpcode() == X86ISD::BLENDV)
+ continue;
+
+ SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
+ Cond, U->getOperand(1), U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ DCI.AddToWorklist(U);
+ }
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ // Otherwise we can still at least try to simplify multiple use bits.
APInt DemandedMask(APInt::getSignMask(BitWidth));
+ APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
- if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
+ DemandedElts, DAG, 0))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ V, N->getOperand(1), N->getOperand(2));
+
+ return SDValue();
+}
+
+// Try to match:
+// (or (and (M, (sub 0, X)), (pandn M, X)))
+// which is a special case of:
+// (select M, (sub 0, X), X)
+// Per:
+// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+// We know that, if fNegate is 0 or 1:
+// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+//
+// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+// ( M ? -X : X) == ((X ^ M ) + (M & 1))
+// This lets us transform our vselect to:
+// (add (xor X, M), (and M, 1))
+// And further to:
+// (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoConditionalNegate(
+ EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ EVT MaskVT = Mask.getValueType();
+ assert(MaskVT.isInteger() &&
+ DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
+ "Mask must be zero/all-bits");
+
+ if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+ return SDValue();
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
return SDValue();
- // If we changed the computation somewhere in the DAG, this change will
- // affect all users of Cond. Update all the nodes so that we do not use
- // the generic VSELECT anymore. Otherwise, we may perform wrong
- // optimizations as we messed with the actual expectation for the vector
- // boolean values.
- for (SDNode *U : Cond->uses()) {
- if (U->getOpcode() == X86ISD::BLENDV)
- continue;
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
- SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
- Cond, U->getOperand(1), U->getOperand(2));
- DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
- DCI.AddToWorklist(U);
- }
- DCI.CommitTargetLoweringOpt(TLO);
- return SDValue(N, 0);
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+ else
+ return SDValue();
+
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+ // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+ // (vselect M, X, (sub (0, X))), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
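Annotation (not part of the patch): the conditional-negate identity the relocated function depends on, select(M, -X, X) == (X ^ M) - M for a lane mask M that is all-ones or all-zeros, is easy to verify over a single lane, including the operand swap for the select(M, X, -X) orientation that the PR 27251 comment explains:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t M : {0x00000000u, 0xFFFFFFFFu}) {   // lane mask: 0 or all-ones
        for (uint32_t X : {0u, 1u, 42u, 0x80000000u, 0xDEADBEEFu}) {
          uint32_t NegX = 0u - X;
          // select(M, -X, X): negate when the mask is set.
          uint32_t Sel = M ? NegX : X;
          assert(Sel == (X ^ M) - M);        // (xor X, M) - M
          // The swapped orientation select(M, X, -X) is the negation of the
          // above, so the SUB operands swap: M - (X ^ M).
          uint32_t SelSwap = M ? X : NegX;
          assert(SelSwap == M - (X ^ M));
        }
      }
      return 0;
    }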
/// Do target-specific dag combines on SELECT and VSELECT nodes.
@@ -36811,10 +37897,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
+
+ // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
+ // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
+ // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
+ if (CondVT.isVector() && CondVT.isInteger() &&
+ CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
+ (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
+ DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
+ if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
+ DL, DAG, Subtarget))
+ return V;
// Convert vselects with constant condition into shuffles.
- if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
- DCI.isBeforeLegalizeOps()) {
+ if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
@@ -36843,7 +37940,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
@@ -36854,7 +37951,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
@@ -36873,7 +37970,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
@@ -36883,7 +37980,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
@@ -36911,7 +38008,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
@@ -36922,8 +38019,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
- (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMIN;
break;
@@ -36948,7 +38044,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
@@ -37093,7 +38189,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
@@ -37165,7 +38261,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
}
@@ -37788,7 +38884,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
}
/// Different mul shrinking modes.
-enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
@@ -37809,16 +38905,16 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
- Mode = MULS8;
+ Mode = ShrinkMode::MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
- Mode = MULU8;
+ Mode = ShrinkMode::MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
- Mode = MULS16;
+ Mode = ShrinkMode::MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
- Mode = MULU16;
+ Mode = ShrinkMode::MULU16;
else
return false;
return true;
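Annotation (not part of the patch): the thresholds above come straight from sign-bit counting inside an i32 lane: a value in [-128, 127] has at least 32 - 8 + 1 = 25 sign bits, one in [-32768, 32767] at least 17, and the unsigned ranges additionally need a known-zero top bit, giving 24 and 16. A standalone check of the boundary values with a ComputeNumSignBits-style counter:

    #include <cassert>
    #include <cstdint>

    // Number of leading bits equal to the sign bit, including the sign bit
    // itself (same convention as ComputeNumSignBits).
    static unsigned numSignBits(int32_t v) {
      unsigned n = 1;
      for (int bit = 30; bit >= 0 && ((v >> bit) & 1) == ((v >> 31) & 1); --bit)
        ++n;
      return n;
    }

    int main() {
      assert(numSignBits(127) == 25);   // 0x0000007F: 24 zeros + sign bit
      assert(numSignBits(-128) == 25);  // 0xFFFFFF80: 24 ones + sign bit
      assert(numSignBits(128) == 24);   // just outside MULS8's range
      assert(numSignBits(32767) == 17); // MULS16 boundary
      return 0;
    }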
@@ -37888,15 +38984,17 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
- if (Mode == MULU8 || Mode == MULS8)
- return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
+ return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND,
DL, VT, MulLo);
MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
- SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- ReducedVT, NewN0, NewN1);
+ SDValue MulHi =
+ DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
@@ -38294,7 +39392,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
- if (N1SplatC->getAPIntValue() == 1)
+ if (N1SplatC->isOne())
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
@@ -38546,15 +39644,15 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
- assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
+ assert(N->getOperand(1).getValueType() == MVT::i8 &&
+ "Unexpected shift amount type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
- unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
+ unsigned ShiftVal = N->getConstantOperandVal(1);
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
@@ -39094,6 +40192,71 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
}
+
+// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
+// where C is a mask containing the same number of bits as the setcc and
+// where the setcc will freely zero the upper bits of the k-register. We can
+// replace the undef in the concat with 0s and remove the AND. This mainly
+// helps with v2i1/v4i1 setcc being cast to scalar.
+static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
+
+ EVT VT = N->getValueType(0);
+
+ // Make sure this is an AND with constant. We will check the value of the
+ // constant later.
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return SDValue();
+
+ // This is implied by the ConstantSDNode.
+ assert(!VT.isVector() && "Expected scalar VT!");
+
+ if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
+ !N->getOperand(0).hasOneUse() ||
+ !N->getOperand(0).getOperand(0).hasOneUse())
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Src = N->getOperand(0).getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
+ !TLI.isTypeLegal(SrcVT))
+ return SDValue();
+
+ if (Src.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+
+ // We only care about the first subvector of the concat; we expect the
+ // other subvectors to be ignored due to the AND if we make the change.
+ SDValue SubVec = Src.getOperand(0);
+ EVT SubVecVT = SubVec.getValueType();
+
+ // First subvector should be a setcc with a legal result type. The RHS of the
+ // AND should be a mask with this many bits.
+ if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
+ !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
+ return SDValue();
+
+ EVT SetccVT = SubVec.getOperand(0).getValueType();
+ if (!TLI.isTypeLegal(SetccVT) ||
+ !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
+ return SDValue();
+
+ if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
+ return SDValue();
+
+ // We passed all the checks. Rebuild the concat_vectors with zeroes
+ // and cast it back to VT.
+ SDLoc dl(N);
+ SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
+ DAG.getConstant(0, dl, SubVecVT));
+ Ops[0] = SubVec;
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
+ Ops);
+ return DAG.getBitcast(VT, Concat);
+}
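Annotation (not part of the patch): the shape matched above is easiest to see on the v2i1 case the comment calls out: a two-lane setcc concatenated with undef subvectors up to v8i1, bitcast to i8, then masked with 0b11. Replacing the undef tail with zero subvectors makes the AND a no-op, which is exactly what the rebuilt concat achieves. A standalone bit-level model, with lanes packed little-endian into a byte as the bitcast produces:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Two setcc lanes (v2i1), concatenated with three undef v2i1 subvectors
      // into v8i1, then bitcast to i8 and masked with 0b11.
      uint8_t SetccLanes = 0b10;            // lane0 = 0, lane1 = 1
      uint8_t UndefTail = 0b101101;         // arbitrary garbage in lanes 2..7
      uint8_t Concat = (uint8_t)(SetccLanes | (UndefTail << 2));
      uint8_t Masked = Concat & 0b11;       // the AND from the original IR

      // After the transform: undef subvectors replaced by zeros, AND dropped.
      uint8_t ZeroConcat = SetccLanes;      // lanes 2..7 are now known zero
      assert(Masked == ZeroConcat);
      return 0;
    }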
+
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -39132,9 +40295,12 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
APInt AllBits = APInt::getAllOnesValue(NumElts);
return DAG.getSetCC(dl, MVT::i1, Mask,
@@ -39143,6 +40309,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
+ return V;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -39290,68 +40459,6 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
return true;
}
-// Try to match:
-// (or (and (M, (sub 0, X)), (pandn M, X)))
-// which is a special case of vselect:
-// (vselect M, (sub 0, X), X)
-// Per:
-// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
-// We know that, if fNegate is 0 or 1:
-// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
-//
-// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
-// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
-// ( M ? -X : X) == ((X ^ M ) + (M & 1))
-// This lets us transform our vselect to:
-// (add (xor X, M), (and M, 1))
-// And further to:
-// (sub (xor X, M), M)
-static SDValue combineLogicBlendIntoConditionalNegate(
- EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
- SelectionDAG &DAG, const X86Subtarget &Subtarget) {
- EVT MaskVT = Mask.getValueType();
- assert(MaskVT.isInteger() &&
- DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
- "Mask must be zero/all-bits");
-
- if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
- return SDValue();
- if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
- return SDValue();
-
- auto IsNegV = [](SDNode *N, SDValue V) {
- return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
- ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
- };
-
- SDValue V;
- if (IsNegV(Y.getNode(), X))
- V = X;
- else if (IsNegV(X.getNode(), Y))
- V = Y;
- else
- return SDValue();
-
- SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
- SDValue SubOp2 = Mask;
-
- // If the negate was on the false side of the select, then
- // the operands of the SUB need to be swapped. PR 27251.
- // This is because the pattern being matched above is
- // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
- // but if the pattern matched was
- // (vselect M, X, (sub (0, X))), that is really negation of the pattern
- // above, -(vselect M, (sub 0, X), X), and therefore the replacement
- // pattern also needs to be a negation of the replacement pattern above.
- // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
- // sub accomplishes the negation of the replacement pattern.
- if (V == Y)
- std::swap(SubOp1, SubOp2);
-
- SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
- return DAG.getBitcast(VT, Res);
-}
-
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -39512,66 +40619,20 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return Ret;
}
-static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If this is SSE1 only convert to FOR to avoid scalarization.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
- return DAG.getBitcast(MVT::v4i32,
- DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
- DAG.getBitcast(MVT::v4f32, N0),
- DAG.getBitcast(MVT::v4f32, N1)));
- }
-
- // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
- // TODO: Support multiple SrcOps.
- if (VT == MVT::i1) {
- SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
- SrcOps.size() == 1) {
- SDLoc dl(N);
- unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
- EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
- SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
- if (Mask) {
- APInt AllBits = APInt::getNullValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
- }
- }
- }
-
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
- return R;
-
- if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
- return FPLogic;
-
- if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
- return R;
-
- if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
- return R;
-
- // Attempt to recursively combine an OR of shuffles.
- if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
- SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
- return Res;
- }
-
- if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
@@ -39589,11 +40650,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
+ EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
SDValue ShAmt0 = N0.getOperand(1);
- if (ShAmt0.getValueType() != MVT::i8)
+ if (ShAmt0.getValueType() != ShiftVT)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
- if (ShAmt1.getValueType() != MVT::i8)
+ if (ShAmt1.getValueType() != ShiftVT)
return SDValue();
// Peek through any modulo shift masks.
@@ -39628,12 +40691,12 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
std::swap(ShMsk0, ShMsk1);
}
- auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
- SDValue Amt) {
+ auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
+ SDValue Amt) {
if (Opc == ISD::FSHR)
std::swap(Op0, Op1);
return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
+ DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
};
// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
@@ -39674,7 +40737,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandAPInt(1) == 1) {
+ Op1.getConstantOperandAPInt(1).isOneValue()) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
@@ -39689,6 +40752,70 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
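Annotation (not part of the patch): the matcher now emits ISD::FSHL/FSHR, so the pattern in its comment, (x << c) | (y >> (Bits - c)), must line up with the funnel-shift definition. A standalone 32-bit model, assuming c is already reduced modulo the width and nonzero, which is the only form the guarded pattern produces:

    #include <cassert>
    #include <cstdint>

    // FSHL(x, y, c): concatenate x:y (x in the high half) and take the top
    // 32 bits after shifting left by c.
    static uint32_t fshl32(uint32_t x, uint32_t y, unsigned c) {
      c &= 31; // FSHL's shift amount is taken modulo the bit width
      if (c == 0) return x;
      return (x << c) | (y >> (32 - c));
    }

    int main() {
      uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
      for (unsigned c = 1; c < 32; ++c)
        assert(fshl32(X, Y, c) == ((X << c) | (Y >> (32 - c))));
      assert(fshl32(X, Y, 0) == X); // degenerate case the DAG pattern never forms
      return 0;
    }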
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // If this is SSE1 only convert to FOR to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(MVT::v4i32,
+ DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N0),
+ DAG.getBitcast(MVT::v4f32, N1)));
+ }
+
+ // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
+ if (Mask) {
+ APInt AllBits = APInt::getNullValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ }
+ }
+ }
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
+ return R;
+
+ // Attempt to recursively combine an OR of shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
@@ -39758,8 +40885,8 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
- case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
- case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
+ case MVT::v4i32:
+ case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
@@ -39783,7 +40910,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
- return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
+ return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
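Annotation (not part of the patch): this fold rewrites xor(ashr(X, BitWidth-1), -1) as a greater-than compare against -1: the arithmetic shift splats the sign bit into an all-ones or all-zero lane, and inverting that is precisely "X > -1". A one-lane standalone check, again assuming arithmetic >> on signed values as all mainstream compilers provide:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t X : {INT32_MIN, -5, -1, 0, 1, 123, INT32_MAX}) {
        uint32_t Splat = (uint32_t)(X >> 31);          // ashr by BitWidth-1
        uint32_t NotSplat = Splat ^ 0xFFFFFFFFu;       // the XOR with all-ones
        uint32_t Pcmpgt = (X > -1) ? 0xFFFFFFFFu : 0u; // pcmpgt(X, -1) lane
        assert(NotSplat == Pcmpgt);
      }
      return 0;
    }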
/// Detect patterns of truncation with unsigned saturation:
@@ -39950,7 +41077,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
- unsigned TruncOpc;
+ unsigned TruncOpc = 0;
SDValue SatVal;
if (auto SSatVal = detectSSatPattern(In, VT)) {
SatVal = SSatVal;
@@ -40252,6 +41379,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
@@ -40279,6 +41407,7 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
@@ -40314,10 +41443,10 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
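+ // ML is unindexed (asserted above), so its offset and addressing mode can
+ // be forwarded to the new load unchanged.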
- SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
- ML->getMask(), DAG.getUNDEF(VT),
- ML->getMemoryVT(), ML->getMemOperand(),
- ML->getExtensionType());
+ SDValue NewML = DAG.getMaskedLoad(
+ VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
+ DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
+ ML->getAddressingMode(), ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
ML->getPassThru());
@@ -40403,8 +41532,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ Mst->getBasePtr(), Mst->getOffset(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode(), true);
}
return SDValue();
@@ -40593,59 +41723,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
cast<LoadSDNode>(St->getValue())->isSimple() &&
St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
- SmallVector<SDValue, 8> Ops;
if (!ISD::isNormalLoad(Ld))
return SDValue();
- // If this is not the MMX case, i.e. we are just turning i64 load/store
- // into f64 load/store, avoid the transformation if there are multiple
- // uses of the loaded value.
- if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ // Avoid the transformation if there are multiple uses of the loaded value.
+ if (!Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
- // If we are a 64-bit capable x86, lower to a single movq load/store pair.
- // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
- // pair instead.
- if (Subtarget.is64Bit() || F64IsLegal) {
- MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
- SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
- Ld->getMemOperand());
-
- // Make sure new load is placed in same chain order.
- DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
- return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
- St->getMemOperand());
- }
-
- // Otherwise, lower to two pairs of 32-bit loads / stores.
- SDValue LoAddr = Ld->getBasePtr();
- SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
-
- SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
- Ld->getPointerInfo(), Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
- Ld->getPointerInfo().getWithOffset(4),
- MinAlign(Ld->getAlignment(), 4),
- Ld->getMemOperand()->getFlags());
- // Make sure new loads are placed in same chain order.
- DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
- DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
-
- LoAddr = St->getBasePtr();
- HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
-
- SDValue LoSt =
- DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
- SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
- St->getPointerInfo().getWithOffset(4),
- MinAlign(St->getAlignment(), 4),
- St->getMemOperand()->getFlags());
- return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ // Lower to a single movq load/store pair.
+ SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getMemOperand());
+
+ // Make sure new load is placed in same chain order.
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+ return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
+ St->getMemOperand());
}
// This is similar to the above case, but here we handle a scalar 64-bit
@@ -41351,23 +42446,25 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
EVT VT = Op->getValueType(0);
- // Make sure the element size does't change.
+
+ // Make sure the element size doesn't change.
if (VT.getScalarSizeInBits() != ScalarSize)
return SDValue();
- if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::VECTOR_SHUFFLE: {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
- if (!SVOp->getOperand(1).isUndef())
+ if (!Op.getOperand(1).isUndef())
return SDValue();
- if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1))
+ if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
- return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
- SVOp->getMask());
- return SDValue();
+ return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
+ cast<ShuffleVectorSDNode>(Op)->getMask());
+ break;
}
- unsigned Opc = Op.getOpcode();
- if (Opc == ISD::INSERT_VECTOR_ELT) {
+ case ISD::INSERT_VECTOR_ELT: {
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
SDValue InsVector = Op.getOperand(0);
@@ -41378,34 +42475,35 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
- return SDValue();
+ break;
}
+ case ISD::FSUB:
+ case ISD::XOR:
+ case X86ISD::FXOR: {
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
- if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
- return SDValue();
-
- SDValue Op1 = Op.getOperand(1);
- SDValue Op0 = Op.getOperand(0);
-
- // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
- // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
- // masks and hence we swap the operands.
- if (Opc == ISD::FSUB)
- std::swap(Op0, Op1);
+ // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
+ // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
+ // masks and hence we swap the operands.
+ if (Opc == ISD::FSUB)
+ std::swap(Op0, Op1);
- APInt UndefElts;
- SmallVector<APInt, 16> EltBits;
- // Extract constant bits and see if they are all sign bit masks. Ignore the
- // undef elements.
- if (getTargetConstantBitsFromNode(Op1, ScalarSize,
- UndefElts, EltBits,
- /* AllowWholeUndefs */ true,
- /* AllowPartialUndefs */ false)) {
- for (unsigned I = 0, E = EltBits.size(); I < E; I++)
- if (!UndefElts[I] && !EltBits[I].isSignMask())
- return SDValue();
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ // Extract constant bits and see if they are all sign bit masks. Ignore the
+ // undef elements.
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
+ /* AllowWholeUndefs */ true,
+ /* AllowPartialUndefs */ false)) {
+ for (unsigned I = 0, E = EltBits.size(); I < E; I++)
+ if (!UndefElts[I] && !EltBits[I].isSignMask())
+ return SDValue();
- return peekThroughBitcasts(Op0);
+ return peekThroughBitcasts(Op0);
+ }
+ }
}
return SDValue();
@@ -41642,8 +42740,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue LHS = N->getOperand(0);
- auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
+ if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
return SDValue();
X86::CondCode NewCC = X86::GetOppositeBranchCondition(
@@ -41817,8 +42914,9 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
- // Only perform optimizations if UnsafeMath is used.
- if (!DAG.getTarget().Options.UnsafeFPMath)
+ // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
+ if (!DAG.getTarget().Options.NoNaNsFPMath ||
+ !DAG.getTarget().Options.NoSignedZerosFPMath)
return SDValue();
// With both flags set, convert the FMAX and FMIN nodes
@@ -41943,6 +43041,7 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ // FIXME: Handle strict fp nodes.
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
@@ -41951,7 +43050,7 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
- LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
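+ // Cast 'In', the value whose load was matched above, rather than
+ // re-reading operand 0.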
+ LoadSDNode *LN = cast<LoadSDNode>(In);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDLoc dl(N);
@@ -42569,6 +43668,44 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
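+/// For example: (or (or (xor A, B), (xor C, D)), (xor E, F)).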
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+ if (X.getOpcode() == ISD::OR)
+ return isOrXorXorTree(X.getOperand(0), false) &&
+ isOrXorXorTree(X.getOperand(1), false);
+ if (Root)
+ return false;
+ return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
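+/// Each XOR leaf is converted to vector form via SToV and becomes either a
+/// vector compare or a plain vector XOR (when PTEST is available); OR nodes
+/// merge the partial results.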
+template<typename F>
+static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
+ EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
+ SDValue Op0 = X.getOperand(0);
+ SDValue Op1 = X.getOperand(1);
+ if (X.getOpcode() == ISD::OR) {
+ SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ if (VecVT != CmpVT)
+ return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+ if (HasPT)
+ return DAG.getNode(ISD::OR, DL, VecVT, A, B);
+ return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+ } else if (X.getOpcode() == ISD::XOR) {
+ SDValue A = SToV(Op0);
+ SDValue B = SToV(Op1);
+ if (VecVT != CmpVT)
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+ if (HasPT)
+ return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+ }
+ llvm_unreachable("Impossible");
+}
+
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
@@ -42589,10 +43726,8 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
- bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
- X.getOperand(0).getOpcode() == ISD::XOR &&
- X.getOperand(1).getOpcode() == ISD::XOR;
- if (isNullConstant(Y) && !IsOrXorXorCCZero)
+ bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+ if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
@@ -42602,66 +43737,102 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
X.getOpcode() == ISD::LOAD;
};
if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
- !IsOrXorXorCCZero)
+ !IsOrXorXorTreeCCZero)
return SDValue();
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
bool HasAVX = Subtarget.hasAVX();
- // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512.
+ // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
+ // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && HasAVX) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
+
+ // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
+ // vector registers are essentially free. (Technically, widening registers
+ // prevents load folding, but the tradeoff is worth it.)
+ bool PreferKOT = Subtarget.preferMaskRegisters();
+ bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
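+ // Without VLX there are no 128/256-bit compares into mask registers, so
+ // narrower operands must first be widened to 512 bits (NeedZExt).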
+
EVT VecVT = MVT::v16i8;
- EVT CmpVT = MVT::v16i8;
- if (OpSize == 256)
- VecVT = CmpVT = MVT::v32i8;
- if (OpSize == 512) {
+ EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
+ if (OpSize == 256) {
+ VecVT = MVT::v32i8;
+ CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
+ }
+ EVT CastVT = VecVT;
+ bool NeedsAVX512FCast = false;
+ if (OpSize == 512 || NeedZExt) {
if (Subtarget.hasBWI()) {
VecVT = MVT::v64i8;
CmpVT = MVT::v64i1;
+ if (OpSize == 512)
+ CastVT = VecVT;
} else {
VecVT = MVT::v16i32;
CmpVT = MVT::v16i1;
+ CastVT = OpSize == 512 ? VecVT :
+ OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
+ NeedsAVX512FCast = true;
+ }
+ }
+
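+ // ScalarToVector bitcasts a scalar operand to CastVT and, when widening is
+ // required, zero-extends it into a full VecVT vector via INSERT_SUBVECTOR.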
+ auto ScalarToVector = [&](SDValue X) -> SDValue {
+ bool TmpZext = false;
+ EVT TmpCastVT = CastVT;
+ if (X.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue OrigX = X.getOperand(0);
+ unsigned OrigSize = OrigX.getScalarValueSizeInBits();
+ if (OrigSize < OpSize) {
+ if (OrigSize == 128) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
+ X = OrigX;
+ TmpZext = true;
+ } else if (OrigSize == 256) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
+ X = OrigX;
+ TmpZext = true;
+ }
+ }
}
- }
+ X = DAG.getBitcast(TmpCastVT, X);
+ if (!NeedZExt && !TmpZext)
+ return X;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
+ DAG.getConstant(0, DL, VecVT), X,
+ DAG.getConstant(0, DL, VecIdxVT));
+ };
SDValue Cmp;
- if (IsOrXorXorCCZero) {
+ if (IsOrXorXorTreeCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before doing a
// MOVMSK.
- SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
- SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
- SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
- SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
- if (VecVT == CmpVT && HasPT) {
- SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B);
- SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D);
- Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2);
- } else {
- SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
- SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
- Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
- }
+ Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
} else {
- SDValue VecX = DAG.getBitcast(VecVT, X);
- SDValue VecY = DAG.getBitcast(VecVT, Y);
- if (VecVT == CmpVT && HasPT) {
+ SDValue VecX = ScalarToVector(X);
+ SDValue VecY = ScalarToVector(Y);
+ if (VecVT != CmpVT) {
+ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
+ } else if (HasPT) {
Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
} else {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
}
}
- // For 512-bits we want to emit a setcc that will lower to kortest.
+ // AVX512 should emit a setcc that will lower to kortest.
if (VecVT != CmpVT) {
- EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16;
- SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT);
- return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC);
+ EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
+ CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
+ return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
+ DAG.getConstant(0, DL, KRegVT), CC);
}
if (HasPT) {
SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
@@ -42687,9 +43858,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
+ const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ const SDValue LHS = N->getOperand(0);
+ const SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
@@ -42716,30 +43887,35 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
- // Put build_vectors on the right.
- if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
- std::swap(LHS, RHS);
- CC = ISD::getSetCCSwappedOperands(CC);
+ // Using temporaries to avoid messing up operand ordering for later
+ // transformations if this doesn't work.
+ SDValue Op0 = LHS;
+ SDValue Op1 = RHS;
+ ISD::CondCode TmpCC = CC;
+ // Put build_vector on the right.
+ if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
+ std::swap(Op0, Op1);
+ TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
}
bool IsSEXT0 =
- (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
- bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+ (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
+ (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
if (IsSEXT0 && IsVZero1) {
- assert(VT == LHS.getOperand(0).getValueType() &&
+ assert(VT == Op0.getOperand(0).getValueType() &&
"Uexpected operand type");
- if (CC == ISD::SETGT)
+ if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
- if (CC == ISD::SETLE)
+ if (TmpCC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
- if (CC == ISD::SETEQ || CC == ISD::SETGE)
- return DAG.getNOT(DL, LHS.getOperand(0), VT);
+ if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
+ return DAG.getNOT(DL, Op0.getOperand(0), VT);
- assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+ assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
"Unexpected condition code!");
- return LHS.getOperand(0);
+ return Op0.getOperand(0);
}
}
@@ -42752,8 +43928,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
VT.getVectorElementType() == MVT::i1 &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
- SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
- N->getOperand(2));
+ SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
}
@@ -42985,16 +44160,18 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
- if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
- N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
- VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+ if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
+ Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
+ VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
- if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
@@ -43004,12 +44181,19 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
- SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+ SDValue SourceConst;
+ if (IsStrict)
+ SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
+ {N->getOperand(0), SDValue(BV, 0)});
+ else
+ SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
- SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
- N->getOperand(0)->getOperand(0), MaskConst);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
+ MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
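+ // For the strict form, also return the chain produced by the folded
+ // conversion.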
+ if (IsStrict)
+ return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
return Res;
}
@@ -43053,7 +44237,8 @@ static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
+ bool IsStrict = N->isStrictFPOpcode();
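+ // Strict FP nodes carry their chain in operand 0; the value operand
+ // follows it.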
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -43067,14 +44252,21 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
- if (DAG.SignBitIsZero(Op0))
+ if (DAG.SignBitIsZero(Op0)) {
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
+ {N->getOperand(0), Op0});
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+ }
return SDValue();
}
@@ -43084,11 +44276,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
+ bool IsStrict = N->isStrictFPOpcode();
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
- SDValue Op0 = N->getOperand(0);
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -43100,6 +44293,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
@@ -43117,6 +44313,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), Trunc});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
}
// If we're after legalize and the type is v2i32 we need to shuffle and
@@ -43125,6 +44324,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
{ 0, 2, -1, -1 });
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {N->getOperand(0), Shuf});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
}
}
@@ -43148,13 +44350,16 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Ld->isSimple() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
- SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
+ std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
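+ // BuildFILD returns the {FILD result, output chain} pair.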
- DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
- return FILDChain;
+ DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
+ return Tmp.first;
}
}
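+ // The remaining folds are not strict-FP aware, so stop here for strict
+ // nodes.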
+ if (IsStrict)
+ return SDValue();
+
if (SDValue V = combineToFPTruncExtElt(N, DAG))
return V;
@@ -43579,7 +44784,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
auto UsePMADDWD = [&](SDValue Op) {
ShrinkMode Mode;
return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 &&
+ canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
+ Mode != ShrinkMode::MULU16 &&
(!Subtarget.hasSSE41() ||
(Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
Op->isOnlyUserOf(Op.getOperand(1).getNode())));
@@ -43784,7 +44990,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
// Check if the Mul source can be safely shrunk.
ShrinkMode Mode;
- if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
+ if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
+ Mode == ShrinkMode::MULU16)
return SDValue();
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
@@ -44468,7 +45675,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue InVec = N->getOperand(0);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
- EVT InVecBCVT = InVecBC.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
@@ -44512,31 +45718,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
- // Try to move vector bitcast after extract_subv by scaling extraction index:
- // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
- // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
- if (InVec != InVecBC && InVecBCVT.isVector()) {
- unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
- unsigned DestNumElts = InVecVT.getVectorNumElements();
- if ((DestNumElts % SrcNumElts) == 0) {
- unsigned DestSrcRatio = DestNumElts / SrcNumElts;
- if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
- unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
- EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
- InVecBCVT.getScalarType(), NewExtNumElts);
- if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
- TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
- unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
- SDLoc DL(N);
- SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
- SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- InVecBC, NewIndex);
- return DAG.getBitcast(VT, NewExtract);
- }
- }
- }
- }
-
// If we are extracting from an insert into a zero vector, replace with a
// smaller insert into zero if we don't access less than the original
// subvector. Don't do this for i1 vectors.
@@ -44583,7 +45764,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTUDQ2PD(v4i32).
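+ // The 128-bit form of VCVTUDQ2PD requires AVX512VL, hence the hasVLX()
+ // check.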
- if (InOpcode == ISD::UINT_TO_FP &&
+ if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
@@ -44751,6 +45932,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
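+ // A shift of an all-zeros mask is still all zeros.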
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -44802,8 +45986,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
- case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget);
- case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ return combineSIntToFP(N, DAG, DCI, Subtarget);
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);