path: root/llvm/lib/Target/X86/X86ISelLowering.cpp
author    Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
commit    706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree      4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target/X86/X86ISelLowering.cpp
parent    7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Vendor import of llvm-project master e26a78e70, the last commit before
the llvmorg-11-init tag, from which release/10.x was branched.
Notes:
svn path=/vendor/llvm-project/master/; revision=356843
svn path=/vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085/; revision=356844; tag=vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  | 4018
1 file changed, 2603 insertions(+), 1415 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ed975e9248a8..0f152968ddfd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25,7 +25,9 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -154,17 +156,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
- if (Subtarget.isTargetDarwin()) {
- // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
- setUseUnderscoreSetJmp(false);
- setUseUnderscoreLongJmp(false);
- } else if (Subtarget.isTargetWindowsGNU()) {
- // MS runtime is weird: it exports _setjmp, but longjmp!
- setUseUnderscoreSetJmp(true);
- setUseUnderscoreLongJmp(false);
- } else {
- setUseUnderscoreSetJmp(true);
- setUseUnderscoreLongJmp(true);
+ if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
}
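[Editor's note] A hedged aside on the mechanism: clearing a libcall name tells the legalizer that no __powisf2/__powidf2 runtime routine exists, so FPOWI cannot be lowered to that call. A minimal standalone sketch (plain C++, not the LLVM lowering itself) of the fallback the comment names:

    #include <cmath>
    // Sketch only: MSVCRT ships pow but not the powi compiler-rt routines,
    // so x^n for integral n can still be routed through the pow libcall.
    double powi_via_pow(double x, int n) {
      return std::pow(x, static_cast<double>(n));
    }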
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
@@ -217,72 +212,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
- // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
- // operation.
- setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
-
if (!Subtarget.useSoftFloat()) {
- // We have an algorithm for SSE2->double, and we turn this into a
- // 64-bit FILD followed by conditional FADD for other targets.
- setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
- }
-
- // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
- // this operation.
- setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
- // SSE has no i16 to fp conversion, only i32.
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
- }
- } else {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
- }
-
- // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
- // this operation.
- setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
- setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
+ // SSE has no i16 to fp conversion, only i32. We promote in the handler
+ // to allow f80 to use i16 and f64 to use i16 with sse1 only
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
+ // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
-
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
- }
-
- // Handle FP_TO_UINT by promoting the destination to a larger signed
- // conversion.
- setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
- }
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+ }
+
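[Editor's note] The Promote entries above rely on a standard trick worth spelling out: a narrow unsigned value zero-extended into a wider signed type stays non-negative, so the signed conversion of the wider type produces exactly the unsigned result. A standalone sketch in plain C++ (not the SelectionDAG code):

    #include <cstdint>
    // u8/u16 -> f32 has no direct x86 instruction. Zero-extending into i32
    // keeps the value non-negative, so the *signed* convert (cvtsi2ss)
    // yields exactly the unsigned result -- which is what Promote arranges.
    float u16_to_float(uint16_t v) {
      int32_t widened = v;                 // zero-extend; range [0, 65535]
      return static_cast<float>(widened);  // signed convert == unsigned here
    }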
+ // Handle address space casts between mixed sized pointers.
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
@@ -409,12 +401,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
- // These should be promoted to a larger select which is supported.
- setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
@@ -619,6 +611,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
+ // Handle constrained floating-point operations of scalar.
+ setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
@@ -659,6 +665,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Expand);
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
+
+ // Handle constrained floating-point operations of scalar.
+ setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
+ // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
+ // as Custom.
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
}
// f128 uses xmm registers, but most operations require libcalls.
@@ -668,22 +685,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
- setOperationAction(ISD::FADD, MVT::f128, Custom);
- setOperationAction(ISD::FSUB, MVT::f128, Custom);
- setOperationAction(ISD::FDIV, MVT::f128, Custom);
- setOperationAction(ISD::FMUL, MVT::f128, Custom);
- setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::FMA, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
- setOperationAction(ISD::FSIN, MVT::f128, Expand);
- setOperationAction(ISD::FCOS, MVT::f128, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
- setOperationAction(ISD::FSQRT, MVT::f128, Expand);
-
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
+ // No STRICT_FSINCOS
+ setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
// So we have to list all legal floating point result types here.
@@ -820,12 +847,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -895,6 +925,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
@@ -933,37 +965,38 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
-
- // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into
- // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
- // split again based on the input type, this will cause an AssertSExt i16 to
- // be emitted instead of an AssertZExt. This will allow packssdw followed by
- // packuswb to be used to truncate to v8i8. This is necessary since packusdw
- // isn't available until sse4.1.
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+ for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
+ }
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
@@ -1008,6 +1041,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// With AVX512, expanding (and promoting the shifts) is better.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -1029,11 +1068,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
- setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
- setOperationAction(ISD::FCEIL, RoundedTy, Legal);
- setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
- setOperationAction(ISD::FRINT, RoundedTy, Legal);
- setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1072,6 +1116,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+
+ if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
+ // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
+ // do the pre and post work in the vector domain.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
+ // We need to mark SINT_TO_FP as Custom even though we want to expand it
+ // so that DAG combine doesn't try to turn it into uint_to_fp.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
@@ -1105,25 +1160,45 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
- setOperationAction(ISD::FNEG, VT, Custom);
- setOperationAction(ISD::FABS, VT, Custom);
- setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
-
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
@@ -1169,6 +1244,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
@@ -1180,8 +1257,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
- MVT::v2f64, MVT::v4f64 })
+ MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ }
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1233,6 +1312,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
@@ -1299,12 +1379,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
@@ -1331,6 +1417,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
@@ -1372,21 +1460,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
-
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom);
+ for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
+ setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
+ }
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
@@ -1420,11 +1524,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
}
@@ -1459,6 +1568,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
@@ -1470,8 +1581,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
@@ -1532,13 +1647,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
- // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
@@ -1563,12 +1690,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::SINT_TO_FP, VT, Legal);
- setOperationAction(ISD::UINT_TO_FP, VT, Legal);
- setOperationAction(ISD::FP_TO_SINT, VT, Legal);
- setOperationAction(ISD::FP_TO_UINT, VT, Legal);
-
- setOperationAction(ISD::MUL, VT, Legal);
+ setOperationAction(ISD::SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MUL, VT, Legal);
}
}
@@ -1739,12 +1877,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
- setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
@@ -1828,8 +1968,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
- {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
- ISD::FLOG10, ISD::FPOW, ISD::FSIN})
+ {ISD::FCEIL, ISD::STRICT_FCEIL,
+ ISD::FCOS, ISD::STRICT_FCOS,
+ ISD::FEXP, ISD::STRICT_FEXP,
+ ISD::FFLOOR, ISD::STRICT_FFLOOR,
+ ISD::FREM, ISD::STRICT_FREM,
+ ISD::FLOG, ISD::STRICT_FLOG,
+ ISD::FLOG10, ISD::STRICT_FLOG10,
+ ISD::FPOW, ISD::STRICT_FPOW,
+ ISD::FSIN, ISD::STRICT_FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
@@ -1870,6 +2017,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1901,6 +2050,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
+
+ // Default to having -disable-strictnode-mutation on
+ IsStrictFPEnabled = true;
}
// This has so far only been implemented for 64-bit MachO.
@@ -1910,7 +2062,7 @@ bool X86TargetLowering::useLoadStackGuardNode() const {
bool X86TargetLowering::useStackGuardXorFP() const {
// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
- return Subtarget.getTargetTriple().isOSMSVCRT();
+ return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
}
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
@@ -1946,9 +2098,13 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return MVT::i8;
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall)
+ return MVT::v32i1;
// FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
- Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
+ Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return MVT::v16i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -1966,9 +2122,13 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return VT.getVectorNumElements();
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall)
+ return 2;
// FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
- Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
+ Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
@@ -1988,6 +2148,15 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
return NumIntermediates;
}
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall) {
+ RegisterVT = MVT::v32i1;
+ IntermediateVT = MVT::v32i1;
+ NumIntermediates = 2;
+ return 2;
+ }
+
return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
NumIntermediates, RegisterVT);
}
@@ -2383,6 +2552,10 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
+ const TargetMachine &TM = getTargetMachine();
+ if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
+ return false;
+
return SrcAS < 256 && DestAS < 256;
}
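[Editor's note] A sketch of the combined rule after this change: both sides must use the same pointer width, and both must be "flat" address spaces (on x86, address spaces 256/257/258 select the gs/fs/ss segments and are never no-op castable). Standalone model, with pointer widths passed in rather than queried from the TargetMachine:

    bool isNoopCast(unsigned SrcAS, unsigned DstAS,
                    unsigned SrcPtrBits, unsigned DstPtrBits) {
      if (SrcPtrBits != DstPtrBits) // mixed-size pointers need a real ext/trunc
        return false;
      return SrcAS < 256 && DstAS < 256;
    }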
@@ -2520,18 +2693,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
- // If this is x86-64, and we disabled SSE, we can't return FP values,
- // or SSE or MMX vectors.
- if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
- VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ // Report an error if we have attempted to return a value via an XMM
+ // register and SSE was disabled.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
- } else if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ ValVT == MVT::f64) {
+ // When returning a double via an XMM register, report an error if SSE2 is
+ // not enabled.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
@@ -2826,7 +2997,6 @@ SDValue X86TargetLowering::LowerCallResult(
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
@@ -2845,15 +3015,22 @@ SDValue X86TargetLowering::LowerCallResult(
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
- // If this is x86-64, and we disabled SSE, we can't return FP values
- if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
- ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
+ // Report an error if there was an attempt to return FP values via XMM
+ // registers.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
- VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
- } else if (CopyVT == MVT::f64 &&
- (Is64Bit && !Subtarget.hasSSE2())) {
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ CopyVT == MVT::f64) {
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
- VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -2895,6 +3072,9 @@ SDValue X86TargetLowering::LowerCallResult(
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ Val = DAG.getBitcast(VA.getValVT(), Val);
+
InVals.push_back(Val);
}
@@ -2993,9 +3173,7 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
- auto Attr =
- CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
- if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+ if (!CI->isTailCall())
return false;
ImmutableCallSite CS(CI);
@@ -3464,8 +3642,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
- // Conservatively forward AL on x86_64, since it might be used for varargs.
- if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ // Forward AL for SysV x86_64 targets, since it is used for varargs.
+ if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
@@ -3618,7 +3796,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
@@ -3634,9 +3811,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
- if (Attr.getValueAsString() == "true")
- isTailCall = false;
-
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
@@ -3728,7 +3902,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"the only memory argument");
}
- if (!IsSibcall)
+ if (!IsSibcall && !IsMustTail)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
@@ -4013,7 +4187,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
- if (!IsSibcall && isTailCall) {
+ if (!IsSibcall && isTailCall && !IsMustTail) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
@@ -4183,23 +4357,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// Make the stack size aligned, e.g. to 16n + 12, for a 16-byte alignment
/// requirement.
unsigned
-X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
- SelectionDAG& DAG) const {
- const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlignment = TFI.getStackAlignment();
- uint64_t AlignMask = StackAlignment - 1;
- int64_t Offset = StackSize;
- unsigned SlotSize = RegInfo->getSlotSize();
- if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
- // Number smaller than 12 so just add the difference.
- Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
- } else {
- // Mask out lower bits, add stackalignment once plus the 12 bytes.
- Offset = ((~AlignMask) & Offset) + StackAlignment +
- (StackAlignment-SlotSize);
- }
- return Offset;
+X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
+ SelectionDAG &DAG) const {
+ const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
+ const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
+ assert(StackSize % SlotSize == 0 &&
+ "StackSize must be a multiple of SlotSize");
+ return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}
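[Editor's note] The rewritten helper deserves a worked example. Same arithmetic in standalone form: round StackSize + SlotSize up to the stack alignment, then take the slot back off, producing the smallest size >= StackSize of the form Align*n + (Align - SlotSize).

    #include <cstdint>
    uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                 uint64_t Align) {
      uint64_t Rounded = (StackSize + SlotSize + Align - 1) / Align * Align;
      return Rounded - SlotSize;
    }
    // e.g. alignedArgStackSize(32, 4, 16) == 44 (16*2 + 12), and
    //      alignedArgStackSize(44, 4, 16) == 44 (already in the right form).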
/// Return true if the given stack call argument is already available in the
@@ -4643,8 +4807,8 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
}
}
-/// Return true if the condition is an unsigned comparison operation.
-static bool isX86CCUnsigned(unsigned X86CC) {
+/// Return true if the condition is a signed comparison operation.
+static bool isX86CCSigned(unsigned X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
@@ -4654,12 +4818,12 @@ static bool isX86CCUnsigned(unsigned X86CC) {
case X86::COND_A:
case X86::COND_BE:
case X86::COND_AE:
- return true;
+ return false;
case X86::COND_G:
case X86::COND_GE:
case X86::COND_L:
case X86::COND_LE:
- return false;
+ return true;
}
}
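[Editor's note] A compact mirror of the renamed predicate's table, as a sketch: G/GE/L/LE come from signed compares, B/A/BE/AE from unsigned ones, and E/NE carry no signedness at all.

    enum CC { E, NE, B, A, BE, AE, G, GE, L, LE };
    bool isSignedCC(CC C) {
      switch (C) {
      case G: case GE: case L: case LE: return true;  // signed compares
      default:                          return false; // equality or unsigned
      }
    }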
@@ -4700,7 +4864,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
// X >= 0 -> X == 0, jump on !sign.
return X86::COND_NS;
}
- if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) {
+ if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
@@ -4949,12 +5113,6 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
-bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
- bool IsSigned) const {
- // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available.
- return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
-}
-
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
@@ -5334,15 +5492,18 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
+ bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
- SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
- if (TargetMask[i] == SM_SentinelUndef)
- continue;
- if (Zeroable[i])
- TargetMask[i] = SM_SentinelZero;
+ // Create an alternative mask with info about zeroable elements.
+ // Here we do not set undef elements as zeroable.
+ SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ if (V2IsZero) {
+ assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ for (int i = 0, Size = Mask.size(); i != Size; ++i)
+ if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ ZeroableMask[i] = SM_SentinelZero;
}
- return canWidenShuffleElements(TargetMask, WidenedMask);
+ return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
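[Editor's note] A small model of the folding loop in this new overload: when V2 is known all-zero, zeroable lanes become explicit zero sentinels before pair-widening is attempted. Sketch with the x86 sentinel values inlined:

    #include <vector>
    constexpr int SentinelUndef = -1, SentinelZero = -2;
    std::vector<int> foldZeroable(std::vector<int> Mask,
                                  const std::vector<bool> &Zeroable) {
      for (size_t i = 0; i < Mask.size(); ++i)
        if (Mask[i] != SentinelUndef && Zeroable[i])
          Mask[i] = SentinelZero; // matches the V2IsZero loop above
      return Mask;
    }
    // e.g. Mask {0,1,6,7} with Zeroable {0,0,1,1} -> {0,1,-2,-2}, which the
    // caller can then widen to the 2-element mask {0,-2}.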
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
@@ -5764,11 +5925,29 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- // Clear the upper bits of the subvector and move it to its insert position.
unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+
+ // Do an optimization for the most frequently used types.
+ if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
+ APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
+ Mask0.flipAllBits();
+ SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
+ SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
+ Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+
+ // Reduce to original width if needed.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ // Clear the upper bits of the subvector and move it to its insert position.
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
- unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
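[Editor's note] The fast path added above is easiest to see on plain integers, since k-registers are just bit vectors. A scalar model, with the KSHIFTL/KSHIFTR pair replaced by an explicit mask of the subvector:

    #include <cstdint>
    // Insert SubBits bits of SubVec into Vec at IdxVal:
    // (Vec & ~FieldMask) | (SubVec placed at IdxVal).
    uint64_t insertBits(uint64_t Vec, uint64_t SubVec, unsigned IdxVal,
                        unsigned SubBits) {
      uint64_t Low = SubBits >= 64 ? ~0ULL : ((1ULL << SubBits) - 1);
      uint64_t Field = Low << IdxVal; // bits [IdxVal, IdxVal + SubBits)
      return (Vec & ~Field) | ((SubVec & Low) << IdxVal);
    }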
@@ -5850,7 +6029,7 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
"Expected VTs to be the same size!");
unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
- std::max(128U, VT.getSizeInBits() / Scale));
+ std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
InVT = In.getValueType();
}
@@ -6719,9 +6898,97 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return true;
}
+/// Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static void computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ APInt &KnownUndef, APInt &KnownZero) {
+ int Size = Mask.size();
+ KnownUndef = KnownZero = APInt::getNullValue(Size);
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ int VectorSizeInBits = V1.getValueSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Size;
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+ for (int i = 0; i < Size; ++i) {
+ int M = Mask[i];
+ // Handle the easy cases.
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ continue;
+ }
+ if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ KnownZero.setBit(i);
+ continue;
+ }
+
+ // Determine shuffle input and normalize the mask.
+ SDValue V = M < Size ? V1 : V2;
+ M %= Size;
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+ // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
+ // the (larger) source element must be UNDEF/ZERO.
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef())
+ KnownUndef.setBit(i);
+ if (X86::isZeroNode(Op))
+ KnownZero.setBit(i);
+ else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = Cst->getAPIntValue();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt Val = Cst->getValueAPF().bitcastToAPInt();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ }
+ continue;
+ }
+
+ // If the BUILD_VECTOR has more elements, then all the (smaller) source
+ // elements must be UNDEF or ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllUndef = true;
+ bool AllZero = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllUndef &= Op.isUndef();
+ AllZero &= X86::isZeroNode(Op);
+ }
+ if (AllUndef)
+ KnownUndef.setBit(i);
+ if (AllZero)
+ KnownZero.setBit(i);
+ continue;
+ }
+ }
+}
+
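[Editor's note] A simplified standalone model of the easy cases in the routine above (the BUILD_VECTOR constant scans are omitted): undef mask entries and lanes that read a known all-zero input.

    #include <vector>
    void computeZeroable(const std::vector<int> &Mask, bool V1IsZero,
                         bool V2IsZero, std::vector<bool> &Undef,
                         std::vector<bool> &Zero) {
      int Size = (int)Mask.size();
      Undef.assign(Size, false);
      Zero.assign(Size, false);
      for (int i = 0; i < Size; ++i) {
        int M = Mask[i];
        if (M < 0) { Undef[i] = true; continue; } // undef mask entry
        if ((M < Size && V1IsZero) || (M >= Size && V2IsZero))
          Zero[i] = true;                         // reads an all-zero input
      }
    }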
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
+/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
APInt &KnownUndef, APInt &KnownZero) {
@@ -6741,7 +7008,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
- assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+ assert((VT.getSizeInBits() % Size) == 0 &&
"Illegal split of shuffle value type");
unsigned EltSizeInBits = VT.getSizeInBits() / Size;
@@ -6810,7 +7077,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
const APInt &KnownUndef,
- const APInt &KnownZero) {
+ const APInt &KnownZero,
+ bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
assert(KnownUndef.getBitWidth() == NumElts &&
KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6818,7 +7086,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
for (unsigned i = 0; i != NumElts; ++i) {
if (KnownUndef[i])
Mask[i] = SM_SentinelUndef;
- else if (KnownZero[i])
+ else if (ResolveKnownZeros && KnownZero[i])
Mask[i] = SM_SentinelZero;
}
}
@@ -8306,7 +8574,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -8552,7 +8820,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
} else {
- MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U));
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
DstVec = DAG.getBitcast(VecVT, Imm);
@@ -10130,13 +10398,18 @@ static bool isNoopShuffleMask(ArrayRef<int> Mask) {
return true;
}
-/// Test whether there are elements crossing 128-bit lanes in this
+/// Test whether there are elements crossing LaneSizeInBits lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
-static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
- int LaneSize = 128 / VT.getScalarSizeInBits();
+static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask) {
+ assert(LaneSizeInBits && ScalarSizeInBits &&
+ (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+ "Illegal shuffle lane size");
+ int LaneSize = LaneSizeInBits / ScalarSizeInBits;
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
@@ -10144,6 +10417,12 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return false;
}
+/// Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
+}
+
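// A minimal standalone check in the same shape as isLaneCrossingShuffleMask
// (hedged sketch; the v8i32-style mask below is a made-up example that
// reverses elements within each 128-bit lane, so it does not cross lanes).
#include <cstdio>

int main() {
  const int LaneSize = 128 / 32; // four 32-bit elements per 128-bit lane
  const int Size = 8;
  const int Mask[8] = {0, 1, 2, 3, 7, 6, 5, 4};
  bool Crossing = false;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      Crossing = true;
  std::printf("lane crossing: %s\n", Crossing ? "yes" : "no"); // "no"
  return 0;
}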
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
@@ -10424,84 +10703,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
-/// Compute whether each element of a shuffle is zeroable.
-///
-/// A "zeroable" vector shuffle element is one which can be lowered to zero.
-/// Either it is an undef element in the shuffle mask, the element of the input
-/// referenced is undef, or the element of the input referenced is known to be
-/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
-/// as many lanes with this technique as possible to simplify the remaining
-/// shuffle.
-static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- APInt Zeroable(Mask.size(), 0);
- V1 = peekThroughBitcasts(V1);
- V2 = peekThroughBitcasts(V2);
-
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
-
- int VectorSizeInBits = V1.getValueSizeInBits();
- int ScalarSizeInBits = VectorSizeInBits / Mask.size();
- assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
-
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- // Handle the easy cases.
- if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable.setBit(i);
- continue;
- }
-
- // Determine shuffle input and normalize the mask.
- SDValue V = M < Size ? V1 : V2;
- M %= Size;
-
- // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
- if (V.getOpcode() != ISD::BUILD_VECTOR)
- continue;
-
- // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
- // the (larger) source element must be UNDEF/ZERO.
- if ((Size % V.getNumOperands()) == 0) {
- int Scale = Size / V->getNumOperands();
- SDValue Op = V.getOperand(M / Scale);
- if (Op.isUndef() || X86::isZeroNode(Op))
- Zeroable.setBit(i);
- else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
- APInt Val = Cst->getAPIntValue();
- Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
- Val = Val.getLoBits(ScalarSizeInBits);
- if (Val == 0)
- Zeroable.setBit(i);
- } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
- APInt Val = Cst->getValueAPF().bitcastToAPInt();
- Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
- Val = Val.getLoBits(ScalarSizeInBits);
- if (Val == 0)
- Zeroable.setBit(i);
- }
- continue;
- }
-
- // If the BUILD_VECTOR has more elements then all the (smaller) source
- // elements must be UNDEF or ZERO.
- if ((V.getNumOperands() % Size) == 0) {
- int Scale = V->getNumOperands() / Size;
- bool AllZeroable = true;
- for (int j = 0; j < Scale; ++j) {
- SDValue Op = V.getOperand((M * Scale) + j);
- AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
- }
- if (AllZeroable)
- Zeroable.setBit(i);
- continue;
- }
- }
-
- return Zeroable;
-}
-
// The shuffle result is as follows:
// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
// Each Zeroable element corresponds to a particular Mask element.
@@ -10616,11 +10817,11 @@ static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
-static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &UnpackOpcode, bool IsUnary,
- ArrayRef<int> TargetMask,
- const SDLoc &DL, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
@@ -10728,8 +10929,8 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
-static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
- int Delta) {
+static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+ int Delta) {
int Size = (int)Mask.size();
int Split = Size / Delta;
int TruncatedVectorStart = SwappedOps ? Size : 0;
@@ -10814,8 +11015,8 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
// The first half/quarter of the mask should refer to every second/fourth
// element of the vector truncated and bitcasted.
- if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
- !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+ if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+ !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
return SDValue();
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
@@ -10823,11 +11024,10 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
-static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
- SDValue &V2, unsigned &PackOpcode,
- ArrayRef<int> TargetMask,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
+ unsigned &PackOpcode, ArrayRef<int> TargetMask,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
@@ -10880,8 +11080,8 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
- if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
- Subtarget))
+ if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget))
return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
DAG.getBitcast(PackVT, V2));
@@ -10972,10 +11172,10 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
-static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
- MutableArrayRef<int> Mask,
- const APInt &Zeroable, bool &ForceV1Zero,
- bool &ForceV2Zero, uint64_t &BlendMask) {
+static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> Mask,
+ const APInt &Zeroable, bool &ForceV1Zero,
+ bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
@@ -11038,8 +11238,8 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 64> Mask(Original.begin(), Original.end());
- if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
- BlendMask))
+ if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
+ BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
@@ -11161,7 +11361,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
@@ -11609,9 +11809,11 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
}
/// Try to lower a vector shuffle as a byte shift sequence.
-static SDValue lowerVectorShuffleAsByteShiftMask(
- const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(VT.is128BitVector() && "Only 128-bit vectors supported");
@@ -14056,8 +14258,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return BitBlend;
// Try to use byte shift instructions to mask.
- if (SDValue V = lowerVectorShuffleAsByteShiftMask(
- DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// Try to lower by permuting the inputs into an unpack instruction.
@@ -14318,8 +14520,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Try to use byte shift instructions to mask.
- if (SDValue V = lowerVectorShuffleAsByteShiftMask(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
@@ -14686,6 +14888,36 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
DAG);
}
+// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+// TODO: Extend to support v8f32 (+ 512-bit shuffles).
+static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
+
+ int LHSMask[4] = {-1, -1, -1, -1};
+ int RHSMask[4] = {-1, -1, -1, -1};
+ unsigned SHUFPMask = 0;
+
+ // As SHUFPD uses a single LHS/RHS element per lane, we can always
+ // perform the shuffle once the lanes have been shuffled in place.
+ for (int i = 0; i != 4; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ int LaneBase = i & ~1;
+ auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
+ LaneMask[LaneBase + (M & 1)] = M;
+ SHUFPMask |= (M & 1) << i;
+ }
+
+ SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
+ SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
+ DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
+}
+
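// Worked example of the immediate computation above (hedged sketch; the mask
// is a hypothetical v4f64 two-input shuffle). Result element i takes bit
// (M & 1) of its source lane element, which is the SHUFPD control encoding.
#include <cstdio>

int main() {
  const int Mask[4] = {0, 5, 2, 7};
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;
    int *LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (M & 1) << i;
  }
  std::printf("SHUFPD imm = 0x%X\n", SHUFPMask); // 0xA for this mask
  return 0;
}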
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
@@ -14764,13 +14996,22 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
int Size = Mask.size();
int LaneSize = Size / 2;
+ // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+ // Only do this if the elements aren't all from the lower lane,
+ // otherwise we're (probably) better off doing a split.
+ if (VT == MVT::v4f64 &&
+ !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
+ if (SDValue V =
+ lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
@@ -14778,7 +15019,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
- LaneUsed[(Mask[i] / LaneSize)] = true;
+ LaneUsed[(Mask[i] % Size) / LaneSize] = true;
if (!LaneUsed[0] || !LaneUsed[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
@@ -14817,8 +15058,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
+ bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
+
SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
+ if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
return SDValue();
bool IsLowZero = (Zeroable & 0x3) == 0x3;
@@ -15637,6 +15880,18 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Op;
+ // If we have lane crossing shuffles AND they don't all come from the lower
+ // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+ // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
+ // canonicalize to a blend of splat which isn't necessary for this combine.
+ if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
+ !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
+ (V1.getOpcode() != ISD::BUILD_VECTOR) &&
+ (V2.getOpcode() != ISD::BUILD_VECTOR))
+ if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
+ Mask, DAG))
+ return Op;
+
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
@@ -16950,6 +17205,10 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
+ // Fall back to scalarization. FIXME: We can do better if the shuffle
+ // can be partitioned cleanly.
+ if (!Subtarget.useBWIRegs())
+ return SDValue();
ExtVT = MVT::v64i8;
break;
}
@@ -17039,8 +17298,8 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
-static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
@@ -17086,29 +17345,22 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
+ APInt KnownUndef, KnownZero;
+ computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
+
+ APInt Zeroable = KnownUndef | KnownZero;
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
- // Create an alternative mask with info about zeroable elements.
- // Here we do not set undef elements as zeroable.
- SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
- if (V2IsZero) {
- assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
- for (int i = 0; i != NumElements; ++i)
- if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
- ZeroableMask[i] = SM_SentinelZero;
- }
-
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
- canWidenShuffleElements(ZeroableMask, WidenedMask)) {
+ canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
@@ -18307,7 +18559,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
@@ -18328,8 +18580,13 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Op.getOpcode() == ISD::SINT_TO_FP ||
- Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
- SDValue Src = Op.getOperand(0);
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
+ Op.getOpcode() == ISD::UINT_TO_FP) &&
+ "Unexpected opcode!");
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
@@ -18346,7 +18603,17 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
+ if (IsStrict) {
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
+ {Op.getOperand(0), InVec});
+ SDValue Chain = CvtVec.getValue(1);
+ SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Value, Chain}, dl);
+ }
+
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
}
@@ -18415,44 +18682,157 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
+static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
+
+ if (Subtarget.hasDQI()) {
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+
+ assert((Src.getSimpleValueType() == MVT::v2i64 ||
+ Src.getSimpleValueType() == MVT::v4i64) &&
+ "Unsupported custom type");
+
+ // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
+ assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
+ : DAG.getUNDEF(MVT::v8i64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
+ Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ if (VT != MVT::v4f32 || IsSigned)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
+ SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
+ SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
+ DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
+ DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
+ SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4);
+ SmallVector<SDValue, 4> Chains(4);
+ for (int i = 0; i != 4; ++i) {
+ SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+ DAG.getIntPtrConstant(i, DL));
+ if (IsStrict) {
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), Src});
+ Chains[i] = SignCvts[i].getValue(1);
+ } else {
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
+ }
+ }
+ SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
+
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
+ }
+
+ IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
+ SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cvt, Chain}, DL);
+
+ return Cvt;
+}
+
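// Hedged scalar model of the unsigned v4i64 -> v4f32 path above: when the
// sign bit is set, halve the value (keeping the low bit sticky so rounding
// is preserved), convert signed, then double the result with one add.
#include <cstdint>
#include <cstdio>

static float u64_to_f32(uint64_t V) {
  if ((int64_t)V >= 0)
    return (float)(int64_t)V;         // plain signed conversion suffices
  uint64_t Half = (V >> 1) | (V & 1); // Sign = (Src >> 1) | (Src & 1)
  float C = (float)(int64_t)Half;     // Half is now non-negative
  return C + C;                       // the "Slow" doubled result
}

int main() {
  uint64_t V = 0xFFFFFFFFFFFFFFFFull;
  std::printf("%f\n", u64_to_f32(V)); // ~18446744073709551616.0
  return 0;
}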
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
+ SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- if (VT == MVT::f128)
- return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
-
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+      // Note: Since v2f64 is a legal type, we don't need to zero extend the
+ // source for strict FP.
+ if (IsStrict)
+ return DAG.getNode(
+ X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(SrcVT))});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
+ if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
+
return SDValue();
}
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
+ bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
+
// These are really Legal; return the operand so the caller accepts it as
// Legal.
- if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
+ if (SrcVT == MVT::i32 && UseSSEReg)
return Op;
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit())
+ if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
return Op;
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
- SDValue ValueToStore = Op.getOperand(0);
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
- !Subtarget.is64Bit())
+ // SSE doesn't have an i16 conversion so we need to promote.
+ if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {Chain, Ext});
+
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
+ }
+
+ if (VT == MVT::f128)
+ return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+
+ SDValue ValueToStore = Src;
+ if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
@@ -18463,13 +18843,18 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Chain = DAG.getStore(
- DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+ Chain = DAG.getStore(
+ Chain, dl, ValueToStore, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
- return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+  std::pair<SDValue, SDValue> Tmp =
+      BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
}
-SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+std::pair<SDValue, SDValue>
+X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
@@ -18498,9 +18883,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue Result =
DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
Tys, FILDOps, SrcVT, LoadMMO);
+ Chain = Result.getValue(1);
if (useSSE) {
- Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// FIXME: Currently the FST is glued to the FILD_FLAG. This
@@ -18522,9 +18907,10 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
+ Chain = Result.getValue(1);
}
- return Result;
+ return { Result, Chain };
}
/// Horizontal vector math instructions may be slower than normal math with
@@ -18532,7 +18918,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool IsOptimizingSize = DAG.shouldOptForSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
@@ -18553,6 +18939,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
#endif
*/
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
@@ -18573,8 +18961,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
- SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
- Op.getOperand(0));
+ SDValue XR1 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
@@ -18587,51 +18975,81 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+ SDValue Sub;
+ SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ if (IsStrict) {
+ Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), XR2F, CLod1});
+ Chain = Sub.getValue(1);
+ } else
+ Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ if (!IsStrict && Subtarget.hasSSE3() &&
+ shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ // FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
- Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
+ if (IsStrict) {
+ Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
+ {Chain, Shuffle, Sub});
+ Chain = Result.getValue(1);
+ } else
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
+ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Result, Chain}, dl);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
- DAG.getIntPtrConstant(0, dl));
+ return Result;
}
/// 32-bit unsigned integer to float expansion.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
// Load the 32-bit value into an XMM register.
- SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- Op.getOperand(0));
+ SDValue Load =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
- Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
- DAG.getBitcast(MVT::v2f64, Load),
- DAG.getIntPtrConstant(0, dl));
-
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
- DAG.getBitcast(MVT::v2i64,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+ DAG.getBitcast(MVT::v2i64, Load),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
+ if (Op.getNode()->isStrictFPOpcode()) {
+ // Subtract the bias.
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Chain = Op.getOperand(0);
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
+ {Chain, Or, Bias});
+
+ if (Op.getValueType() == Sub.getValueType())
+ return Sub;
+
+ // Handle final rounding.
+ std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
+ Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
+
+ return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
+ }
+
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
@@ -18646,38 +19064,123 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
- SDValue N0 = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
- // Legalize to v4i32 type.
- N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
- DAG.getUNDEF(MVT::v2i32));
+ if (Subtarget.hasAVX512()) {
+ if (!Subtarget.hasVLX()) {
+ // Let generic type legalization widen this.
+ if (!IsStrict)
+ return SDValue();
+ // Otherwise pad the integer input with 0s and widen the operation.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getConstant(0, DL, MVT::v2i32));
+ SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), N0});
+ SDValue Chain = Res.getValue(1);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getMergeValues({Res, Chain}, DL);
+ }
- if (Subtarget.hasAVX512())
+ // Legalize to v4i32 type.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), N0});
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+ }
- // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
- // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
- SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
- SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
-
- // Two to the power of half-word-size.
- SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
-
- // Clear upper part of LO, lower HI.
- SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
- SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
-
- SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
- fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
- SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
+  // Zero extend to v2i64, OR with the floating point representation of 2^52.
+ // This gives us the floating point equivalent of 2^52 + the i32 integer
+ // since double has 52-bits of mantissa. Then subtract 2^52 in floating
+ // point leaving just our i32 integers in double format.
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
- // Add the two halves.
- return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
}
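// Scalar version of the 2^52 trick above (assumes IEEE-754 doubles): OR the
// 32-bit integer into the mantissa of 2^52, reinterpret the bits as a double,
// and subtract 2^52 so only the integer value remains. Exact for all uint32_t.
#include <cstdint>
#include <cstdio>
#include <cstring>

static double u32_to_f64(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ull | X; // 2^52 with X in the low mantissa
  double D;
  std::memcpy(&D, &Bits, sizeof(D)); // bit-for-bit reinterpretation
  return D - 4503599627370496.0;     // minus 2^52
}

int main() {
  std::printf("%.1f\n", u32_to_f64(0xFFFFFFFFu)); // 4294967295.0
  return 0;
}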
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue V = Op->getOperand(IsStrict ? 1 : 0);
+ MVT VecIntVT = V.getSimpleValueType();
+ assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+ "Unsupported custom type");
+
+ if (Subtarget.hasAVX512()) {
+ // With AVX512, but not VLX we need to widen to get a 512-bit result type.
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+ MVT VT = Op->getSimpleValueType(0);
+
+ // v8i32->v8f64 is legal with AVX512 so just return it.
+ if (VT == MVT::v8f64)
+ return Op;
+
+ assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp =
+ IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
+ V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), V});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
+ Op->getSimpleValueType(0) == MVT::v4f64) {
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
+ Constant *Bias = ConstantFP::get(
+ *DAG.getContext(),
+ APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
+ auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
+ SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
+ SDValue VBias = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ /*Alignment*/ 8, MachineMemOperand::MOLoad);
+
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
+ DAG.getBitcast(MVT::v4i64, VBias));
+ Or = DAG.getBitcast(MVT::v4f64, Or);
+
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
+ }
+
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
@@ -18690,18 +19193,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
- // We shouldn't use it when unsafe-fp-math is enabled though: we might later
- // reassociate the two FADDs, and if we do that, the algorithm fails
- // spectacularly (PR24512).
- // FIXME: If we ever have some kind of Machine FMF, this should be marked
- // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
- // there's also the MachineCombiner reassociations happening on Machine IR.
- if (DAG.getTarget().Options.UnsafeFPMath)
- return SDValue();
-
- SDLoc DL(Op);
- SDValue V = Op->getOperand(0);
- MVT VecIntVT = V.getSimpleValueType();
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something else than the supported type, e.g., to v4f64,
@@ -18709,9 +19200,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
- assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
- "Unsupported custom type");
-
// In the #idef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
@@ -18756,23 +19244,35 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
}
- // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
- SDValue VecCstFAdd = DAG.getConstantFP(
- APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
+ // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
+ SDValue VecCstFSub = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // NOTE: By using fsub of a positive constant instead of fadd of a negative
+ // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+ // enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
- SDValue FHigh =
- DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
- // return (float4) lo + fhi;
+ // (float4) lo;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
+ // return (float4) lo + fhi;
+ if (IsStrict) {
+ SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
+ {Op.getOperand(0), HighBitcast, VecCstFSub});
+ return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
+ {FHigh.getValue(1), LowBitcast, FHigh});
+ }
+
+ SDValue FHigh =
+ DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
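// Scalar model of the lo/hi split above (assumes IEEE-754 floats): the low
// 16 bits ride in the mantissa of 2^23, the high 16 bits in the mantissa of
// 2^39, and one fsub of the positive constant (2^39 + 2^23) cancels both
// biases, matching the PR24512 note above.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float FromBits(uint32_t B) {
  float F;
  std::memcpy(&F, &B, sizeof(F));
  return F;
}

static float u32_to_f32(uint32_t V) {
  float Lo = FromBits(0x4B000000u | (V & 0xFFFFu)); // 2^23 + lo16
  float Hi = FromBits(0x53000000u | (V >> 16));     // 2^39 + hi16 * 2^16
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);         // hi16 * 2^16 - 2^23
  return Lo + FHi;                                  // rounds once at the end
}

int main() {
  std::printf("%.1f\n", u32_to_f32(0x80000001u)); // 2147483648.0 after rounding
  return 0;
}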
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDValue N0 = Op.getOperand(0);
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
+ SDValue N0 = Op.getOperand(OpNo);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
@@ -18783,18 +19283,23 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
- assert(!Subtarget.hasAVX512());
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
+ case MVT::v2i64:
+ case MVT::v4i64:
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
}
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue N0 = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- MVT SrcVT = N0.getSimpleValueType();
- MVT DstVT = Op.getSimpleValueType();
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstVT = Op->getSimpleValueType(0);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
@@ -18814,8 +19319,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Promote i32 to i64 and use a signed conversion on 64-bit targets.
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
- N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0);
- return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0);
+ Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
+ {Chain, Src});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
}
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
@@ -18823,7 +19331,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
- if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
@@ -18832,23 +19340,28 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
- SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot, MachinePointerInfo());
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MachinePointerInfo());
- SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
- return Fild;
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
- SDValue ValueToStore = Op.getOperand(0);
- if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
+ SDValue ValueToStore = Src;
+ if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
- MachinePointerInfo());
+ }
+ SDValue Store =
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
@@ -18863,32 +19376,42 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
+ Chain = Fild.getValue(1);
- APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
- Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+ Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
- // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
+ APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
- ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
+ ConstantInt::get(*DAG.getContext(), FF), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
- // FIXME: Avoid the extend by constructing the right constant pool?
SDValue Fudge = DAG.getExtLoad(
- ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
+ ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
/* Alignment = */ 4);
+ Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
+ if (IsStrict) {
+ SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
+ {Chain, Fild, Fudge});
+ // STRICT_FP_ROUND can't handle equal types.
+ if (DstVT == MVT::f80)
+ return Add;
+ return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
+ {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
+ }
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
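// Hedged scalar model of the fudge-factor path above: FILD converts the i64
// as signed, and when the sign bit was set the extra f32 constant 0x5F800000
// (exactly 2^64) is added to recover the unsigned value.
#include <cstdint>
#include <cstdio>

static long double u64_to_f80(uint64_t V) {
  long double R = (long double)(int64_t)V; // signed FILD-style conversion
  if ((int64_t)V < 0)
    R += 18446744073709551616.0L;          // + 2^64 fudge
  return R;
}

int main() {
  std::printf("%.1Lf\n", u64_to_f80(0x8000000000000000ull)); // 2^63
  return 0;
}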
@@ -18902,11 +19425,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// result.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned) const {
+ bool IsSigned, SDValue &Chain) const {
+ bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
- EVT TheVT = Op.getOperand(0).getValueType();
+ SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
+ EVT TheVT = Value.getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
@@ -18920,6 +19445,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
@@ -18938,8 +19465,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Chain = DAG.getEntryNode();
- SDValue Value = Op.getOperand(0);
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
@@ -18949,8 +19476,9 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
- // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
- // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+ // Adjust = (Value < Thresh) ? 0 : 0x80000000;
+    // FltOfs = (Value < Thresh) ? 0 : Thresh;
+ // FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
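// Minimal scalar sketch of the fixup described above: values at or above
// 2^63 are offset into signed range before the FIST-style convert, and the
// 2^63 is re-applied by XOR'ing the sign bit into the integer result.
#include <cstdint>
#include <cstdio>

static uint64_t f64_to_u64(double Value) {
  const double Thresh = 9223372036854775808.0; // 2^63
  uint64_t Adjust = (Value < Thresh) ? 0 : 0x8000000000000000ull;
  double FltOfs = (Value < Thresh) ? 0.0 : Thresh;
  int64_t Fist = (int64_t)(Value - FltOfs); // signed conversion
  return (uint64_t)Fist ^ Adjust;           // re-apply the high bit
}

int main() {
  std::printf("%llu\n", (unsigned long long)f64_to_u64(1.0e19));
  return 0;
}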
@@ -18975,19 +19503,31 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
- SDValue Cmp = DAG.getSetCC(DL,
- getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT),
- Value, ThreshVal, ISD::SETLT);
+ EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT);
+ SDValue Cmp;
+ if (IsStrict) {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
+ Chain, /*IsSignaling*/ true);
+ Chain = Cmp.getValue(1);
+ } else {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
+ }
+
Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
DAG.getConstant(0, DL, MVT::i64),
DAG.getConstant(APInt::getSignMask(64),
DL, MVT::i64));
- SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
- Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT),
- Value, ThreshVal, ISD::SETLT);
- Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
+ DAG.getConstantFP(0.0, DL, TheVT),
+ ThreshVal);
+
+ if (IsStrict) {
+ Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
+ { Chain, Value, FltOfs });
+ Chain = Value.getValue(1);
+ } else
+ Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
}
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
@@ -19017,6 +19557,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Ops, DstTy, MMO);
SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
+ Chain = Res.getValue(1);
// If we need an unsigned fixup, XOR the result with adjust.
if (UnsignedFixup)
@@ -19036,7 +19577,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
"Unexpected extension opcode");
- assert(VT.getVectorNumElements() == VT.getVectorNumElements() &&
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
@@ -19512,48 +20053,137 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
- bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
- MVT VT = Op.getSimpleValueType();
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
- if (SrcVT == MVT::f128) {
- RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(SrcVT, VT);
- else
- LC = RTLIB::getFPTOUINT(SrcVT, VT);
-
- MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first;
- }
-
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
if (!IsSigned && !Subtarget.hasVLX()) {
+ assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
TruncVT = MVT::v8i1;
- Opc = ISD::FP_TO_UINT;
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
- DAG.getUNDEF(MVT::v8f64),
- Src, DAG.getIntPtrConstant(0, dl));
+ Opc = Op.getOpcode();
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
+ : DAG.getUNDEF(MVT::v8f64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res =
+ DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, ResVT, Src);
}
- SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
+
Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
- DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
+ if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && "Requires avx512f");
+ return Op;
+ }
+
+ // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
+ if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
+ (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
+ "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
+ if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
+ assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
+ !Subtarget.hasVLX() && "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
}
- assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
- return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
- DAG.getUNDEF(MVT::v2f32)));
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
+ : X86ISD::STRICT_CVTTP2UI;
+ return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
+ }
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ return DAG.getNode(Opc, dl, VT, Tmp);
}
return SDValue();
@@ -19575,9 +20205,21 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
assert(VT == MVT::i32 && "Unexpected VT!");
// Promote i32 to i64 and use a signed operation on 64-bit targets.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
if (Subtarget.is64Bit()) {
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
}
// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
@@ -19586,28 +20228,65 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
- // Promote i16 to i32 if we can use a SSE operation.
- if (VT == MVT::i16 && UseSSEReg) {
+ // Promote i16 to i32 if we can use a SSE operation or the type is f128.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i16. PR44019
+ if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
}
- // If this is a SINT_TO_FP using SSEReg we're done.
+  // If this is an FP_TO_SINT using SSEReg we're done.
if (UseSSEReg && IsSigned)
return Op;
+ // fp128 needs to use a libcall.
+ if (SrcVT == MVT::f128) {
+ RTLIB::Libcall LC;
+ if (IsSigned)
+ LC = RTLIB::getFPTOSINT(SrcVT, VT);
+ else
+ LC = RTLIB::getFPTOUINT(SrcVT, VT);
+
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
+ SDLoc(Op), Chain);
+
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
+ }
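For illustration, a source-level sketch of what these conversions become, assuming a GCC/Clang-style __float128 extension on x86; the libcall names are the conventional compiler-rt/libgcc ones (e.g. __fixtfsi for signed f128 to i32):

#include <cstdint>
#include <cstdio>

int main() {
  __float128 Q = 123.5;   // x86 extension type backing f128
  int32_t S = (int32_t)Q;   // lowers to a __fixtfsi libcall
  uint64_t U = (uint64_t)Q; // lowers to a __fixunstfdi libcall
  printf("%d %llu\n", S, (unsigned long long)U);
  return 0;
}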
+
// Fall back to X87.
- if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
+ if (IsStrict)
+ return DAG.getMergeValues({V, Chain}, dl);
return V;
+ }
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
- SDValue In = Op.getOperand(0);
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
if (VT == MVT::f128) {
@@ -19617,14 +20296,19 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
- return DAG.getNode(X86ISD::VFPEXT, DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
- In, DAG.getUNDEF(SVT)));
+ SDValue Res =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
+ {Op->getOperand(0), Res});
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
}
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+
MVT VT = Op.getSimpleValueType();
- SDValue In = Op.getOperand(0);
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
// It's legal except when f128 is involved
@@ -19636,17 +20320,17 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
+
+ SDLoc dl(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first;
-}
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
+ dl, Chain);
-// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking
-// the default expansion of STRICT_FP_ROUND.
-static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
- // FIXME: Need to form a libcall with an input chain for f128.
- assert(Op.getOperand(0).getValueType() != MVT::f128 &&
- "Don't know how to handle f128 yet!");
- return Op;
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
}
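A source-level sketch of the rounding libcall, under the same __float128 assumption; f128 to f64 conventionally lowers to __trunctfdf2 (and f128 to f32 to __trunctfsf2):

#include <cstdio>

int main() {
  __float128 Q = 1.0;
  Q /= 3.0;              // keep low bits that must be rounded away
  double D = (double)Q;  // FP_ROUND f128 -> f64: a __trunctfdf2 libcall
  printf("%.17g\n", D);
  return 0;
}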
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -19724,12 +20408,6 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::f128) {
- RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? RTLIB::ADD_F128
- : RTLIB::SUB_F128;
- return LowerF128Call(Op, DAG, LC);
- }
-
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
@@ -20013,6 +20691,19 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
+// Transform to an x86-specific ALU node with flags if there is a chance of
+// using an RMW op, or if only the flags are used. Otherwise, leave
+// the node alone and emit a 'cmp' or 'test' instruction.
+static bool isProfitableToUseFlagOp(SDValue Op) {
+ for (SDNode *U : Op->uses())
+ if (U->getOpcode() != ISD::CopyToReg &&
+ U->getOpcode() != ISD::SETCC &&
+ U->getOpcode() != ISD::STORE)
+ return false;
+
+ return true;
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -20076,15 +20767,8 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
- // Transform to an x86-specific ALU node with flags if there is a chance of
- // using an RMW op or only the flags are used. Otherwise, leave
- // the node alone and emit a 'test' instruction.
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI)
- if (UI->getOpcode() != ISD::CopyToReg &&
- UI->getOpcode() != ISD::SETCC &&
- UI->getOpcode() != ISD::STORE)
- goto default_case;
+ if (!isProfitableToUseFlagOp(Op))
+ break;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
@@ -20112,7 +20796,6 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
Op->getOperand(1)).getValue(1);
}
default:
- default_case:
break;
}
@@ -20131,15 +20814,26 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- const SDLoc &dl, SelectionDAG &DAG) const {
+static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
+ unsigned X86CC, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue Chain, bool IsSignaling) {
if (isNullConstant(Op1))
- return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
+ return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain);
EVT CmpVT = Op0.getValueType();
- if (CmpVT.isFloatingPoint())
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+ if (CmpVT.isFloatingPoint()) {
+ if (Chain) {
+ SDValue Res =
+ DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
+ dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
+ return std::make_pair(Res, Res.getValue(1));
+ }
+ return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1),
+ SDValue());
+ }
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
@@ -20154,7 +20848,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
- isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncate from something with enough sign bits.
@@ -20178,10 +20872,22 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
}
+
+ // Try to shrink i64 compares if the input has enough zero bits.
+  // FIXME: Do this for non-constant compares, or when the constant is on the LHS?
+ if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
+ Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
+ cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
+ DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
+ }
+
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return Sub.getValue(1);
+ return std::make_pair(Sub.getValue(1), SDValue());
}
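The shrink is sound because, when the high 32 bits of the left side are known zero, the constant fits in 32 bits, and the condition is unsigned (or equality), the 64-bit and 32-bit compares agree. A minimal self-check:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Op0 = 0x00000000DEADBEEFULL; // MaskedValueIsZero on the top 32 bits
  uint64_t Op1 = 1234;                  // constant with getActiveBits() <= 32
  bool Wide = Op0 < Op1;                       // 64-bit unsigned compare
  bool Narrow = (uint32_t)Op0 < (uint32_t)Op1; // truncated 32-bit compare
  assert(Wide == Narrow);
  return 0;
}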
/// Convert a comparison if required by the subtarget.
@@ -20189,16 +20895,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
- if (Subtarget.hasCMov() ||
- Cmp.getOpcode() != X86ISD::CMP ||
- !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
- !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ bool IsCmp = Cmp.getOpcode() == X86ISD::CMP;
+ bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP ||
+ Cmp.getOpcode() == X86ISD::STRICT_FCMPS;
+
+ if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) ||
+ !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint())
return Cmp;
// The instruction selector will select an FUCOM instruction instead of
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
- // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8))))
SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
@@ -20399,7 +21108,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immediate won't fit in a byte.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
@@ -20442,7 +21151,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
- SDValue &Op1) {
+ SDValue &Op1, bool &IsAlwaysSignaling) {
unsigned SSECC;
bool Swap = false;
@@ -20481,6 +21190,22 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
if (Swap)
std::swap(Op0, Op1);
+ switch (SetCCOpcode) {
+ default:
+ IsAlwaysSignaling = true;
+ break;
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ case ISD::SETNE:
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETO:
+ case ISD::SETUO:
+ IsAlwaysSignaling = false;
+ break;
+ }
+
return SSECC;
}
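The split matches IEEE-754: equality- and unordered-class predicates (EQ, NE, ORD, UNO and their variants) are quiet on quiet NaNs, while ordering predicates always signal. A sketch that probes this with <cfenv>; whether the compiler preserves the exception semantics here (FENV_ACCESS) is target- and flag-dependent:

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  double QNaN = std::nan("");
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile bool Eq = (QNaN == 1.0);                 // quiet predicate
  bool EqRaised = std::fetestexcept(FE_INVALID) != 0;
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile bool Lt = (QNaN < 1.0);                  // always-signaling predicate
  bool LtRaised = std::fetestexcept(FE_INVALID) != 0;
  printf("eq=%d raised=%d  lt=%d raised=%d\n", (int)Eq, (int)EqRaised,
         (int)Lt, (int)LtRaised);                   // expect raised: 0 then 1
  return 0;
}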
@@ -20625,12 +21350,14 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- SDValue CC = Op.getOperand(2);
- MVT VT = Op.getSimpleValueType();
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
+ SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
+ MVT VT = Op->getSimpleValueType(0);
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
- bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
+ bool isFP = Op1.getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
@@ -20639,57 +21366,119 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
- Opc = X86ISD::CMPM;
+ Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
- Opc = X86ISD::CMPP;
+ Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
- // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
- // emit two comparisons and a logic op to tie them together.
SDValue Cmp;
- unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
- if (SSECC >= 8 && !Subtarget.hasAVX()) {
- // LLVM predicate is SETUEQ or SETONE.
- unsigned CC0, CC1;
- unsigned CombineOpc;
- if (Cond == ISD::SETUEQ) {
- CC0 = 3; // UNORD
- CC1 = 0; // EQ
- CombineOpc = X86ISD::FOR;
+ bool IsAlwaysSignaling;
+ unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
+ if (!Subtarget.hasAVX()) {
+ // TODO: We could use following steps to handle a quiet compare with
+ // signaling encodings.
+ // 1. Get ordered masks from a quiet ISD::SETO
+ // 2. Use the masks to mask potential unordered elements in operand A, B
+ // 3. Get the compare results of masked A, B
+      // 4. Calculate the final result using the mask and the result from 3
+ // But currently, we just fall back to scalar operations.
+ if (IsStrict && IsAlwaysSignaling && !IsSignaling)
+ return SDValue();
+
+ // Insert an extra signaling instruction to raise exception.
+ if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
+ SDValue SignalCmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
+ // FIXME: It seems we need to update the flags of all new strict nodes.
+        // Otherwise, mayRaiseFPException in MI will return false because
+        // NoFPExcept is false by default. However, other patches do not
+        // appear to do this.
+ SignalCmp->setFlags(Op->getFlags());
+ Chain = SignalCmp.getValue(1);
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ if (SSECC >= 8) {
+ // LLVM predicate is SETUEQ or SETONE.
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
+ if (Cond == ISD::SETUEQ) {
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = X86ISD::FOR;
+ } else {
+ assert(Cond == ISD::SETONE);
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = X86ISD::FAND;
+ }
+
+ SDValue Cmp0, Cmp1;
+ if (IsStrict) {
+ Cmp0 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
+ Cmp1 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
+ Cmp1.getValue(1));
+ } else {
+ Cmp0 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
+ Cmp1 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
+ }
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
- assert(Cond == ISD::SETONE);
- CC0 = 7; // ORD
- CC1 = 4; // NEQ
- CombineOpc = X86ISD::FAND;
+ if (IsStrict) {
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
-
- SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(CC0, dl, MVT::i8));
- SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(CC1, dl, MVT::i8));
- Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
- Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(SSECC, dl, MVT::i8));
+ if (IsStrict) {
+ // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
+ SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
- if (Opc == X86ISD::CMPP)
- Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cmp, Chain}, dl);
return Cmp;
}
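Bit 4 of the 5-bit AVX compare immediate selects the opposite quiet/signaling variant of the base predicate, so the XOR above flips it only when the request disagrees with the default. A sketch using the standard VCMP encodings (EQ_OQ = 0x00, LT_OS = 0x01, EQ_OS = 0x10, LT_OQ = 0x11); the helper name is illustrative:

// Mirror of the immediate computation; values follow the VCMP encoding table.
unsigned encodeAVXCmpImm(unsigned SSECC, bool IsAlwaysSignaling,
                         bool IsSignaling) {
  return SSECC | ((IsAlwaysSignaling ^ IsSignaling) << 4);
}
// encodeAVXCmpImm(0x00, false, true) == 0x10 (quiet EQ asked to signal)
// encodeAVXCmpImm(0x01, true,  true) == 0x01 (LT already signals; no flip)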
+ assert(!IsStrict && "Strict SETCC only handles FP operands.");
+
MVT VTOp0 = Op0.getSimpleValueType();
(void)VTOp0;
assert(VTOp0 == Op1.getSimpleValueType() &&
@@ -20860,6 +21649,30 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
+ // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
+ // the odd elements over the even elements.
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
+ Op0 = DAG.getConstant(0, dl, MVT::v4i32);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
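The shortcut works because only the high 32-bit half of each 64-bit lane carries the sign. A little-endian scalar model of the v4i32 compare plus the <1,1,3,3> shuffle:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  int64_t X[2] = {-5, 7};
  int32_t L[4];
  std::memcpy(L, X, sizeof(L)); // little-endian: L[1], L[3] are the high halves
  int32_t GT[4];
  for (int I = 0; I != 4; ++I)
    GT[I] = (0 > L[I]) ? -1 : 0;                  // PCMPGTD against zero
  int32_t Shuf[4] = {GT[1], GT[1], GT[3], GT[3]}; // MaskHi = { 1, 1, 3, 3 }
  int64_t Res[2];
  std::memcpy(Res, Shuf, sizeof(Res));
  printf("%lld %lld\n", (long long)Res[0], (long long)Res[1]); // -1 0
  return 0;
}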
+
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
@@ -20999,8 +21812,9 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG,
- SDValue &X86CC) const {
+ SelectionDAG &DAG, SDValue &X86CC,
+ SDValue &Chain,
+ bool IsSignaling) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
@@ -21043,12 +21857,32 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
}
}
+  // Try to use the carry flag from the add in place of a separate CMP for:
+ // (seteq (add X, -1), -1). Similar for setne.
+ if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
+ Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isProfitableToUseFlagOp(Op0)) {
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+
+ SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
+ Op0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
+ X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
+ return SDValue(New.getNode(), 1);
+ }
+ }
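The identity behind this: adding -1 to X carries out exactly when X != 0, so seteq(add(X, -1), -1), i.e. X == 0, is the carry-clear condition (COND_AE), and setne is the carry-set one (COND_B). A quick scalar check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 42u, 0xFFFFFFFFu}) {
    uint64_t Sum = (uint64_t)X + 0xFFFFFFFFu;      // ADD X, -1 (32-bit)
    bool Carry = Sum > 0xFFFFFFFFu;                // CF after the add
    bool EqAllOnes = (uint32_t)Sum == 0xFFFFFFFFu; // seteq(add(X, -1), -1)
    assert(EqAllOnes == !Carry);                   // SETEQ <=> COND_AE
  }
  return 0;
}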
+
bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
return SDValue();
- SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
+ std::pair<SDValue, SDValue> Tmp =
+ EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling);
+ SDValue EFLAGS = Tmp.first;
+ if (Chain)
+ Chain = Tmp.second;
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
@@ -21056,35 +21890,48 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getSimpleValueType();
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ MVT VT = Op->getSimpleValueType(0);
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDLoc dl(Op);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1);
+ softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
+ Op.getOpcode() == ISD::STRICT_FSETCCS);
// If softenSetCCOperands returned a scalar, use it.
if (!Op1.getNode()) {
assert(Op0.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
+ if (IsStrict)
+ return DAG.getMergeValues({Op0, Chain}, dl);
return Op0;
}
}
SDValue X86CC;
- SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
+ SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain,
+ Op.getOpcode() == ISD::STRICT_FSETCCS);
if (!EFLAGS)
return SDValue();
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
@@ -21215,8 +22062,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
- unsigned SSECC = translateX86FSETCC(
- cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+ bool IsAlwaysSignaling;
+ unsigned SSECC =
+ translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
+ CondOp0, CondOp1, IsAlwaysSignaling);
if (Subtarget.hasAVX512()) {
SDValue Cmp =
@@ -21454,8 +22303,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (AddTest) {
CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
- Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
- X86::COND_NE, DL, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
@@ -21711,7 +22559,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
- assert(VT.getVectorNumElements() == VT.getVectorNumElements() &&
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
@@ -21765,12 +22613,14 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
"Expecting 256/512-bit op");
// Splitting volatile memory ops is not allowed unless the operation was not
- // legal to begin with. We are assuming the input op is legal (this transform
- // is only used for targets with AVX).
+ // legal to begin with. Assume the input store is legal (this transform is
+ // only used for targets with AVX). Note: It is possible that we have an
+ // illegal type like v2i128, and so we could allow splitting a volatile store
+ // in that case if that is important.
if (!Store->isSimple())
return SDValue();
- MVT StoreVT = StoredVal.getSimpleValueType();
+ EVT StoreVT = StoredVal.getValueType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
@@ -22174,8 +23024,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
- X86Cond, dl, DAG);
+ Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -22201,7 +23050,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = Op.getConstantOperandVal(2);
+ MaybeAlign Alignment(Op.getConstantOperandVal(2));
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
@@ -22221,11 +23070,12 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlign = TFI.getStackAlignment();
+ const Align StackAlign(TFI.getStackAlignment());
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
- if (Align > StackAlign)
- Result = DAG.getNode(ISD::AND, dl, VT, Result,
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ if (Alignment && Alignment > StackAlign)
+ Result =
+ DAG.getNode(ISD::AND, dl, VT, Result,
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -22256,9 +23106,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
- if (Align) {
+ if (Alignment) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
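Both branches round the stack pointer down with the usual power-of-two mask; ~(Align - 1) clears the low bits. A sketch with hypothetical values:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t SP = 0x7ffffffd123;    // hypothetical post-SUB stack pointer
  uint64_t AlignVal = 64;         // requested over-alignment (power of two)
  uint64_t Aligned = SP & ~(AlignVal - 1ULL);     // the AND the lowering emits
  printf("%#llx\n", (unsigned long long)Aligned); // 0x7ffffffd100
  return 0;
}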
@@ -22777,6 +23627,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
@@ -22794,7 +23645,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1));
}
case INTR_TYPE_1OP_SAE: {
SDValue Sae = Op.getOperand(2);
@@ -22866,7 +23718,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
- Src1, Src2, Src3);
+ {Src1, Src2, Src3});
}
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
@@ -22890,8 +23742,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
- Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(
+ DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
@@ -22907,8 +23760,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
return SDValue();
- return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
- Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
@@ -23114,8 +23967,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
      // Default rounding mode
- return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC);
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ {Op.getOperand(1), Op.getOperand(2), CC});
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -23315,8 +24168,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
- Mask);
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
+ {Src, PassThru, Mask});
}
case CVTPS2PH_MASK: {
SDValue Src = Op.getOperand(1);
@@ -23622,9 +24475,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue ShAmt = Op.getOperand(2);
// If the argument is a constant, convert it to a target constant.
if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
- ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+      // Clamp out-of-bounds shift amounts, since they will otherwise be masked
+      // to 8 bits, which may make them no longer out of bounds.
+ unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1), ShAmt);
+ Op.getOperand(0), Op.getOperand(1),
+ DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
}
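Why the clamp matters: masking a shift amount to 8 bits can wrap an out-of-bounds value back in bounds (256 & 0xFF == 0), changing the semantics, whereas getLimitedValue(255) saturates. In miniature:

#include <cstdio>

int main() {
  unsigned ShAmt = 256;                         // out of bounds for any element
  unsigned Masked = ShAmt & 0xFF;               // 0: wrongly back in bounds
  unsigned Clamped = ShAmt > 255 ? 255 : ShAmt; // getLimitedValue(255)
  printf("masked=%u clamped=%u\n", Masked, Clamped); // masked=0 clamped=255
  return 0;
}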
unsigned NewIntrinsic;
@@ -23977,7 +24833,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
- return SDValue();
+ return Op;
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
@@ -24152,9 +25008,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ SDValue Offset = DAG.getUNDEF(VMask.getValueType());
- return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
- MemIntr->getMemOperand(), true /* truncating */);
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
+ MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
+ true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
@@ -24249,7 +25107,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
@@ -24538,12 +25396,13 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlignment = TFI.getStackAlignment();
+ const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
- int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
@@ -27464,12 +28323,11 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
- SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
- N->getBasePtr(), Mask,
- getZeroVector(VT, Subtarget, DAG, dl),
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
+ N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
PassThru);
@@ -27503,11 +28361,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
- N->getBasePtr(), Mask, PassThru,
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
@@ -27553,7 +28410,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
- Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->getOffset(), Mask, N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(),
N->isTruncatingStore(), N->isCompressingStore());
}
@@ -27607,29 +28465,31 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
-SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
- SelectionDAG &DAG) const {
- // TODO: Eventually, the lowering of these nodes should be informed by or
- // deferred to the GC strategy for the function in which they appear. For
- // now, however, they must be lowered to something. Since they are logically
- // no-ops in the case of a null GC strategy (or a GC strategy which does not
- // require special handling for these nodes), lower them as literal NOOPs for
- // the time being.
- SmallVector<SDValue, 2> Ops;
+static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
+ MVT DstVT = Op.getSimpleValueType();
- Ops.push_back(Op.getOperand(0));
- if (Op->getGluedNode())
- Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+ AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
+ unsigned SrcAS = N->getSrcAddressSpace();
- SDLoc OpDL(Op);
- SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+ assert(SrcAS != N->getDestAddressSpace() &&
+ "addrspacecast must be between different address spaces");
- return NOOP;
+ if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i32) {
+ Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
+ } else {
+ report_fatal_error("Bad address space in addrspacecast");
+ }
+ return Op;
}
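The extension choice is the point of the ptr32 address spaces: an unsigned 32-bit pointer (PTR32_UPTR) zero-extends to 64 bits, while the default flavor sign-extends. A scalar model with a made-up pointer value:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t P = 0x80001000u;             // 32-bit pointer with the top bit set
  uint64_t UPtr = (uint64_t)P;          // PTR32_UPTR: ZERO_EXTEND
  uint64_t SPtr = (uint64_t)(int32_t)P; // sign-extended flavor: SIGN_EXTEND
  printf("%#llx %#llx\n", (unsigned long long)UPtr,
         (unsigned long long)SPtr);     // 0x80001000 vs 0xffffffff80001000
  return 0;
}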
-SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
+ SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
@@ -27651,9 +28511,21 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
+
+ SDLoc dl(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
+ CallOptions, dl, Chain);
+
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
}
/// Provide custom lowering hooks for some operations.
@@ -27673,7 +28545,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
- case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
@@ -27690,7 +28562,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
+ case ISD::STRICT_SINT_TO_FP:
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::STRICT_UINT_TO_FP:
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
@@ -27700,21 +28574,24 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
- case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
- case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG);
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
- case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
- case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
- case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
@@ -27778,8 +28655,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
- return LowerGC_TRANSITION_START(Op, DAG);
- case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
+ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
+ case ISD::ADDRSPACECAST:
+ return LowerADDRSPACECAST(Op, DAG);
}
}
@@ -27865,8 +28743,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
- // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
- // X86ISD::AVG/VPMADDWD by widening.
+ // Legalize types for X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
@@ -28114,10 +28991,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: {
- bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
EVT VT = N->getValueType(0);
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
@@ -28128,13 +29009,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
- Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
- : ISD::AssertSext,
+ Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
@@ -28149,6 +29036,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
return;
}
@@ -28160,16 +29049,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
+      // If we have VLX we can emit a target specific FP_TO_UINT node.
if (!IsSigned && !Subtarget.hasVLX()) {
- // If we have VLX we can emit a target specific FP_TO_UINT node,
- // otherwise we can defer to the generic legalizer which will widen
+ // Otherwise we can defer to the generic legalizer which will widen
// the input as well. This will be further widened during op
// legalization to v8i32<-v8f64.
- return;
+ // For strict nodes we'll need to widen ourselves.
+ // FIXME: Fix the type legalizer to safely widen strict nodes?
+ if (!IsStrict)
+ return;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f64));
+ Opc = N->getOpcode();
+ }
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
}
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
- SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Custom widen strict v2f32->v2i32 by padding with zeros.
+ // FIXME: Should generic type legalizer do this?
+ if (Src.getValueType() == MVT::v2f32 && IsStrict) {
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
return;
}
@@ -28183,64 +29105,168 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
- unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
- // Using a 256-bit input here to guarantee 128-bit input for f32 case.
- // TODO: Use 128-bit vectors for f64 case?
- // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
+ unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
+ // If we use a 128-bit result we might need to use a target specific node.
+ unsigned SrcElts =
+ std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
- MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
+ MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
+ unsigned Opc = N->getOpcode();
+ if (NumElts != SrcElts) {
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ }
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
ZeroIdx);
- Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
+ SDValue Chain;
+ if (IsStrict) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
return;
}
- if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(Chain);
+ }
return;
}
- case ISD::SINT_TO_FP: {
- assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
- SDValue Src = N->getOperand(0);
- if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
- return;
- Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
- return;
- }
- case ISD::UINT_TO_FP: {
- assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::STRICT_SINT_TO_FP;
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
- Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
+ : X86ISD::STRICT_CVTUI2P;
+ SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
+ Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
+ }
return;
}
+ if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
+ Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
+ SDValue Zero = DAG.getConstant(0, dl, SrcVT);
+ SDValue One = DAG.getConstant(1, dl, SrcVT);
+ SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
+ DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
+ DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
+ SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
+ for (int i = 0; i != 2; ++i) {
+ SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ SignSrc, DAG.getIntPtrConstant(i, dl));
+ if (IsStrict)
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
+ {N->getOperand(0), Src});
+ else
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
+      }
+ SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SignCvts[0].getValue(1), SignCvts[1].getValue(1));
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
+ }
+ IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
+ IsNeg =
+ DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
+ SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
+ Results.push_back(Cvt);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
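This is the classic unsigned-to-float halving trick (as in compiler-rt's __floatundisf): when the sign bit is set, shift right by one while ORing the dropped bit back in so rounding stays correct, convert signed, then double with an FADD. A scalar check:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t X = 0xFFFFFFFFFFFFFFFFULL;   // sign bit set: signed convert is wrong
  uint64_t Halved = (X >> 1) | (X & 1); // keep the low bit sticky for rounding
  float Slow = (float)(int64_t)Halved;
  Slow += Slow;                         // the FADD doubles it back
  printf("%a %a\n", Slow, (float)X);    // identical results
  return 0;
}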
+
if (SrcVT != MVT::v2i32)
return;
+
+ if (IsSigned || Subtarget.hasAVX512()) {
+ if (!IsStrict)
+ return;
+
+ // Custom widen strict v2i32->v2f32 to avoid scalarization.
+ // FIXME: Should generic type legalizer do this?
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getConstant(0, dl, MVT::v2i32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
- // TODO: Are there any fast-math-flags to propagate here?
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
- Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ if (IsStrict) {
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
+ {N->getOperand(0), Or, VBias});
+ SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
+ {MVT::v4f32, MVT::Other},
+ {Sub.getValue(1), Sub});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ }
return;
}
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
- if (!isTypeLegal(N->getOperand(0).getValueType()))
- return;
- SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ if (!isTypeLegal(Src.getValueType()))
+ return;
+ SDValue V;
+ if (IsStrict)
+ V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ else
+ V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
return;
}
case ISD::FP_EXTEND: {
@@ -28543,6 +29569,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res.getValue(1));
return;
}
+ case ISD::ADDRSPACECAST: {
+ SDValue Src = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+ unsigned SrcAS = CastN->getSrcAddressSpace();
+
+ assert(SrcAS != CastN->getDestAddressSpace() &&
+ "addrspacecast must be between different address spaces");
+
+ SDValue Res;
+ if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
+ Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+ else if (DstVT == MVT::i64)
+ Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+ else if (DstVT == MVT::i32)
+ Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
+ else
+ report_fatal_error("Unrecognized addrspacecast type legalization");
+
+ Results.push_back(Res);
+ return;
+ }
}
}
@@ -28566,9 +29614,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP";
+ case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
+ case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM";
case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
@@ -28653,10 +29704,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT";
case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
+ case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
@@ -28676,6 +29729,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VROTRI: return "X86ISD::VROTRI";
case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
@@ -28776,6 +29830,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE";
case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
@@ -28837,6 +29892,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
+ case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI";
+ case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
@@ -28847,6 +29904,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
+ case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P";
+ case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P";
case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
@@ -29099,8 +30158,8 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return true;
}
-bool
-X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
@@ -31518,28 +32577,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
- Known.setAllZero();
- break;
- }
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= VT.getScalarSizeInBits()) {
+ Known.setAllZero();
+ break;
+ }
- Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- unsigned ShAmt = ShiftImm->getZExtValue();
- if (Opc == X86ISD::VSHLI) {
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
- // Low bits are known zero.
- Known.Zero.setLowBits(ShAmt);
- } else if (Opc == X86ISD::VSRLI) {
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
- // High bits are known zero.
- Known.Zero.setHighBits(ShAmt);
- } else {
- Known.Zero.ashrInPlace(ShAmt);
- Known.One.ashrInPlace(ShAmt);
- }
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Opc == X86ISD::VSHLI) {
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+ // Low bits are known zero.
+ Known.Zero.setLowBits(ShAmt);
+ } else if (Opc == X86ISD::VSRLI) {
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+ // High bits are known zero.
+ Known.Zero.setHighBits(ShAmt);
+ } else {
+ Known.Zero.ashrInPlace(ShAmt);
+ Known.One.ashrInPlace(ShAmt);
}
break;
}
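Annotation (not part of the patch): the refactored known-bits case above leans on VSHLI/VSRLI/VSRAI always carrying a constant shift immediate, so Op.getConstantOperandVal(1) is safe. As a sanity check of how the Known.Zero/Known.One pair moves through the three shift kinds, here is a minimal standalone C++ model of one 32-bit lane, using plain integers instead of llvm::KnownBits; all names are illustrative.

    #include <cassert>
    #include <cstdint>

    // Minimal model of the Known.Zero/Known.One updates above for one
    // 32-bit lane. Zero has a 1 for every bit known to be 0; One has a 1
    // for every bit known to be 1.
    struct Known32 { uint32_t Zero = 0, One = 0; };

    static Known32 shiftKnown(Known32 K, unsigned ShAmt, char Kind) {
      if (Kind == 'L') {            // VSHLI: shift left, low bits become 0.
        K.Zero <<= ShAmt;
        K.One <<= ShAmt;
        K.Zero |= (1u << ShAmt) - 1;
      } else if (Kind == 'R') {     // VSRLI: logical right, high bits become 0.
        K.Zero >>= ShAmt;
        K.One >>= ShAmt;
        K.Zero |= ~(~0u >> ShAmt);
      } else {                      // VSRAI: arithmetic right replicates bit 31
                                    // into both masks (C++20 semantics).
        K.Zero = (uint32_t)((int32_t)K.Zero >> ShAmt);
        K.One = (uint32_t)((int32_t)K.One >> ShAmt);
      }
      return K;
    }

    int main() {
      Known32 K;                    // value known to be 0x000000FF exactly
      K.One = 0x000000FFu;
      K.Zero = ~K.One;
      Known32 L = shiftKnown(K, 4, 'L');
      assert(L.One == 0x00000FF0u && L.Zero == ~L.One); // 0xFF << 4, low 4 zero
      Known32 R = shiftKnown(K, 4, 'R');
      assert(R.One == 0x0000000Fu && R.Zero == ~R.One); // high 4 forced zero
      return 0;
    }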
@@ -32103,8 +33160,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
- if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
- Subtarget)) {
+ if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
+ Subtarget)) {
DstVT = MaskVT;
return true;
}
@@ -32116,8 +33173,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
- if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
- DAG, Subtarget)) {
+ if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
+ Subtarget)) {
SrcVT = DstVT = MaskVT;
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
@@ -32155,8 +33212,8 @@ static bool matchBinaryPermuteShuffle(
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
- if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
- ForceV2Zero, BlendMask)) {
+ if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
+ ForceV2Zero, BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
@@ -32410,10 +33467,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
- APInt Zeroable(NumMaskElts, 0);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (isUndefOrZero(Mask[i]))
- Zeroable.setBit(i);
+ APInt KnownUndef, KnownZero;
+ resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
+ APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
@@ -32834,7 +33890,8 @@ static SDValue combineX86ShuffleChainWithExtract(
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
- WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
+ WideSizeInBits = std::max(WideSizeInBits,
+ (unsigned)Src.getValueSizeInBits());
assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
"Unexpected subvector extraction");
Offset /= BaseVT.getVectorNumElements();
@@ -33026,6 +34083,10 @@ static SDValue combineX86ShufflesRecursively(
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ assert(RootMask.size() > 0 &&
+ (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
+ "Illegal shuffle root mask");
+
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
@@ -33056,106 +34117,137 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
- resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
-
- // Add the inputs to the Ops list, avoiding duplicates.
- SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
-
- auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
- // Attempt to find an existing match.
- SDValue InputBC = peekThroughBitcasts(Input);
- for (int i = 0, e = Ops.size(); i < e; ++i)
- if (InputBC == peekThroughBitcasts(Ops[i]))
- return i;
- // Match failed - should we replace an existing Op?
- if (InsertionPoint >= 0) {
- Ops[InsertionPoint] = Input;
- return InsertionPoint;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 16> Ops;
+
+ // We don't need to merge masks if the root is empty.
+ bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
+ if (EmptyRoot) {
+ // Only resolve zeros if it will remove an input, otherwise we might end
+ // up in an infinite loop.
+ bool ResolveKnownZeros = true;
+ if (!OpZero.isNullValue()) {
+ APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+ for (int i = 0, e = OpMask.size(); i != e; ++i) {
+ int M = OpMask[i];
+ if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
+ continue;
+ UsedInputs.setBit(M / OpMask.size());
+ if (UsedInputs.isAllOnesValue()) {
+ ResolveKnownZeros = false;
+ break;
+ }
+ }
}
- // Add to the end of the Ops list.
- Ops.push_back(Input);
- return Ops.size() - 1;
- };
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
+ ResolveKnownZeros);
- SmallVector<int, 2> OpInputIdx;
- for (SDValue OpInput : OpInputs)
- OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
-
- assert(((RootMask.size() > OpMask.size() &&
- RootMask.size() % OpMask.size() == 0) ||
- (OpMask.size() > RootMask.size() &&
- OpMask.size() % RootMask.size() == 0) ||
- OpMask.size() == RootMask.size()) &&
- "The smaller number of elements must divide the larger.");
-
- // This function can be performance-critical, so we rely on the power-of-2
- // knowledge that we have about the mask sizes to replace div/rem ops with
- // bit-masks and shifts.
- assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
- unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
- unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
-
- unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
- unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
- unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
- assert((RootRatio == 1 || OpRatio == 1) &&
- "Must not have a ratio for both incoming and op masks!");
-
- assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
- unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
- unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
-
- SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
-
- // Merge this shuffle operation's mask into our accumulated mask. Note that
- // this shuffle's mask will be the first applied to the input, followed by the
- // root mask to get us all the way to the root value arrangement. The reason
- // for this order is that we are recursing up the operation chain.
- for (unsigned i = 0; i < MaskWidth; ++i) {
- unsigned RootIdx = i >> RootRatioLog2;
- if (RootMask[RootIdx] < 0) {
- // This is a zero or undef lane, we're done.
- Mask[i] = RootMask[RootIdx];
- continue;
- }
+ Mask = OpMask;
+ Ops.append(OpInputs.begin(), OpInputs.end());
+ } else {
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
+ // Add the inputs to the Ops list, avoiding duplicates.
+ Ops.append(SrcOps.begin(), SrcOps.end());
+
+ auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
+ // Attempt to find an existing match.
+ SDValue InputBC = peekThroughBitcasts(Input);
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (InputBC == peekThroughBitcasts(Ops[i]))
+ return i;
+ // Match failed - should we replace an existing Op?
+ if (InsertionPoint >= 0) {
+ Ops[InsertionPoint] = Input;
+ return InsertionPoint;
+ }
+ // Add to the end of the Ops list.
+ Ops.push_back(Input);
+ return Ops.size() - 1;
+ };
- unsigned RootMaskedIdx =
- RootRatio == 1
- ? RootMask[RootIdx]
- : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
+ SmallVector<int, 2> OpInputIdx;
+ for (SDValue OpInput : OpInputs)
+ OpInputIdx.push_back(
+ AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
+
+ assert(((RootMask.size() > OpMask.size() &&
+ RootMask.size() % OpMask.size() == 0) ||
+ (OpMask.size() > RootMask.size() &&
+ OpMask.size() % RootMask.size() == 0) ||
+ OpMask.size() == RootMask.size()) &&
+ "The smaller number of elements must divide the larger.");
+
+ // This function can be performance-critical, so we rely on the power-of-2
+ // knowledge that we have about the mask sizes to replace div/rem ops with
+ // bit-masks and shifts.
+ assert(isPowerOf2_32(RootMask.size()) &&
+ "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
+
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
+ unsigned RootRatio =
+ std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
+ assert((RootRatio == 1 || OpRatio == 1) &&
+ "Must not have a ratio for both incoming and op masks!");
+
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
+
+ Mask.resize(MaskWidth, SM_SentinelUndef);
+
+ // Merge this shuffle operation's mask into our accumulated mask. Note that
+ // this shuffle's mask will be the first applied to the input, followed by
+ // the root mask to get us all the way to the root value arrangement. The
+ // reason for this order is that we are recursing up the operation chain.
+ for (unsigned i = 0; i < MaskWidth; ++i) {
+ unsigned RootIdx = i >> RootRatioLog2;
+ if (RootMask[RootIdx] < 0) {
+ // This is a zero or undef lane, we're done.
+ Mask[i] = RootMask[RootIdx];
+ continue;
+ }
- // Just insert the scaled root mask value if it references an input other
- // than the SrcOp we're currently inserting.
- if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
- (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
- Mask[i] = RootMaskedIdx;
- continue;
- }
+ unsigned RootMaskedIdx =
+ RootRatio == 1
+ ? RootMask[RootIdx]
+ : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
- RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
- unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
- if (OpMask[OpIdx] < 0) {
- // The incoming lanes are zero or undef, it doesn't matter which ones we
- // are using.
- Mask[i] = OpMask[OpIdx];
- continue;
- }
+ // Just insert the scaled root mask value if it references an input other
+ // than the SrcOp we're currently inserting.
+ if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+ (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+ Mask[i] = RootMaskedIdx;
+ continue;
+ }
- // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
- unsigned OpMaskedIdx =
- OpRatio == 1
- ? OpMask[OpIdx]
- : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
+ RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
+ if (OpMask[OpIdx] < 0) {
+ // The incoming lanes are zero or undef, it doesn't matter which ones we
+ // are using.
+ Mask[i] = OpMask[OpIdx];
+ continue;
+ }
- OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
- int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
- assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
- OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
+ // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+ unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
+ : (OpMask[OpIdx] << OpRatioLog2) +
+ (RootMaskedIdx & (OpRatio - 1));
- Mask[i] = OpMaskedIdx;
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
+ int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+ assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+ OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
+
+ Mask[i] = OpMaskedIdx;
+ }
}
// Remove unused/repeated shuffle source ops.
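Annotation (not part of the patch): the div/rem-free index math above is easier to follow with concrete sizes. Below is a minimal standalone sketch, assuming a 4-element root mask layered over an 8-element op mask (so RootRatio = 2, OpRatio = 1, MaskWidth = 8); it models only the index arithmetic of the merge loop, not the Ops bookkeeping or the zero/undef sentinels.

    #include <cassert>
    #include <vector>

    int main() {
      // RootMask has 4 elements, OpMask has 8: each root lane covers 2 op lanes.
      std::vector<int> RootMask = {0, 1, 2, 3};             // identity, 4 wide lanes
      std::vector<int> OpMask   = {7, 6, 5, 4, 3, 2, 1, 0}; // reverse of 8 lanes
      unsigned MaskWidth = 8;          // max(RootMask.size(), OpMask.size())
      unsigned RootRatio = 2;          // OpMask.size() / RootMask.size()
      unsigned RootRatioLog2 = 1, OpRatioLog2 = 0;

      std::vector<int> Mask(MaskWidth);
      for (unsigned i = 0; i < MaskWidth; ++i) {
        unsigned RootIdx = i >> RootRatioLog2;               // i / RootRatio
        unsigned RootMaskedIdx = (RootMask[RootIdx] << RootRatioLog2) +
                                 (i & (RootRatio - 1));      // scale + sublane
        unsigned OpIdx = (RootMaskedIdx & (MaskWidth - 1)) >> OpRatioLog2;
        Mask[i] = OpMask[OpIdx];                             // OpRatio == 1 here
      }
      // An identity root over a reversed op yields the reversed mask unchanged.
      for (unsigned i = 0; i < MaskWidth; ++i)
        assert(Mask[i] == (int)(7 - i));
      return 0;
    }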
@@ -33189,13 +34281,18 @@ static SDValue combineX86ShufflesRecursively(
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
+ // For empty roots, we need to resolve zeroable elements before combining
+ // them with other shuffles.
+ SmallVector<int, 64> ResolvedMask = Mask;
+ if (EmptyRoot)
+ resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
- AllowVar, DAG, Subtarget))
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+ HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
}
@@ -34207,6 +35304,15 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
In.getOperand(0).getValueType() == MVT::v2i64)
return N->getOperand(0); // return the bitcast
break;
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::STRICT_CVTSI2P:
+ case X86ISD::STRICT_CVTUI2P:
+ case X86ISD::STRICT_VFPROUND:
+ if (In.getOperand(1).getValueType() == MVT::v2f64 ||
+ In.getOperand(1).getValueType() == MVT::v2i64)
+ return N->getOperand(0);
+ break;
}
}
@@ -34698,6 +35804,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
}
+ // If we don't demand all elements, then attempt to combine to a simpler
+ // shuffle.
+ // TODO: Handle other depths, but first we need to handle the fact that
+ // it might combine to the same shuffle.
+ if (!DemandedElts.isAllOnesValue() && Depth == 0) {
+ SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i)
+ if (DemandedElts[i])
+ DemandedMask[i] = i;
+
+ SDValue NewShuffle = combineX86ShufflesRecursively(
+ {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, TLO.DAG, Subtarget);
+ if (NewShuffle)
+ return TLO.CombineTo(Op, NewShuffle);
+ }
+
return false;
}
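Annotation (not part of the patch): the new demanded-elements fall-through above builds an identity shuffle mask with undef in every lane the caller does not need, then hands it to the recursive combiner. A minimal standalone model of that mask construction, with SM_SentinelUndef standing for -1 as in the sentinel convention used throughout:

    #include <cassert>
    #include <vector>

    int main() {
      const int SM_SentinelUndef = -1;
      unsigned NumElts = 4;
      bool DemandedElts[4] = {true, false, true, false};

      std::vector<int> DemandedMask(NumElts, SM_SentinelUndef);
      for (unsigned i = 0; i != NumElts; ++i)
        if (DemandedElts[i])
          DemandedMask[i] = i; // identity on demanded lanes, undef elsewhere

      assert((DemandedMask == std::vector<int>{0, -1, 2, -1}));
      return 0;
    }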
@@ -34739,117 +35862,110 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
-
- // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
- // single shift. We can do this if the bottom bits (which are shifted
- // out) are never demanded.
- if (Op0.getOpcode() == X86ISD::VSRLI &&
- OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
- if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
- if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
- int Diff = ShAmt - Shift2Imm->getZExtValue();
- if (Diff == 0)
- return TLO.CombineTo(Op, Op0.getOperand(0));
-
- unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
- SDValue NewShift = TLO.DAG.getNode(
- NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
- TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
- return TLO.CombineTo(Op, NewShift);
- }
- }
+ APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
+ if (Op0.getOpcode() == X86ISD::VSRLI &&
+ OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
+ unsigned Shift2Amt = Op0.getConstantOperandVal(1);
+ if (Shift2Amt < BitWidth) {
+ int Diff = ShAmt - Shift2Amt;
+ if (Diff == 0)
+ return TLO.CombineTo(Op, Op0.getOperand(0));
+
+ unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
+ SDValue NewShift = TLO.DAG.getNode(
+ NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
+ TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
+ return TLO.CombineTo(Op, NewShift);
}
+ }
- if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
- TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
- // Low bits known zero.
- Known.Zero.setLowBits(ShAmt);
- }
+ // Low bits known zero.
+ Known.Zero.setLowBits(ShAmt);
break;
}
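Annotation (not part of the patch): the single-shift simplification above computes Diff = ShAmt - Shift2Amt and picks VSHLI or VSRLI by its sign. A standalone check of the three cases on a plain 32-bit lane; the stated precondition (no demanded bit below ShAmt) is modeled by masking both sides.

    #include <cassert>
    #include <cstdint>

    // Models ((X >>u C1) << C2) when no bit below C2 is demanded.
    static uint32_t foldShrShl(uint32_t X, unsigned C1, unsigned C2) {
      int Diff = (int)C2 - (int)C1;
      if (Diff == 0) return X;           // shifts cancel
      if (Diff > 0) return X << Diff;    // net left shift
      return X >> -Diff;                 // net logical right shift
    }

    int main() {
      uint32_t X = 0xDEADBEEFu;
      const uint32_t DemandHi = ~0u << 8;  // only bits >= 8 demanded
      // Diff == 0: (X >> 8) << 8 == X on the demanded bits.
      assert((foldShrShl(X, 8, 8) & DemandHi) == (((X >> 8) << 8) & DemandHi));
      // Diff > 0: (X >> 3) << 8 == X << 5 on the demanded bits.
      assert((foldShrShl(X, 3, 8) & DemandHi) == (((X >> 3) << 8) & DemandHi));
      // Diff < 0: (X >> 12) << 8 == X >> 4 on the demanded bits.
      assert((foldShrShl(X, 12, 8) & DemandHi) == (((X >> 12) << 8) & DemandHi));
      return 0;
    }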
case X86ISD::VSRLI: {
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits << ShAmt;
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
- if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
- OriginalDemandedElts, Known, TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
+ OriginalDemandedElts, Known, TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
- // High bits known zero.
- Known.Zero.setHighBits(ShAmt);
- }
+ // High bits known zero.
+ Known.Zero.setHighBits(ShAmt);
break;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits << ShAmt;
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
- // If we just want the sign bit then we don't need to shift it.
- if (OriginalDemandedBits.isSignMask())
- return TLO.CombineTo(Op, Op0);
+ // If we just want the sign bit then we don't need to shift it.
+ if (OriginalDemandedBits.isSignMask())
+ return TLO.CombineTo(Op, Op0);
- // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
- if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
- SDValue Op00 = Op0.getOperand(0);
- unsigned NumSignBits =
- TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
- if (ShAmt < NumSignBits)
- return TLO.CombineTo(Op, Op00);
- }
+ // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
+ if (Op0.getOpcode() == X86ISD::VSHLI &&
+ Op.getOperand(1) == Op0.getOperand(1)) {
+ SDValue Op00 = Op0.getOperand(0);
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
+ if (ShAmt < NumSignBits)
+ return TLO.CombineTo(Op, Op00);
+ }
- // If any of the demanded bits are produced by the sign extension, we also
- // demand the input sign bit.
- if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
- DemandedMask.setSignBit();
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
+ DemandedMask.setSignBit();
- if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
- TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
- // If the input sign bit is known to be zero, or if none of the top bits
- // are demanded, turn this into an unsigned shift right.
- if (Known.Zero[BitWidth - ShAmt - 1] ||
- OriginalDemandedBits.countLeadingZeros() >= ShAmt)
- return TLO.CombineTo(
- Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (Known.Zero[BitWidth - ShAmt - 1] ||
+ OriginalDemandedBits.countLeadingZeros() >= ShAmt)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
- // High bits are known one.
- if (Known.One[BitWidth - ShAmt - 1])
- Known.One.setHighBits(ShAmt);
- }
+ // High bits are known one.
+ if (Known.One[BitWidth - ShAmt - 1])
+ Known.One.setHighBits(ShAmt);
break;
}
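Annotation (not part of the patch): the closing VSRAI-to-VSRLI rewrite is sound because arithmetic and logical right shifts differ only in the bits copied down from the sign position. A quick standalone check of both justifications used above (sign bit known zero, or no demanded bit high enough to see the copies); note that >> on a negative signed value is implementation-defined before C++20 but arithmetic on all mainstream compilers.

    #include <cassert>
    #include <cstdint>

    int main() {
      // If the sign bit of X is known zero, ashr(X, s) == lshr(X, s).
      uint32_t X = 0x7F123456u;                  // bit 31 clear
      unsigned s = 7;
      uint32_t Ashr = (uint32_t)((int32_t)X >> s); // arithmetic shift
      uint32_t Lshr = X >> s;                      // logical shift
      assert(Ashr == Lshr);

      // If only bits below (BitWidth - s) are demanded, the copies of the
      // sign bit are never observed either, so the rewrite is still safe.
      uint32_t Neg = 0x80001234u;                // bit 31 set
      uint32_t DemandLo = (1u << (32 - s)) - 1;
      assert((((uint32_t)((int32_t)Neg >> s)) & DemandLo) ==
             ((Neg >> s) & DemandLo));
      return 0;
    }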
case X86ISD::PEXTRB:
@@ -35005,6 +36121,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
return Vec;
break;
}
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // If we only need the sign bit, we can use R directly.
+ if (DemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return Op.getOperand(1);
+ break;
}
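Annotation (not part of the patch): the new PCMPGT case uses the identity from its comment: pcmpgt(0, R) yields all-ones exactly when R is negative, so its sign bit always equals R's sign bit, and R itself suffices when only the sign bit is demanded. A one-lane standalone check, with pcmpgt modeled as a signed compare producing 0 or -1:

    #include <cassert>
    #include <cstdint>

    // One lane of PCMPGT: all-ones (i.e. -1) if a >s b, else 0.
    static int32_t pcmpgtLane(int32_t a, int32_t b) { return a > b ? -1 : 0; }

    int main() {
      for (int32_t R : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
        uint32_t CmpSign = (uint32_t)pcmpgtLane(0, R) & 0x80000000u;
        uint32_t RSign = (uint32_t)R & 0x80000000u;
        assert(CmpSign == RSign); // sign bit of pcmpgt(0, R) == sign bit of R
      }
      return 0;
    }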
APInt ShuffleUndef, ShuffleZero;
@@ -35053,123 +36176,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}
-/// Check if a vector extract from a target-specific shuffle of a load can be
-/// folded into a single element load.
-/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been custom lowered so we need to handle those here.
-static SDValue
-XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- SDValue InVec = N->getOperand(0);
- SDValue EltNo = N->getOperand(1);
- EVT EltVT = N->getValueType(0);
-
- if (!isa<ConstantSDNode>(EltNo))
- return SDValue();
-
- EVT OriginalVT = InVec.getValueType();
- unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
-
- // Peek through bitcasts, don't duplicate a load with other uses.
- InVec = peekThroughOneUseBitcasts(InVec);
-
- EVT CurrentVT = InVec.getValueType();
- if (!CurrentVT.isVector())
- return SDValue();
-
- unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
- if ((NumOriginalElts % NumCurrentElts) != 0)
- return SDValue();
-
- if (!isTargetShuffle(InVec.getOpcode()))
- return SDValue();
-
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
-
- SmallVector<int, 16> ShuffleMask;
- SmallVector<SDValue, 2> ShuffleOps;
- bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
- ShuffleOps, ShuffleMask, UnaryShuffle))
- return SDValue();
-
- unsigned Scale = NumOriginalElts / NumCurrentElts;
- if (Scale > 1) {
- SmallVector<int, 16> ScaledMask;
- scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
- ShuffleMask = std::move(ScaledMask);
- }
- assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
-
- // Select the input vector, guarding against out of range extract vector.
- int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
-
- if (Idx == SM_SentinelZero)
- return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
- : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
- if (Idx == SM_SentinelUndef)
- return DAG.getUNDEF(EltVT);
-
- // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
- // won't handle it.
- if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
- return SDValue();
-
- assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
- "Shuffle index out of range");
- SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
-
- // If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses =
- (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
-
- if (LdNode.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
- return SDValue();
-
- AllowedUses = 1; // only allow 1 load use if we have a bitcast
- LdNode = LdNode.getOperand(0);
- }
-
- if (!ISD::isNormalLoad(LdNode.getNode()))
- return SDValue();
-
- LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
-
- if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
- return SDValue();
-
- // If there's a bitcast before the shuffle, check if the load type and
- // alignment is valid.
- unsigned Align = LN0->getAlignment();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
- EltVT.getTypeForEVT(*DAG.getContext()));
-
- if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
- return SDValue();
-
- // All checks match so transform back to vector_shuffle so that DAG combiner
- // can finish the job
- SDLoc dl(N);
-
- // Create shuffle node taking into account the case that its a unary shuffle
- SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
- : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
- Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
- DAG.getBitcast(OriginalVT, ShuffleOps[0]),
- Shuffle, ShuffleMask);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
- EltNo);
-}
-
// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
@@ -35714,7 +36720,7 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
- unsigned RegSize = std::max(128u, InVT.getSizeInBits());
+ unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
// fill in the missing vector elements with 0.
@@ -36263,6 +37269,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ // We need at least SSE2 to do anything here.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
ISD::NodeType Opc;
SDValue Rdx =
DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
@@ -36382,8 +37392,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
- if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+ if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Integer Constant Folding.
@@ -36419,14 +37430,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
// TODO - Remove this once we can handle the implicit zero-extension of
- // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
- // combineHorizontalPredicateResult and combineBasicSADPattern.
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
+ // combineBasicSADPattern.
return SDValue();
}
- if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
- return NewOp;
-
// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -36482,7 +37490,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
};
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
@@ -36568,9 +37575,8 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
- ISD::CondCode NewCC =
- ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
- Cond.getOperand(0).getValueType().isInteger());
+ ISD::CondCode NewCC = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
NewCC);
std::swap(LHS, RHS);
@@ -36761,37 +37767,117 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
if (VT.is512BitVector())
return SDValue();
- // TODO: Add other opcodes eventually lowered into BLEND.
- for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
- UI != UE; ++UI)
- if ((UI->getOpcode() != ISD::VSELECT &&
- UI->getOpcode() != X86ISD::BLENDV) ||
- UI.getOperandNo() != 0)
+ auto OnlyUsedAsSelectCond = [](SDValue Cond) {
+ for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+ UI != UE; ++UI)
+ if ((UI->getOpcode() != ISD::VSELECT &&
+ UI->getOpcode() != X86ISD::BLENDV) ||
+ UI.getOperandNo() != 0)
+ return false;
+
+ return true;
+ };
+
+ if (OnlyUsedAsSelectCond(Cond)) {
+ APInt DemandedMask(APInt::getSignMask(BitWidth));
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
return SDValue();
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Update all the nodes so that we do not use
+ // the generic VSELECT anymore. Otherwise, we may perform wrong
+ // optimizations as we messed with the actual expectation for the vector
+ // boolean values.
+ for (SDNode *U : Cond->uses()) {
+ if (U->getOpcode() == X86ISD::BLENDV)
+ continue;
+
+ SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
+ Cond, U->getOperand(1), U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ DCI.AddToWorklist(U);
+ }
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ // Otherwise we can still at least try to simplify multiple use bits.
APInt DemandedMask(APInt::getSignMask(BitWidth));
+ APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
- if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
+ DemandedElts, DAG, 0))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ V, N->getOperand(1), N->getOperand(2));
+
+ return SDValue();
+}
+
+// Try to match:
+// (or (and (M, (sub 0, X)), (pandn M, X)))
+// which is a special case of:
+// (select M, (sub 0, X), X)
+// Per:
+// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+// We know that, if fNegate is 0 or 1:
+// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+//
+// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+// ( M ? -X : X) == ((X ^ M ) + (M & 1))
+// This lets us transform our vselect to:
+// (add (xor X, M), (and M, 1))
+// And further to:
+// (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoConditionalNegate(
+ EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ EVT MaskVT = Mask.getValueType();
+ assert(MaskVT.isInteger() &&
+ DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
+ "Mask must be zero/all-bits");
+
+ if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+ return SDValue();
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
return SDValue();
- // If we changed the computation somewhere in the DAG, this change will
- // affect all users of Cond. Update all the nodes so that we do not use
- // the generic VSELECT anymore. Otherwise, we may perform wrong
- // optimizations as we messed with the actual expectation for the vector
- // boolean values.
- for (SDNode *U : Cond->uses()) {
- if (U->getOpcode() == X86ISD::BLENDV)
- continue;
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
- SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
- Cond, U->getOperand(1), U->getOperand(2));
- DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
- DCI.AddToWorklist(U);
- }
- DCI.CommitTargetLoweringOpt(TLO);
- return SDValue(N, 0);
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+ else
+ return SDValue();
+
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+ // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+ // (vselect M, X, (sub (0, X))), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
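Annotation (not part of the patch): the conditional-negate identity the relocated function depends on, select(M, -X, X) == (X ^ M) - M for a lane mask M that is all-ones or all-zeros, is easy to verify over a single lane, including the operand swap for the select(M, X, -X) orientation that the PR 27251 comment explains:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t M : {0x00000000u, 0xFFFFFFFFu}) {   // lane mask: 0 or all-ones
        for (uint32_t X : {0u, 1u, 42u, 0x80000000u, 0xDEADBEEFu}) {
          uint32_t NegX = 0u - X;
          // select(M, -X, X): negate when the mask is set.
          uint32_t Sel = M ? NegX : X;
          assert(Sel == (X ^ M) - M);        // (xor X, M) - M
          // The swapped orientation select(M, X, -X) is the negation of the
          // above, so the SUB operands swap: M - (X ^ M).
          uint32_t SelSwap = M ? X : NegX;
          assert(SelSwap == M - (X ^ M));
        }
      }
      return 0;
    }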
/// Do target-specific dag combines on SELECT and VSELECT nodes.
@@ -36811,10 +37897,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
+
+ // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
+ // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
+ // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
+ if (CondVT.isVector() && CondVT.isInteger() &&
+ CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
+ (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
+ DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
+ if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
+ DL, DAG, Subtarget))
+ return V;
// Convert vselects with constant condition into shuffles.
- if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
- DCI.isBeforeLegalizeOps()) {
+ if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
@@ -36843,7 +37940,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
@@ -36854,7 +37951,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
@@ -36873,7 +37970,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
@@ -36883,7 +37980,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
@@ -36911,7 +38008,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
@@ -36922,8 +38019,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
- (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMIN;
break;
@@ -36948,7 +38044,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
@@ -37093,7 +38189,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
@@ -37165,7 +38261,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
}
@@ -37788,7 +38884,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
}
/// Different mul shrinking modes.
-enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
@@ -37809,16 +38905,16 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
- Mode = MULS8;
+ Mode = ShrinkMode::MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
- Mode = MULU8;
+ Mode = ShrinkMode::MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
- Mode = MULS16;
+ Mode = ShrinkMode::MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
- Mode = MULU16;
+ Mode = ShrinkMode::MULU16;
else
return false;
return true;
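Annotation (not part of the patch): the thresholds above come straight from sign-bit counting inside an i32 lane: a value in [-128, 127] has at least 32 - 8 + 1 = 25 sign bits, one in [-32768, 32767] at least 17, and the unsigned ranges additionally need a known-zero top bit, giving 24 and 16. A standalone check of the boundary values with a ComputeNumSignBits-style counter:

    #include <cassert>
    #include <cstdint>

    // Number of leading bits equal to the sign bit, including the sign bit
    // itself (same convention as ComputeNumSignBits).
    static unsigned numSignBits(int32_t v) {
      unsigned n = 1;
      for (int bit = 30; bit >= 0 && ((v >> bit) & 1) == ((v >> 31) & 1); --bit)
        ++n;
      return n;
    }

    int main() {
      assert(numSignBits(127) == 25);   // 0x0000007F: 24 zeros + sign bit
      assert(numSignBits(-128) == 25);  // 0xFFFFFF80: 24 ones + sign bit
      assert(numSignBits(128) == 24);   // just outside MULS8's range
      assert(numSignBits(32767) == 17); // MULS16 boundary
      return 0;
    }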
@@ -37888,15 +38984,17 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
- if (Mode == MULU8 || Mode == MULS8)
- return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
+ return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND,
DL, VT, MulLo);
MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
- SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- ReducedVT, NewN0, NewN1);
+ SDValue MulHi =
+ DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
@@ -38294,7 +39392,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
- if (N1SplatC->getAPIntValue() == 1)
+ if (N1SplatC->isOne())
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
@@ -38546,15 +39644,15 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
- assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
+ assert(N->getOperand(1).getValueType() == MVT::i8 &&
+ "Unexpected shift amount type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
- unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
+ unsigned ShiftVal = N->getConstantOperandVal(1);
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
@@ -39094,6 +40192,71 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
}
+
+// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
+// where C is a mask containing the same number of bits as the setcc and
+// where the setcc will freely zero the upper bits of the k-register. We can
+// replace the undef in the concat with 0s and remove the AND. This mainly
+// helps with v2i1/v4i1 setcc being cast to scalar.
+static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
+
+ EVT VT = N->getValueType(0);
+
+ // Make sure this is an AND with constant. We will check the value of the
+ // constant later.
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return SDValue();
+
+ // This is implied by the ConstantSDNode.
+ assert(!VT.isVector() && "Expected scalar VT!");
+
+ if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
+ !N->getOperand(0).hasOneUse() ||
+ !N->getOperand(0).getOperand(0).hasOneUse())
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Src = N->getOperand(0).getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
+ !TLI.isTypeLegal(SrcVT))
+ return SDValue();
+
+ if (Src.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+
+ // We only care about the first subvector of the concat; we expect the
+ // other subvectors to be ignored due to the AND if we make the change.
+ SDValue SubVec = Src.getOperand(0);
+ EVT SubVecVT = SubVec.getValueType();
+
+ // First subvector should be a setcc with a legal result type. The RHS of the
+ // AND should be a mask with this many bits.
+ if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
+ !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
+ return SDValue();
+
+ EVT SetccVT = SubVec.getOperand(0).getValueType();
+ if (!TLI.isTypeLegal(SetccVT) ||
+ !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
+ return SDValue();
+
+ if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
+ return SDValue();
+
+ // We passed all the checks. Rebuild the concat_vectors with zeroes
+ // and cast it back to VT.
+ SDLoc dl(N);
+ SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
+ DAG.getConstant(0, dl, SubVecVT));
+ Ops[0] = SubVec;
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
+ Ops);
+ return DAG.getBitcast(VT, Concat);
+}
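Annotation (not part of the patch): the shape matched above is easiest to see on the v2i1 case the comment calls out: a two-lane setcc concatenated with undef subvectors up to v8i1, bitcast to i8, then masked with 0b11. Replacing the undef tail with zero subvectors makes the AND a no-op, which is exactly what the rebuilt concat achieves. A standalone bit-level model, with lanes packed little-endian into a byte as the bitcast produces:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Two setcc lanes (v2i1), concatenated with three undef v2i1 subvectors
      // into v8i1, then bitcast to i8 and masked with 0b11.
      uint8_t SetccLanes = 0b10;            // lane0 = 0, lane1 = 1
      uint8_t UndefTail = 0b101101;         // arbitrary garbage in lanes 2..7
      uint8_t Concat = (uint8_t)(SetccLanes | (UndefTail << 2));
      uint8_t Masked = Concat & 0b11;       // the AND from the original IR

      // After the transform: undef subvectors replaced by zeros, AND dropped.
      uint8_t ZeroConcat = SetccLanes;      // lanes 2..7 are now known zero
      assert(Masked == ZeroConcat);
      return 0;
    }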
+
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -39132,9 +40295,12 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
APInt AllBits = APInt::getAllOnesValue(NumElts);
return DAG.getSetCC(dl, MVT::i1, Mask,
@@ -39143,6 +40309,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
+ return V;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -39290,68 +40459,6 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
return true;
}
-// Try to match:
-// (or (and (M, (sub 0, X)), (pandn M, X)))
-// which is a special case of vselect:
-// (vselect M, (sub 0, X), X)
-// Per:
-// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
-// We know that, if fNegate is 0 or 1:
-// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
-//
-// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
-// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
-// ( M ? -X : X) == ((X ^ M ) + (M & 1))
-// This lets us transform our vselect to:
-// (add (xor X, M), (and M, 1))
-// And further to:
-// (sub (xor X, M), M)
-static SDValue combineLogicBlendIntoConditionalNegate(
- EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
- SelectionDAG &DAG, const X86Subtarget &Subtarget) {
- EVT MaskVT = Mask.getValueType();
- assert(MaskVT.isInteger() &&
- DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
- "Mask must be zero/all-bits");
-
- if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
- return SDValue();
- if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
- return SDValue();
-
- auto IsNegV = [](SDNode *N, SDValue V) {
- return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
- ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
- };
-
- SDValue V;
- if (IsNegV(Y.getNode(), X))
- V = X;
- else if (IsNegV(X.getNode(), Y))
- V = Y;
- else
- return SDValue();
-
- SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
- SDValue SubOp2 = Mask;
-
- // If the negate was on the false side of the select, then
- // the operands of the SUB need to be swapped. PR 27251.
- // This is because the pattern being matched above is
- // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
- // but if the pattern matched was
- // (vselect M, X, (sub (0, X))), that is really negation of the pattern
- // above, -(vselect M, (sub 0, X), X), and therefore the replacement
- // pattern also needs to be a negation of the replacement pattern above.
- // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
- // sub accomplishes the negation of the replacement pattern.
- if (V == Y)
- std::swap(SubOp1, SubOp2);
-
- SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
- return DAG.getBitcast(VT, Res);
-}
-
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -39512,66 +40619,20 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return Ret;
}
-static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If this is SSE1 only convert to FOR to avoid scalarization.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
- return DAG.getBitcast(MVT::v4i32,
- DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
- DAG.getBitcast(MVT::v4f32, N0),
- DAG.getBitcast(MVT::v4f32, N1)));
- }
-
- // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
- // TODO: Support multiple SrcOps.
- if (VT == MVT::i1) {
- SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
- SrcOps.size() == 1) {
- SDLoc dl(N);
- unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
- EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
- SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
- if (Mask) {
- APInt AllBits = APInt::getNullValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
- }
- }
- }
-
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
- return R;
-
- if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
- return FPLogic;
-
- if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
- return R;
-
- if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
- return R;
-
- // Attempt to recursively combine an OR of shuffles.
- if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
- SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
- return Res;
- }
-
- if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
@@ -39589,11 +40650,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
+ EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
SDValue ShAmt0 = N0.getOperand(1);
- if (ShAmt0.getValueType() != MVT::i8)
+ if (ShAmt0.getValueType() != ShiftVT)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
- if (ShAmt1.getValueType() != MVT::i8)
+ if (ShAmt1.getValueType() != ShiftVT)
return SDValue();
// Peek through any modulo shift masks.
@@ -39628,12 +40691,12 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
std::swap(ShMsk0, ShMsk1);
}
- auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
- SDValue Amt) {
+ auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
+ SDValue Amt) {
if (Opc == ISD::FSHR)
std::swap(Op0, Op1);
return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
+ DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
};
// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
@@ -39674,7 +40737,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandAPInt(1) == 1) {
+ Op1.getConstantOperandAPInt(1).isOneValue()) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
@@ -39689,6 +40752,70 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
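Annotation (not part of the patch): the matcher now emits ISD::FSHL/FSHR, so the pattern in its comment, (x << c) | (y >> (Bits - c)), must line up with the funnel-shift definition. A standalone 32-bit model, assuming c is already reduced modulo the width and nonzero, which is the only form the guarded pattern produces:

    #include <cassert>
    #include <cstdint>

    // FSHL(x, y, c): concatenate x:y (x in the high half) and take the top
    // 32 bits after shifting left by c.
    static uint32_t fshl32(uint32_t x, uint32_t y, unsigned c) {
      c &= 31; // FSHL's shift amount is taken modulo the bit width
      if (c == 0) return x;
      return (x << c) | (y >> (32 - c));
    }

    int main() {
      uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
      for (unsigned c = 1; c < 32; ++c)
        assert(fshl32(X, Y, c) == ((X << c) | (Y >> (32 - c))));
      assert(fshl32(X, Y, 0) == X); // degenerate case the DAG pattern never forms
      return 0;
    }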
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // If this is SSE1 only convert to FOR to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(MVT::v4i32,
+ DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N0),
+ DAG.getBitcast(MVT::v4f32, N1)));
+ }
+
+ // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
+ if (Mask) {
+ APInt AllBits = APInt::getNullValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ }
+ }
+ }
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
+ return R;
+
+ // Attempt to recursively combine an OR of shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
@@ -39758,8 +40885,8 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
- case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
- case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
+ case MVT::v4i32:
+ case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
@@ -39783,7 +40910,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
- return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
+ return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
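Annotation (not part of the patch): this fold rewrites xor(ashr(X, BitWidth-1), -1) as a greater-than compare against -1: the arithmetic shift splats the sign bit into an all-ones or all-zero lane, and inverting that is precisely "X > -1". A one-lane standalone check, again assuming arithmetic >> on signed values as all mainstream compilers provide:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t X : {INT32_MIN, -5, -1, 0, 1, 123, INT32_MAX}) {
        uint32_t Splat = (uint32_t)(X >> 31);          // ashr by BitWidth-1
        uint32_t NotSplat = Splat ^ 0xFFFFFFFFu;       // the XOR with all-ones
        uint32_t Pcmpgt = (X > -1) ? 0xFFFFFFFFu : 0u; // pcmpgt(X, -1) lane
        assert(NotSplat == Pcmpgt);
      }
      return 0;
    }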
/// Detect patterns of truncation with unsigned saturation:
@@ -39950,7 +41077,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
- unsigned TruncOpc;
+ unsigned TruncOpc = 0;
SDValue SatVal;
if (auto SSatVal = detectSSatPattern(In, VT)) {
SatVal = SSatVal;
@@ -40252,6 +41379,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
@@ -40279,6 +41407,7 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
@@ -40314,10 +41443,10 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
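+ // ML is unindexed (asserted above), so its offset and addressing mode can
+ // be forwarded to the new load unchanged.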
- SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
- ML->getMask(), DAG.getUNDEF(VT),
- ML->getMemoryVT(), ML->getMemOperand(),
- ML->getExtensionType());
+ SDValue NewML = DAG.getMaskedLoad(
+ VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
+ DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
+ ML->getAddressingMode(), ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
ML->getPassThru());
@@ -40403,8 +41532,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ Mst->getBasePtr(), Mst->getOffset(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode(), true);
}
return SDValue();
@@ -40593,59 +41723,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
cast<LoadSDNode>(St->getValue())->isSimple() &&
St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
- SmallVector<SDValue, 8> Ops;
if (!ISD::isNormalLoad(Ld))
return SDValue();
- // If this is not the MMX case, i.e. we are just turning i64 load/store
- // into f64 load/store, avoid the transformation if there are multiple
- // uses of the loaded value.
- if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ // Avoid the transformation if there are multiple uses of the loaded value.
+ if (!Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
- // If we are a 64-bit capable x86, lower to a single movq load/store pair.
- // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
- // pair instead.
- if (Subtarget.is64Bit() || F64IsLegal) {
- MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
- SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
- Ld->getMemOperand());
-
- // Make sure new load is placed in same chain order.
- DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
- return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
- St->getMemOperand());
- }
-
- // Otherwise, lower to two pairs of 32-bit loads / stores.
- SDValue LoAddr = Ld->getBasePtr();
- SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
-
- SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
- Ld->getPointerInfo(), Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
- Ld->getPointerInfo().getWithOffset(4),
- MinAlign(Ld->getAlignment(), 4),
- Ld->getMemOperand()->getFlags());
- // Make sure new loads are placed in same chain order.
- DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
- DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
-
- LoAddr = St->getBasePtr();
- HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
-
- SDValue LoSt =
- DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
- SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
- St->getPointerInfo().getWithOffset(4),
- MinAlign(St->getAlignment(), 4),
- St->getMemOperand()->getFlags());
- return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ // Lower to a single movq load/store pair.
+ SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getMemOperand());
+
+ // Make sure new load is placed in same chain order.
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+ return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
+ St->getMemOperand());
}
// This is similar to the above case, but here we handle a scalar 64-bit
@@ -41351,23 +42446,25 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
EVT VT = Op->getValueType(0);
- // Make sure the element size does't change.
+
+ // Make sure the element size doesn't change.
if (VT.getScalarSizeInBits() != ScalarSize)
return SDValue();
- if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::VECTOR_SHUFFLE: {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
- if (!SVOp->getOperand(1).isUndef())
+ if (!Op.getOperand(1).isUndef())
return SDValue();
- if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1))
+ if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
- return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
- SVOp->getMask());
- return SDValue();
+ return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
+ cast<ShuffleVectorSDNode>(Op)->getMask());
+ break;
}
- unsigned Opc = Op.getOpcode();
- if (Opc == ISD::INSERT_VECTOR_ELT) {
+ case ISD::INSERT_VECTOR_ELT: {
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
SDValue InsVector = Op.getOperand(0);
@@ -41378,34 +42475,35 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
- return SDValue();
+ break;
}
+ case ISD::FSUB:
+ case ISD::XOR:
+ case X86ISD::FXOR: {
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
- if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
- return SDValue();
-
- SDValue Op1 = Op.getOperand(1);
- SDValue Op0 = Op.getOperand(0);
-
- // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
- // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
- // masks and hence we swap the operands.
- if (Opc == ISD::FSUB)
- std::swap(Op0, Op1);
+ // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
+ // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
+ // masks and hence we swap the operands.
+ if (Opc == ISD::FSUB)
+ std::swap(Op0, Op1);
- APInt UndefElts;
- SmallVector<APInt, 16> EltBits;
- // Extract constant bits and see if they are all sign bit masks. Ignore the
- // undef elements.
- if (getTargetConstantBitsFromNode(Op1, ScalarSize,
- UndefElts, EltBits,
- /* AllowWholeUndefs */ true,
- /* AllowPartialUndefs */ false)) {
- for (unsigned I = 0, E = EltBits.size(); I < E; I++)
- if (!UndefElts[I] && !EltBits[I].isSignMask())
- return SDValue();
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ // Extract constant bits and see if they are all sign bit masks. Ignore the
+ // undef elements.
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
+ /* AllowWholeUndefs */ true,
+ /* AllowPartialUndefs */ false)) {
+ for (unsigned I = 0, E = EltBits.size(); I < E; I++)
+ if (!UndefElts[I] && !EltBits[I].isSignMask())
+ return SDValue();
- return peekThroughBitcasts(Op0);
+ return peekThroughBitcasts(Op0);
+ }
+ }
}
return SDValue();
@@ -41642,8 +42740,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue LHS = N->getOperand(0);
- auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
+ if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
return SDValue();
X86::CondCode NewCC = X86::GetOppositeBranchCondition(
@@ -41817,8 +42914,9 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
- // Only perform optimizations if UnsafeMath is used.
- if (!DAG.getTarget().Options.UnsafeFPMath)
+ // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
+ if (!DAG.getTarget().Options.NoNaNsFPMath ||
+ !DAG.getTarget().Options.NoSignedZerosFPMath)
return SDValue();
// With both flags set, convert the FMAX and FMIN nodes
@@ -41943,6 +43041,7 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ // FIXME: Handle strict fp nodes.
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
@@ -41951,7 +43050,7 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
- LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
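+ // Cast 'In', the value whose load was matched above, rather than
+ // re-reading operand 0.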
+ LoadSDNode *LN = cast<LoadSDNode>(In);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDLoc dl(N);
@@ -42569,6 +43668,44 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
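+/// For example: (or (or (xor A, B), (xor C, D)), (xor E, F)).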
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+ if (X.getOpcode() == ISD::OR)
+ return isOrXorXorTree(X.getOperand(0), false) &&
+ isOrXorXorTree(X.getOperand(1), false);
+ if (Root)
+ return false;
+ return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
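+/// Each XOR leaf is converted to vector form via SToV and becomes either a
+/// vector compare or a plain vector XOR (when PTEST is available); OR nodes
+/// merge the partial results.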
+template<typename F>
+static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
+ EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
+ SDValue Op0 = X.getOperand(0);
+ SDValue Op1 = X.getOperand(1);
+ if (X.getOpcode() == ISD::OR) {
+ SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ if (VecVT != CmpVT)
+ return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+ if (HasPT)
+ return DAG.getNode(ISD::OR, DL, VecVT, A, B);
+ return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+ } else if (X.getOpcode() == ISD::XOR) {
+ SDValue A = SToV(Op0);
+ SDValue B = SToV(Op1);
+ if (VecVT != CmpVT)
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+ if (HasPT)
+ return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+ }
+ llvm_unreachable("Impossible");
+}
+
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
@@ -42589,10 +43726,8 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
- bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
- X.getOperand(0).getOpcode() == ISD::XOR &&
- X.getOperand(1).getOpcode() == ISD::XOR;
- if (isNullConstant(Y) && !IsOrXorXorCCZero)
+ bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+ if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
@@ -42602,66 +43737,102 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
X.getOpcode() == ISD::LOAD;
};
if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
- !IsOrXorXorCCZero)
+ !IsOrXorXorTreeCCZero)
return SDValue();
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
bool HasAVX = Subtarget.hasAVX();
- // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512.
+ // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
+ // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && HasAVX) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
+
+ // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
+ // vector registers are essentially free. (Technically, widening registers
+ // prevents load folding, but the tradeoff is worth it.)
+ bool PreferKOT = Subtarget.preferMaskRegisters();
+ bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
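+ // Without VLX there are no 128/256-bit compares into mask registers, so
+ // narrower operands must first be widened to 512 bits (NeedZExt).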
+
EVT VecVT = MVT::v16i8;
- EVT CmpVT = MVT::v16i8;
- if (OpSize == 256)
- VecVT = CmpVT = MVT::v32i8;
- if (OpSize == 512) {
+ EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
+ if (OpSize == 256) {
+ VecVT = MVT::v32i8;
+ CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
+ }
+ EVT CastVT = VecVT;
+ bool NeedsAVX512FCast = false;
+ if (OpSize == 512 || NeedZExt) {
if (Subtarget.hasBWI()) {
VecVT = MVT::v64i8;
CmpVT = MVT::v64i1;
+ if (OpSize == 512)
+ CastVT = VecVT;
} else {
VecVT = MVT::v16i32;
CmpVT = MVT::v16i1;
+ CastVT = OpSize == 512 ? VecVT :
+ OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
+ NeedsAVX512FCast = true;
+ }
+ }
+
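+ // ScalarToVector bitcasts a scalar operand to CastVT and, when widening is
+ // required, zero-extends it into a full VecVT vector via INSERT_SUBVECTOR.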
+ auto ScalarToVector = [&](SDValue X) -> SDValue {
+ bool TmpZext = false;
+ EVT TmpCastVT = CastVT;
+ if (X.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue OrigX = X.getOperand(0);
+ unsigned OrigSize = OrigX.getScalarValueSizeInBits();
+ if (OrigSize < OpSize) {
+ if (OrigSize == 128) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
+ X = OrigX;
+ TmpZext = true;
+ } else if (OrigSize == 256) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
+ X = OrigX;
+ TmpZext = true;
+ }
+ }
}
- }
+ X = DAG.getBitcast(TmpCastVT, X);
+ if (!NeedZExt && !TmpZext)
+ return X;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
+ DAG.getConstant(0, DL, VecVT), X,
+ DAG.getConstant(0, DL, VecIdxVT));
+ };
SDValue Cmp;
- if (IsOrXorXorCCZero) {
+ if (IsOrXorXorTreeCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before doing a
// MOVMSK.
- SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
- SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
- SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
- SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
- if (VecVT == CmpVT && HasPT) {
- SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B);
- SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D);
- Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2);
- } else {
- SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
- SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
- Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
- }
+ Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
} else {
- SDValue VecX = DAG.getBitcast(VecVT, X);
- SDValue VecY = DAG.getBitcast(VecVT, Y);
- if (VecVT == CmpVT && HasPT) {
+ SDValue VecX = ScalarToVector(X);
+ SDValue VecY = ScalarToVector(Y);
+ if (VecVT != CmpVT) {
+ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
+ } else if (HasPT) {
Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
} else {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
}
}
- // For 512-bits we want to emit a setcc that will lower to kortest.
+ // AVX512 should emit a setcc that will lower to kortest.
if (VecVT != CmpVT) {
- EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16;
- SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT);
- return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC);
+ EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
+ CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
+ return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
+ DAG.getConstant(0, DL, KRegVT), CC);
}
if (HasPT) {
SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
@@ -42687,9 +43858,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
+ const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ const SDValue LHS = N->getOperand(0);
+ const SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
@@ -42716,30 +43887,35 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
- // Put build_vectors on the right.
- if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
- std::swap(LHS, RHS);
- CC = ISD::getSetCCSwappedOperands(CC);
+ // Using temporaries to avoid messing up operand ordering for later
+ // transformations if this doesn't work.
+ SDValue Op0 = LHS;
+ SDValue Op1 = RHS;
+ ISD::CondCode TmpCC = CC;
+ // Put build_vector on the right.
+ if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
+ std::swap(Op0, Op1);
+ TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
}
bool IsSEXT0 =
- (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
- bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+ (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
+ (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
if (IsSEXT0 && IsVZero1) {
- assert(VT == LHS.getOperand(0).getValueType() &&
+ assert(VT == Op0.getOperand(0).getValueType() &&
"Uexpected operand type");
- if (CC == ISD::SETGT)
+ if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
- if (CC == ISD::SETLE)
+ if (TmpCC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
- if (CC == ISD::SETEQ || CC == ISD::SETGE)
- return DAG.getNOT(DL, LHS.getOperand(0), VT);
+ if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
+ return DAG.getNOT(DL, Op0.getOperand(0), VT);
- assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+ assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
"Unexpected condition code!");
- return LHS.getOperand(0);
+ return Op0.getOperand(0);
}
}
@@ -42752,8 +43928,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
VT.getVectorElementType() == MVT::i1 &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
- SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
- N->getOperand(2));
+ SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
}
@@ -42985,16 +44160,18 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
- if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
- N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
- VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+ if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
+ Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
+ VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
- if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
@@ -43004,12 +44181,19 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
- SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+ SDValue SourceConst;
+ if (IsStrict)
+ SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
+ {N->getOperand(0), SDValue(BV, 0)});
+ else
+ SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
- SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
- N->getOperand(0)->getOperand(0), MaskConst);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
+ MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
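+ // For the strict form, also return the chain produced by the folded
+ // conversion.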
+ if (IsStrict)
+ return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
return Res;
}
@@ -43053,7 +44237,8 @@ static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
+ bool IsStrict = N->isStrictFPOpcode();
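+ // Strict FP nodes carry their chain in operand 0; the value operand
+ // follows it.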
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -43067,14 +44252,21 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
- if (DAG.SignBitIsZero(Op0))
+ if (DAG.SignBitIsZero(Op0)) {
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
+ {N->getOperand(0), Op0});
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+ }
return SDValue();
}
@@ -43084,11 +44276,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
+ bool IsStrict = N->isStrictFPOpcode();
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
- SDValue Op0 = N->getOperand(0);
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -43100,6 +44293,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
@@ -43117,6 +44313,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), Trunc});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
}
// If we're after legalize and the type is v2i32 we need to shuffle and
@@ -43125,6 +44324,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
{ 0, 2, -1, -1 });
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {N->getOperand(0), Shuf});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
}
}
@@ -43148,13 +44350,16 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Ld->isSimple() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
- SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
+ std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
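+ // BuildFILD returns the {FILD result, output chain} pair.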
- DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
- return FILDChain;
+ DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
+ return Tmp.first;
}
}
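+ // The remaining folds are not strict-FP aware, so stop here for strict
+ // nodes.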
+ if (IsStrict)
+ return SDValue();
+
if (SDValue V = combineToFPTruncExtElt(N, DAG))
return V;
@@ -43579,7 +44784,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
auto UsePMADDWD = [&](SDValue Op) {
ShrinkMode Mode;
return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 &&
+ canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
+ Mode != ShrinkMode::MULU16 &&
(!Subtarget.hasSSE41() ||
(Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
Op->isOnlyUserOf(Op.getOperand(1).getNode())));
@@ -43784,7 +44990,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
// Check if the Mul source can be safely shrunk.
ShrinkMode Mode;
- if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
+ if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
+ Mode == ShrinkMode::MULU16)
return SDValue();
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
@@ -44468,7 +45675,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue InVec = N->getOperand(0);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
- EVT InVecBCVT = InVecBC.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
@@ -44512,31 +45718,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
- // Try to move vector bitcast after extract_subv by scaling extraction index:
- // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
- // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
- if (InVec != InVecBC && InVecBCVT.isVector()) {
- unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
- unsigned DestNumElts = InVecVT.getVectorNumElements();
- if ((DestNumElts % SrcNumElts) == 0) {
- unsigned DestSrcRatio = DestNumElts / SrcNumElts;
- if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
- unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
- EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
- InVecBCVT.getScalarType(), NewExtNumElts);
- if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
- TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
- unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
- SDLoc DL(N);
- SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
- SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- InVecBC, NewIndex);
- return DAG.getBitcast(VT, NewExtract);
- }
- }
- }
- }
-
// If we are extracting from an insert into a zero vector, replace with a
// smaller insert into zero if we don't access less than the original
// subvector. Don't do this for i1 vectors.
@@ -44583,7 +45764,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTUDQ2PD(v4i32).
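+ // The 128-bit form of VCVTUDQ2PD requires AVX512VL, hence the hasVLX()
+ // check.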
- if (InOpcode == ISD::UINT_TO_FP &&
+ if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
@@ -44751,6 +45932,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
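+ // A shift of an all-zeros mask is still all zeros.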
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -44802,8 +45986,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
- case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget);
- case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ return combineSIntToFP(N, DAG, DCI, Subtarget);
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);