author | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
commit | cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/X86/X86ISelLowering.cpp
parent | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
download | src-cfca06d7963fa0909f90483b42a6d7d194d01e08.tar.gz, src-cfca06d7963fa0909f90483b42a6d7d194d01e08.zip
Vendor import of llvm-project master 2e10b7a39b9, the last commit before
the llvmorg-12-init tag, from which release/11.x was branched.
Notes:
svn path=/vendor/llvm-project/master/; revision=363578
svn path=/vendor/llvm-project/llvmorg-11-init-20887-g2e10b7a39b9/; revision=363579; tag=vendor/llvm-project/llvmorg-11-init-20887-g2e10b7a39b9
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 10153
1 file changed, 6524 insertions, 3629 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0f152968ddfd..450927aaf5cc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12,7 +12,8 @@ //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" -#include "Utils/X86ShuffleDecode.h" +#include "MCTargetDesc/X86ShuffleDecode.h" +#include "X86.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" @@ -28,6 +29,7 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -37,7 +39,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -75,13 +76,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); -// Added in 10.0. -static cl::opt<bool> EnableOldKNLABI( - "x86-enable-old-knl-abi", cl::init(false), - cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " - "one ZMM register on AVX512F, but not AVX512BW targets."), - cl::Hidden); - static cl::opt<bool> MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " @@ -164,7 +158,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. - // FIXME: Should we be limitting the atomic size on other configs? Default is + // FIXME: Should we be limiting the atomic size on other configs? Default is // 1024. if (!Subtarget.hasCmpxchg8b()) setMaxAtomicSizeInBitsSupported(32); @@ -190,12 +184,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. - setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); - setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); - setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) { + setCondCodeAction(ISD::SETOEQ, VT, Expand); + setCondCodeAction(ISD::SETUNE, VT, Expand); + } // Integer absolute. if (Subtarget.hasCMov()) { @@ -206,10 +198,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { + // For slow shld targets we only lower for code size. + LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? 
Custom : Legal; + + setOperationAction(ShiftOp , MVT::i8 , Custom); setOperationAction(ShiftOp , MVT::i16 , Custom); - setOperationAction(ShiftOp , MVT::i32 , Custom); + setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction); if (Subtarget.is64Bit()) - setOperationAction(ShiftOp , MVT::i64 , Custom); + setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction); } if (!Subtarget.useSoftFloat()) { @@ -270,6 +266,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + + setOperationAction(ISD::LRINT, MVT::f32, Custom); + setOperationAction(ISD::LRINT, MVT::f64, Custom); + setOperationAction(ISD::LLRINT, MVT::f32, Custom); + setOperationAction(ISD::LLRINT, MVT::f64, Custom); + + if (!Subtarget.is64Bit()) { + setOperationAction(ISD::LRINT, MVT::i64, Custom); + setOperationAction(ISD::LLRINT, MVT::i64, Custom); + } } // Handle address space casts between mixed sized pointers. @@ -347,34 +353,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { - setOperationAction(ISD::CTLZ , MVT::i8 , Custom); - setOperationAction(ISD::CTLZ , MVT::i16 , Custom); - setOperationAction(ISD::CTLZ , MVT::i32 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); - if (Subtarget.is64Bit()) { - setOperationAction(ISD::CTLZ , MVT::i64 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { + if (VT == MVT::i64 && !Subtarget.is64Bit()) + continue; + setOperationAction(ISD::CTLZ , VT, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); } } - // Special handling for half-precision floating point conversions. - // If we don't have F16C support, then lower half float conversions - // into library calls. - if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { - setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16, + ISD::STRICT_FP_TO_FP16}) { + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. + setOperationAction( + Op, MVT::f32, + (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand); + // There's never any support for operations beyond MVT::f32. + setOperationAction(Op, MVT::f64, Expand); + setOperationAction(Op, MVT::f80, Expand); + setOperationAction(Op, MVT::f128, Expand); } - // There's never any support for operations beyond MVT::f32. 
- setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); - setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); @@ -542,7 +542,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); - } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) { + } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 && + (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); @@ -663,8 +664,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f80, Expand); setOperationAction(ISD::LROUND, MVT::f80, Expand); setOperationAction(ISD::LLROUND, MVT::f80, Expand); - setOperationAction(ISD::LRINT, MVT::f80, Expand); - setOperationAction(ISD::LLRINT, MVT::f80, Expand); + setOperationAction(ISD::LRINT, MVT::f80, Custom); + setOperationAction(ISD::LLRINT, MVT::f80, Custom); // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); @@ -1038,8 +1039,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); - // With AVX512, expanding (and promoting the shifts) is better. - if (!Subtarget.hasAVX512()) + // With 512-bit registers or AVX512VL+BW, expanding (and promoting the + // shifts) is better. + if (!Subtarget.useAVX512Regs() && + !(Subtarget.hasBWI() && Subtarget.hasVLX())) setOperationAction(ISD::ROTL, MVT::v16i8, Custom); setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); @@ -1078,6 +1081,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); + + setOperationAction(ISD::FROUND, RoundedTy, Custom); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); @@ -1170,6 +1175,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + + setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); @@ -1221,7 +1229,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, MVT::v16i16, Custom); // With BWI, expanding (and promoting the shifts) is the better. 
- if (!Subtarget.hasBWI()) + if (!Subtarget.useBWIRegs()) setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); @@ -1412,19 +1420,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ANY_EXTEND, VT, Custom); } - for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { + for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Expand); + } + + for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::UADDSAT, VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Custom); - setOperationAction(ISD::USUBSAT, VT, Custom); - setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); @@ -1432,7 +1444,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Expand); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) @@ -1443,10 +1454,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // elements. 512-bits can be disabled based on prefer-vector-width and // required-vector-width function attributes. 
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { + bool HasBWI = Subtarget.hasBWI(); + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); @@ -1454,6 +1469,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); + if (HasBWI) + setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { @@ -1497,6 +1514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (HasBWI) + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use @@ -1509,19 +1528,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - // Need to custom widen this if we don't have AVX512BW. - setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); + if (HasBWI) { + // Extends from v64i1 masks to 512-bit vectors. 
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); + } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1535,47 +1561,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); - setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); } - // Without BWI we need to use custom lowering to handle MVT::v64i8 input. - for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { + for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); + setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom); + + setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MUL, MVT::v64i8, Custom); - setOperationAction(ISD::MUL, MVT::v8i64, Custom); - setOperationAction(ISD::MUL, MVT::v16i32, Legal); + setOperationAction(ISD::MULHU, MVT::v16i32, Custom); + setOperationAction(ISD::MULHS, MVT::v16i32, Custom); + setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MULHS, MVT::v64i8, Custom); + setOperationAction(ISD::MULHU, MVT::v64i8, Custom); - setOperationAction(ISD::MULHU, MVT::v16i32, Custom); - setOperationAction(ISD::MULHS, MVT::v16i32, Custom); + setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + + // The condition codes aren't legal in SSE/AVX and under AVX512 we use + // setcc all the way to isel and prefer SETGT in some isel patterns. 
+ setCondCodeAction(ISD::SETLT, VT, Custom); + setCondCodeAction(ISD::SETLE, VT, Custom); + } for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); + } - // The condition codes aren't legal in SSE/AVX and under AVX512 we use - // setcc all the way to isel and prefer SETGT in some isel patterns. - setCondCodeAction(ISD::SETLT, VT, Custom); - setCondCodeAction(ISD::SETLE, VT, Custom); + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom); } if (Subtarget.hasDQI()) { @@ -1610,36 +1658,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, + MVT::v16f32, MVT::v8f64 }) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + } + for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } - if (!Subtarget.hasBWI()) { - // Need to custom split v32i16/v64i8 bitcasts. - setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); - setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); - - // Better to split these into two 256-bit ops. 
- setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); + if (HasBWI) { + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } + } else { + setOperationAction(ISD::STORE, MVT::v32i16, Custom); + setOperationAction(ISD::STORE, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { - for (auto VT : { MVT::v16i32, MVT::v8i64 }) { + for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } - }// has AVX-512 + }// useAVX512Regs // This block controls legalization for operations that don't have // pre-AVX512 equivalents. Without VLX we use 512-bit operations for @@ -1667,6 +1721,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Subtarget.hasVLX() ? Legal : Custom); + if (Subtarget.hasDQI()) { + // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. + // v2f32 UINT_TO_FP is already custom under SSE2. + assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && + isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && + "Unexpected operation action!"); + // v2i64 FP_TO_S/UINT(v2f32) custom conversion. + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); + } + for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); @@ -1746,12 +1813,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); @@ -1759,93 +1824,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); - } - - // This block controls legalization for v32i16 and v64i8. 512-bits can be - // disabled based on prefer-vector-width and required-vector-width function - // attributes. - if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { - addRegisterClass(MVT::v32i16, &X86::VR512RegClass); - addRegisterClass(MVT::v64i8, &X86::VR512RegClass); - - // Extends from v64i1 masks to 512-bit vectors. 
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); - - setOperationAction(ISD::MUL, MVT::v32i16, Legal); - setOperationAction(ISD::MUL, MVT::v64i8, Custom); - setOperationAction(ISD::MULHS, MVT::v32i16, Legal); - setOperationAction(ISD::MULHU, MVT::v32i16, Legal); - setOperationAction(ISD::MULHS, MVT::v64i8, Custom); - setOperationAction(ISD::MULHU, MVT::v64i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); - - setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); - - for (auto VT : { MVT::v64i8, MVT::v32i16 }) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::ABS, VT, Legal); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::MLOAD, VT, Legal); - setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::UADDSAT, VT, Legal); - setOperationAction(ISD::SADDSAT, VT, Legal); - setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::SSUBSAT, VT, Legal); - setOperationAction(ISD::SELECT, VT, Custom); - - // The condition codes aren't legal in SSE/AVX and under AVX512 we use - // setcc all the way to isel and prefer SETGT in some isel patterns. 
- setCondCodeAction(ISD::SETLT, VT, Custom); - setCondCodeAction(ISD::SETLE, VT, Custom); - } - - for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { - setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); - } - - if (Subtarget.hasBITALG()) { - for (auto VT : { MVT::v64i8, MVT::v32i16 }) - setOperationAction(ISD::CTPOP, VT, Legal); - } - if (Subtarget.hasVBMI2()) { - setOperationAction(ISD::FSHL, MVT::v32i16, Custom); - setOperationAction(ISD::FSHR, MVT::v32i16, Custom); - } - } - - if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); @@ -1874,19 +1853,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); - if (Subtarget.hasDQI()) { - // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. - // v2f32 UINT_TO_FP is already custom under SSE2. - assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && - isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && - "Unexpected operation action!"); - // v2i64 FP_TO_S/UINT(v2f32) custom conversion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); - } - if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); @@ -1983,6 +1949,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); @@ -2000,6 +1967,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::STRICT_FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); @@ -2024,6 +1992,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); + setTargetDAGCombine(ISD::FP16_TO_FP); + setTargetDAGCombine(ISD::FP_EXTEND); + setTargetDAGCombine(ISD::STRICT_FP_EXTEND); + setTargetDAGCombine(ISD::FP_ROUND); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -2075,7 +2047,8 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) + if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() && + !Subtarget.hasBWI()) return TypeSplitVector; if (VT.getVectorNumElements() != 1 && @@ -2085,51 +2058,73 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } +static std::pair<MVT, unsigned> +handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC, + const X86Subtarget &Subtarget) { + 
// v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling + // convention is one that uses k registers. + if (NumElts == 2) + return {MVT::v2i64, 1}; + if (NumElts == 4) + return {MVT::v4i32, 1}; + if (NumElts == 8 && CC != CallingConv::X86_RegCall && + CC != CallingConv::Intel_OCL_BI) + return {MVT::v8i16, 1}; + if (NumElts == 16 && CC != CallingConv::X86_RegCall && + CC != CallingConv::Intel_OCL_BI) + return {MVT::v16i8, 1}; + // v32i1 passes in ymm unless we have BWI and the calling convention is + // regcall. + if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall)) + return {MVT::v32i8, 1}; + // Split v64i1 vectors if we don't have v64i8 available. + if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) { + if (Subtarget.useAVX512Regs()) + return {MVT::v64i8, 1}; + return {MVT::v32i8, 2}; + } + + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) || + NumElts > 64) + return {MVT::i8, NumElts}; + + return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0}; +} + MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // v32i1 vectors should be promoted to v32i8 to match avx2. - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) - return MVT::v32i8; - // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512() && - (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) - return MVT::i8; - // Split v64i1 vectors if we don't have v64i8 available. - if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && - CC != CallingConv::X86_RegCall) - return MVT::v32i1; - // FIXME: Should we just make these types legal and custom split operations? - if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && - Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) - return MVT::v16i32; + Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); + + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return RegisterVT; + } + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // v32i1 vectors should be promoted to v32i8 to match avx2. - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) - return 1; - // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512() && - (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) - return VT.getVectorNumElements(); - // Split v64i1 vectors if we don't have v64i8 available. - if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && - CC != CallingConv::X86_RegCall) - return 2; - // FIXME: Should we just make these types legal and custom split operations? 
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && - Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) - return 1; + Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); + + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return NumRegisters; + } + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -2140,8 +2135,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { + (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) || + VT.getVectorNumElements() > 64)) { RegisterVT = MVT::i8; IntermediateVT = MVT::i1; NumIntermediates = VT.getVectorNumElements(); @@ -2151,7 +2146,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( // Split v64i1 vectors if we don't have v64i8 available. if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && CC != CallingConv::X86_RegCall) { - RegisterVT = MVT::v32i1; + RegisterVT = MVT::v32i8; IntermediateVT = MVT::v32i1; NumIntermediates = 2; return 2; @@ -2194,20 +2189,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. -static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { +static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { - if (VTy->getBitWidth() == 128) - MaxAlign = 16; + if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128) + MaxAlign = Align(16); } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { - unsigned EltAlign = 0; + Align EltAlign; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { for (auto *EltTy : STy->elements()) { - unsigned EltAlign = 0; + Align EltAlign; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; @@ -2225,46 +2220,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget.is64Bit()) { // Max of 8 and alignment of type. - unsigned TyAlign = DL.getABITypeAlignment(Ty); + Align TyAlign = DL.getABITypeAlign(Ty); if (TyAlign > 8) - return TyAlign; + return TyAlign.value(); return 8; } - unsigned Align = 4; + Align Alignment(4); if (Subtarget.hasSSE1()) - getMaxByValAlign(Ty, Align); - return Align; -} - -/// Returns the target specific optimal type for load -/// and store operations as a result of memset, memcpy, and memmove -/// lowering. If DstAlign is zero that means it's safe to destination -/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it -/// means there isn't a need to check it against alignment requirement, -/// probably because the source does not need to be loaded. If 'IsMemset' is -/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that -/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy -/// source is constant so it does not need to be loaded. 
+ getMaxByValAlign(Ty, Alignment); + return Alignment.value(); +} + /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. /// For vector ops we check that the overall size isn't larger than our /// preferred vector width. EVT X86TargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { - if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || - ((DstAlign == 0 || DstAlign >= 16) && - (SrcAlign == 0 || SrcAlign >= 16)))) { + if (Op.size() >= 16 && + (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { // FIXME: Check if unaligned 64-byte accesses are slow. - if (Size >= 64 && Subtarget.hasAVX512() && + if (Op.size() >= 64 && Subtarget.hasAVX512() && (Subtarget.getPreferVectorWidth() >= 512)) { return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; } // FIXME: Check if unaligned 32-byte accesses are slow. - if (Size >= 32 && Subtarget.hasAVX() && + if (Op.size() >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we @@ -2280,8 +2263,8 @@ EVT X86TargetLowering::getOptimalMemOpType( if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v4f32; - } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && - !Subtarget.is64Bit() && Subtarget.hasSSE2()) { + } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) && + Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. // Also, do not use f64 to lower memset unless this is a memset of zeros. @@ -2294,7 +2277,7 @@ EVT X86TargetLowering::getOptimalMemOpType( // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. - if (Subtarget.is64Bit() && Size >= 8) + if (Subtarget.is64Bit() && Op.size() >= 8) return MVT::i64; return MVT::i32; } @@ -2611,7 +2594,7 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, - SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA, + SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); @@ -2656,14 +2639,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); - SDValue Flag; - SmallVector<SDValue, 6> RetOps; - RetOps.push_back(Chain); // Operand #0 = Chain (updated below) - // Operand #1 = Bytes To Pop - RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, - MVT::i32)); - - // Copy the result values into the output registers. 
+ SmallVector<std::pair<Register, SDValue>, 4> RetVals; for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; @@ -2715,7 +2691,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); - RetOps.push_back(ValToCopy); + RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); // Don't emit a copytoreg. continue; } @@ -2736,31 +2712,39 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } } - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; - if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); - Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I], + Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I], Subtarget); - assert(2 == RegsToPass.size() && - "Expecting two registers after Pass64BitArgInRegs"); - // Add the second register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { - RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); + RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } + } - // Add nodes to the DAG and add the values into the RetOps list - for (auto &Reg : RegsToPass) { - Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); + SDValue Flag; + SmallVector<SDValue, 6> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + // Operand #1 = Bytes To Pop + RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, + MVT::i32)); + + // Copy the result values into the output registers. + for (auto &RetVal : RetVals) { + if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) { + RetOps.push_back(RetVal.second); + continue; // Don't emit a copytoreg. } + + Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag); + Flag = Chain.getValue(1); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Swift calling convention does not require we copy the sret argument @@ -2775,7 +2759,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. - if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { + if (Register SRetReg = FuncInfo->getSRetReturnReg()) { // When we have both sret and another return value, we should use the // original Chain stored in RetOps[0], instead of the current Chain updated // in the above loop. If we only have sret, RetOps[0] equals to Chain. @@ -2798,7 +2782,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); - unsigned RetValReg + Register RetValReg = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? 
X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); @@ -2924,7 +2908,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); @@ -3133,10 +3117,10 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); - return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), - /*isVolatile*/false, /*AlwaysInline=*/true, - /*isTailCall*/false, - MachinePointerInfo(), MachinePointerInfo()); + return DAG.getMemcpy( + Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), + /*isVolatile*/ false, /*AlwaysInline=*/true, + /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. @@ -3176,8 +3160,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!CI->isTailCall()) return false; - ImmutableCallSite CS(CI); - CallingConv::ID CalleeCC = CS.getCallingConv(); + CallingConv::ID CalleeCC = CI->getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; @@ -3341,20 +3324,223 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, #ifndef NDEBUG static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) { - return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), - [](const CCValAssign &A, const CCValAssign &B) -> bool { - return A.getValNo() < B.getValNo(); - }); + return llvm::is_sorted( + ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool { + return A.getValNo() < B.getValNo(); + }); } #endif +namespace { +/// This is a helper class for lowering variable arguments parameters. +class VarArgsLoweringHelper { +public: + VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc, + SelectionDAG &DAG, const X86Subtarget &Subtarget, + CallingConv::ID CallConv, CCState &CCInfo) + : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget), + TheMachineFunction(DAG.getMachineFunction()), + TheFunction(TheMachineFunction.getFunction()), + FrameInfo(TheMachineFunction.getFrameInfo()), + FrameLowering(*Subtarget.getFrameLowering()), + TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv), + CCInfo(CCInfo) {} + + // Lower variable arguments parameters. 
+ void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize); + +private: + void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize); + + void forwardMustTailParameters(SDValue &Chain); + + bool is64Bit() { return Subtarget.is64Bit(); } + bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); } + + X86MachineFunctionInfo *FuncInfo; + const SDLoc &DL; + SelectionDAG &DAG; + const X86Subtarget &Subtarget; + MachineFunction &TheMachineFunction; + const Function &TheFunction; + MachineFrameInfo &FrameInfo; + const TargetFrameLowering &FrameLowering; + const TargetLowering &TargLowering; + CallingConv::ID CallConv; + CCState &CCInfo; +}; +} // namespace + +void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( + SDValue &Chain, unsigned StackSize) { + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. We + // can skip this if there are no va_start calls. + if (is64Bit() || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall)) { + FuncInfo->setVarArgsFrameIndex( + FrameInfo.CreateFixedObject(1, StackSize, true)); + } + + // Figure out if XMM registers are in use. + assert(!(Subtarget.useSoftFloat() && + TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) && + "SSE register cannot be used when SSE is disabled!"); + + // 64-bit calling conventions support varargs and register parameters, so we + // have to do extra work to spill them in the prologue. + if (is64Bit()) { + // Find the first unallocated argument registers. + ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); + ArrayRef<MCPhysReg> ArgXMMs = + get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget); + unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); + + assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + + if (isWin64()) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by dereferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false)); + } + + SmallVector<SDValue, 6> + LiveGPRs; // list of SDValue for GPR registers keeping live input value + SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers + // keeping live input value + SDValue ALVal; // if applicable keeps SDValue for %al register + + // Gather all the live in physical registers. 
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { + Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass); + LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64)); + } + const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs); + if (!AvailableXmms.empty()) { + Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); + ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8); + for (MCPhysReg Reg : AvailableXmms) { + Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass); + LiveXMMRegs.push_back( + DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32)); + } + } + + // Store the integer parameter registers. + SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = + DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + TargLowering.getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = DAG.getNode(ISD::ADD, DL, + TargLowering.getPointerTy(DAG.getDataLayout()), + RSFIN, DAG.getIntPtrConstant(Offset, DL)); + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset)); + MemOps.push_back(Store); + Offset += 8; + } + + // Now store the XMM (fp + vector) parameter registers. + if (!LiveXMMRegs.empty()) { + SmallVector<SDValue, 12> SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL)); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + } +} + +void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { + // Find the largest legal vector type. + MVT VecVT = MVT::Other; + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget.useAVX512Regs() && + (is64Bit() || (CallConv == CallingConv::X86_VectorCall || + CallConv == CallingConv::Intel_OCL_BI))) + VecVT = MVT::v16f32; + else if (Subtarget.hasAVX()) + VecVT = MVT::v8f32; + else if (Subtarget.hasSSE2()) + VecVT = MVT::v4f32; + + // We forward some GPRs and some vector types. + SmallVector<MVT, 2> RegParmTypes; + MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32; + RegParmTypes.push_back(IntVT); + if (VecVT != MVT::Other) + RegParmTypes.push_back(VecVT); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + + // Forward AL for SysV x86_64 targets, since it is used for varargs. + if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) { + Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); + Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); + } + + // Copy all forwards from physical to virtual registers. + for (ForwardedRegister &FR : Forwards) { + // FIXME: Can we use a less constrained schedule? 
+ SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT); + FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister( + TargLowering.getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal); + } +} + +void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain, + unsigned StackSize) { + // Set FrameIndex to the 0xAAAAAAA value to mark unset state. + // If necessary, it would be set into the correct value later. + FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); + FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + + if (FrameInfo.hasVAStart()) + createVarArgAreaAndStoreRegisters(Chain, StackSize); + + if (FrameInfo.hasMustTailInVarArgFunc()) + forwardMustTailParameters(Chain); +} + SDValue X86TargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Function &F = MF.getFunction(); if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && @@ -3366,16 +3552,16 @@ SDValue X86TargetLowering::LowerFormalArguments( bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert( - !(isVarArg && canGuaranteeTCO(CallConv)) && + !(IsVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) - CCInfo.AllocateStack(32, 8); + CCInfo.AllocateStack(32, Align(8)); CCInfo.AnalyzeArguments(Ins, CC_X86); @@ -3446,7 +3632,7 @@ SDValue X86TargetLowering::LowerFormalArguments( else llvm_unreachable("Unknown argument type!"); - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } @@ -3500,7 +3686,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // the argument into a virtual register so that we can access it from the // return points. if (Ins[I].Flags.isSRet()) { - unsigned Reg = FuncInfo->getSRetReturnReg(); + Register Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); @@ -3518,147 +3704,12 @@ SDValue X86TargetLowering::LowerFormalArguments( MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); - // If the function takes variable number of arguments, make a frame index for - // the start of the first vararg value... for expansion of llvm.va_start. We - // can skip this if there are no va_start calls. - if (MFI.hasVAStart() && - (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall))) { - FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); - } - - // Figure out if XMM registers are in use. 
- assert(!(Subtarget.useSoftFloat() && - F.hasFnAttribute(Attribute::NoImplicitFloat)) && - "SSE register cannot be used when SSE is disabled!"); - - // 64-bit calling conventions support varargs and register parameters, so we - // have to do extra work to spill them in the prologue. - if (Is64Bit && isVarArg && MFI.hasVAStart()) { - // Find the first unallocated argument registers. - ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); - ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); - unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); - assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && - "SSE register cannot be used when SSE is disabled!"); - - // Gather all the live in physical registers. - SmallVector<SDValue, 6> LiveGPRs; - SmallVector<SDValue, 8> LiveXMMRegs; - SDValue ALVal; - for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { - unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); - LiveGPRs.push_back( - DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); - } - if (!ArgXMMs.empty()) { - unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); - ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); - for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { - unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); - LiveXMMRegs.push_back( - DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); - } - } - - if (IsWin64) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( - MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by dereferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); - } - - // Store the integer parameter registers. - SmallVector<SDValue, 8> MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - RSFIN, DAG.getIntPtrConstant(Offset, dl)); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset)); - MemOps.push_back(Store); - Offset += 8; - } - - if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { - // Now store the XMM (fp + vector) parameter registers. 
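// Illustrative aside (not from the patch): the Win64 "home" area math used
// above. The Win64 ABI makes every caller reserve four 8-byte home slots
// directly above the return address, so a varargs callee can spill its
// register arguments in place and va_start can simply point at the first
// unnamed slot:
//
//   [rsp + 0]   on entry: return address
//   [rsp + 8]   home slot for RCX (arg 0)
//   [rsp + 16]  home slot for RDX (arg 1)
//   [rsp + 24]  home slot for R8  (arg 2)
//   [rsp + 32]  home slot for R9  (arg 3)
//
constexpr int homeSlotEntryOffset(int ArgIndex) { return 8 + 8 * ArgIndex; }
// homeSlotEntryOffset(2) == 24: with two named arguments (NumIntRegs == 2)
// the vararg area begins at R8's home slot, which is why the fixed object
// above is created at NumIntRegs * 8 + HomeOffset and, for NumIntRegs < 4,
// the shadow area is reused rather than allocating fresh stack for va_start.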
- SmallVector<SDValue, 12> SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex(), dl)); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset(), dl)); - SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), - LiveXMMRegs.end()); - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); - } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - } - - if (isVarArg && MFI.hasMustTailInVarArgFunc()) { - // Find the largest legal vector type. - MVT VecVT = MVT::Other; - // FIXME: Only some x86_32 calling conventions support AVX512. - if (Subtarget.useAVX512Regs() && - (Is64Bit || (CallConv == CallingConv::X86_VectorCall || - CallConv == CallingConv::Intel_OCL_BI))) - VecVT = MVT::v16f32; - else if (Subtarget.hasAVX()) - VecVT = MVT::v8f32; - else if (Subtarget.hasSSE2()) - VecVT = MVT::v4f32; - - // We forward some GPRs and some vector types. - SmallVector<MVT, 2> RegParmTypes; - MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; - RegParmTypes.push_back(IntVT); - if (VecVT != MVT::Other) - RegParmTypes.push_back(VecVT); - - // Compute the set of forwarded registers. The rest are scratch. - SmallVectorImpl<ForwardedRegister> &Forwards = - FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); - - // Forward AL for SysV x86_64 targets, since it is used for varargs. - if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) { - unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); - Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); - } - - // Copy all forwards from physical to virtual registers. - for (ForwardedRegister &FR : Forwards) { - // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); - FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); - Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); - } - } + if (IsVarArg) + VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) + .lowerVarArgsParameters(Chain, StackSize); // Some CCs need callee pop. - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { @@ -3677,10 +3728,6 @@ SDValue X86TargetLowering::LowerFormalArguments( if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); - if (CallConv == CallingConv::X86_FastCall || - CallConv == CallingConv::X86_ThisCall) - // fastcc functions can't have varargs. - FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); @@ -3697,7 +3744,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). 
- int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); + int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -3705,7 +3752,7 @@ SDValue X86TargetLowering::LowerFormalArguments( if (CallConv == CallingConv::X86_RegCall || F.hasFnAttribute("no_caller_saved_registers")) { MachineRegisterInfo &MRI = MF.getRegInfo(); - for (std::pair<unsigned, unsigned> Pair : MRI.liveins()) + for (std::pair<Register, Register> Pair : MRI.liveins()) MRI.disableCalleeSavedRegister(Pair.first); } @@ -3716,12 +3763,13 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, - ISD::ArgFlagsTy Flags) const { + ISD::ArgFlagsTy Flags, + bool isByVal) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); - if (Flags.isByVal()) + if (isByVal) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( @@ -3796,18 +3844,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); - const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction()); + const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); - const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction()); + const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB); bool HasNoCfCheck = (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()); const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); MachineFunction::CallSiteInfo CSInfo; - if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); @@ -3823,7 +3870,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isTailCall = false; } - bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); + bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address @@ -3854,7 +3901,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Allocate shadow area for Win64. 
if (IsWin64) - CCInfo.AllocateStack(32, 8); + CCInfo.AllocateStack(32, Align(8)); CCInfo.AnalyzeArguments(Outs, CC_X86); @@ -3900,6 +3947,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); + } else if (CLI.IsPreallocated) { + assert(ArgLocs.back().isMemLoc() && + "cannot use preallocated attribute on a register " + "parameter"); + SmallVector<size_t, 4> PreallocatedOffsets; + for (size_t i = 0; i < CLI.OutVals.size(); ++i) { + if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) { + PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset()); + } + } + auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); + size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB); + MFI->setPreallocatedStackSize(PreallocatedId, NumBytes); + MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets); + NumBytesToPush = 0; } if (!IsSibcall && !IsMustTail) @@ -3912,7 +3974,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<std::pair<Register, SDValue>, 8> RegsToPass; SmallVector<SDValue, 8> MemOpChains; SDValue StackPtr; @@ -3927,9 +3989,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); - // Skip inalloca arguments, they have already been written. + // Skip inalloca/preallocated arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; - if (Flags.isInAlloca()) + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; CCValAssign &VA = ArgLocs[I]; @@ -3968,8 +4030,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // the caller from seeing any modifications the callee may make // as guaranteed by the `byval` attribute. int FrameIdx = MF.getFrameInfo().CreateStackObject( - Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()), - false); + Flags.getByValSize(), + std::max(Align(16), Flags.getNonZeroByValAlign()), false); SDValue StackSlot = DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); Chain = @@ -3998,12 +4060,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EnableDebugEntryValues) + if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), I); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. - unsigned ShadowReg = 0; + Register ShadowReg; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; @@ -4019,7 +4081,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, - dl, DAG, VA, Flags)); + dl, DAG, VA, Flags, isByVal)); } } @@ -4031,7 +4093,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // GOT pointer. 
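// Illustrative aside (not from the patch): on i386 ELF, position-independent
// calls that go through the PLT expect %ebx to hold the address of the GOT,
// which is why X86ISD::GlobalBaseReg is copied into EBX in RegsToPass below
// for non-tail calls. The classic materialization sequence is:
//
//   call  __x86.get_pc_thunk.bx      ; thunk does: movl (%esp), %ebx; ret
//   addl  $_GLOBAL_OFFSET_TABLE_, %ebx
//
// after which PIC PLT stubs can index the GOT relative to %ebx. Tail calls
// are handled on the separate path below.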
if (!isTailCall) { RegsToPass.push_back(std::make_pair( - unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), + Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the @@ -4069,8 +4131,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); - - RegsToPass.push_back(std::make_pair(unsigned(X86::AL), + RegsToPass.push_back(std::make_pair(Register(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } @@ -4079,7 +4140,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + RegsToPass.push_back(std::make_pair(F.PReg, Val)); } } @@ -4117,8 +4178,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; - // Skip inalloca arguments. They don't require any work. - if (Flags.isInAlloca()) + // Skip inalloca/preallocated arguments. They don't require any work. + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; @@ -4219,7 +4280,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. - if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { + if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) { const Function &CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn.hasPersonalityFn() @@ -4278,11 +4339,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); // Save heapallocsite metadata. - if (CLI.CS) - if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite")) + if (CLI.CB) + if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite")) DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); // Create the CALLSEQ_END node. @@ -4301,12 +4363,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else NumBytesForCalleeToPop = 0; // Callee pops nothing. - if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { - // No need to reset the stack after the call if the call doesn't return. To - // make the MI verify, we'll pretend the callee does it for us. - NumBytesForCalleeToPop = NumBytes; - } - // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, @@ -4337,7 +4393,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples -// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) +// of stack alignment. 
(Dynamic linkers need this - Darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the @@ -4359,7 +4415,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, SelectionDAG &DAG) const { - const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); + const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize"); @@ -4395,7 +4451,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { - unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); @@ -4578,7 +4634,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // Allocate shadow area for Win64 if (IsCalleeWin64) - CCInfo.AllocateStack(32, 8); + CCInfo.AllocateStack(32, Align(8)); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); @@ -4693,6 +4749,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: + case X86ISD::VALIGN: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -4739,6 +4796,13 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { } } +static bool isTargetShuffleSplat(SDValue Op) { + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::EXTRACT_SUBVECTOR) + return isTargetShuffleSplat(Op.getOperand(0)); + return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD; +} + SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -4972,7 +5036,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); - Info.align = Align::None(); + Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4985,7 +5049,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = Align::None(); + Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; break; } @@ -4997,7 +5061,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = Align::None(); + Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; break; } @@ -5146,7 +5210,8 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } -bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { +bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, + bool) const { // TODO: Allow 
vectors? if (VT.isVector()) return false; @@ -5374,6 +5439,19 @@ static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) { return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } +/// Return true if the value of any element in Mask is the zero sentinel value. +static bool isAnyZero(ArrayRef<int> Mask) { + return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); +} + +/// Return true if the value of any element in Mask is the zero or undef +/// sentinel values. +static bool isAnyZeroOrUndef(ArrayRef<int> Mask) { + return llvm::any_of(Mask, [](int M) { + return M == SM_SentinelZero || M == SM_SentinelUndef; + }); +} + /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { @@ -5511,6 +5589,36 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask) { return canWidenShuffleElements(Mask, WidenedMask); } +// Attempt to narrow/widen shuffle mask until it matches the target number of +// elements. +static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts, + SmallVectorImpl<int> &ScaledMask) { + unsigned NumSrcElts = Mask.size(); + assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) && + "Illegal shuffle scale factor"); + + // Narrowing is guaranteed to work. + if (NumDstElts >= NumSrcElts) { + int Scale = NumDstElts / NumSrcElts; + llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask); + return true; + } + + // We have to repeat the widening until we reach the target size, but we can + // split out the first widening as it sets up ScaledMask for us. + if (canWidenShuffleElements(Mask, ScaledMask)) { + while (ScaledMask.size() > NumDstElts) { + SmallVector<int, 16> WidenedMask; + if (!canWidenShuffleElements(ScaledMask, WidenedMask)) + return false; + ScaledMask = std::move(WidenedMask); + } + return true; + } + + return false; +} + /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); @@ -5725,7 +5833,7 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } -// Helper function to collect subvector ops that are concated together, +// Helper function to collect subvector ops that are concatenated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { @@ -5736,8 +5844,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { return true; } - if (N->getOpcode() == ISD::INSERT_SUBVECTOR && - isa<ConstantSDNode>(N->getOperand(2))) { + if (N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Src = N->getOperand(0); SDValue Sub = N->getOperand(1); const APInt &Idx = N->getConstantOperandAPInt(2); @@ -5746,19 +5853,93 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { // TODO - Handle more general insert_subvector chains. 
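// A runnable sketch (illustrative, not from the patch) of the mask
// narrowing that scaleShuffleElements above delegates to
// llvm::narrowShuffleMaskElts: every source element becomes Scale smaller
// elements, and (assuming the usual convention) negative sentinels are
// simply replicated.
#include <vector>
std::vector<int> narrowMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (int j = 0; j != Scale; ++j)
      Out.push_back(M < 0 ? M : M * Scale + j); // sentinels stay as-is
  return Out;
}
// narrowMask(2, {0, 2})  == {0, 1, 4, 5}    (a v2i64 mask seen as v4i32)
// narrowMask(2, {1, -1}) == {2, 3, -1, -1}  (undef lanes stay undef)
// Widening is the inverse and can fail: {0, 1, 4, 5} widens back to {0, 2},
// but {0, 2, 4, 6} pairs up as (0,2)/(4,6), which are not adjacent, so
// canWidenShuffleElements rejects it and scaleShuffleElements returns false.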
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && - Idx == (VT.getVectorNumElements() / 2) && - Src.getOpcode() == ISD::INSERT_SUBVECTOR && - Src.getOperand(1).getValueType() == SubVT && - isNullConstant(Src.getOperand(2))) { - Ops.push_back(Src.getOperand(1)); - Ops.push_back(Sub); - return true; + Idx == (VT.getVectorNumElements() / 2)) { + // insert_subvector(insert_subvector(undef, x, lo), y, hi) + if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueType() == SubVT && + isNullConstant(Src.getOperand(2))) { + Ops.push_back(Src.getOperand(1)); + Ops.push_back(Sub); + return true; + } + // insert_subvector(x, extract_subvector(x, lo), hi) + if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { + Ops.append(2, Sub); + return true; + } } } return false; } +static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG, + const SDLoc &dl) { + EVT VT = Op.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + unsigned SizeInBits = VT.getSizeInBits(); + assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 && + "Can't split odd sized vector"); + + SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2); + SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2); + return std::make_pair(Lo, Hi); +} + +// Split an unary integer op into 2 half sized ops. +static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Make sure we only try to split 256/512-bit types to avoid creating + // narrow vectors. + assert((Op.getOperand(0).getValueType().is256BitVector() || + Op.getOperand(0).getValueType().is512BitVector()) && + (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); + assert(Op.getOperand(0).getValueType().getVectorNumElements() == + VT.getVectorNumElements() && + "Unexpected VTs!"); + + SDLoc dl(Op); + + // Extract the Lo/Hi vectors + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, LoVT, Lo), + DAG.getNode(Op.getOpcode(), dl, HiVT, Hi)); +} + +/// Break a binary integer operation into 2 half sized ops and then +/// concatenate the result back. +static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Sanity check that all the types match. + assert(Op.getOperand(0).getValueType() == VT && + Op.getOperand(1).getValueType() == VT && "Unexpected VTs!"); + assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); + + SDLoc dl(Op); + + // Extract the LHS Lo/Hi vectors + SDValue LHS1, LHS2; + std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl); + + // Extract the RHS Lo/Hi vectors + SDValue RHS1, RHS2; + std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1), + DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2)); +} + // Helper for splitting operands of an operation to legal target size and // apply a function on each part. 
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in @@ -5815,21 +5996,17 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); - - if (!isa<ConstantSDNode>(Idx)) - return SDValue(); + unsigned IdxVal = Op.getConstantOperandVal(2); // Inserting undef is a nop. We can just return the original vector. if (SubVec.isUndef()) return Vec; - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); // Extend to natively supported kshift. @@ -5849,7 +6026,6 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, MVT SubVecVT = SubVec.getSimpleValueType(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); - assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); @@ -5900,7 +6076,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows - // isel to opimitize when bits are known zero. + // isel to optimize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, DAG.getConstant(0, dl, WideOpVT), @@ -6042,8 +6218,8 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, // Match (xor X, -1) -> X. // Match extract_subvector(xor X, -1) -> extract_subvector(X). // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). -static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { - V = peekThroughBitcasts(V); +static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) { + V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V); if (V.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) return V.getOperand(0); @@ -6067,6 +6243,35 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { return SDValue(); } +void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, + bool Lo, bool Unary) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int Pos = (i % NumEltsInLane) / 2 + LaneStart; + Pos += (Unary ? 0 : NumElts * (i % 2)); + Pos += (Lo ? 0 : NumEltsInLane / 2); + Mask.push_back(Pos); + } +} + +/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation +/// imposed by AVX and specific to the unary pattern. Example: +/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> +/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> +void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, + bool Lo) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + for (int i = 0; i < NumElts; ++i) { + int Pos = i / 2; + Pos += (Lo ? 0 : NumElts / 2); + Mask.push_back(Pos); + } +} + /// Returns a vector_shuffle node for an unpackl operation. 
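// Worked example (illustrative, not from the patch) of the two mask
// builders above for MVT::v8i16, where NumEltsInLane == 8:
//
//   createUnpackShuffleMask(v8i16, M, /*Lo=*/true,  /*Unary=*/false)
//       -> <0, 8, 1, 9, 2, 10, 3, 11>   // the punpcklwd interleave
//   createUnpackShuffleMask(v8i16, M, /*Lo=*/false, /*Unary=*/true)
//       -> <4, 4, 5, 5, 6, 6, 7, 7>     // unary hi unpack
//   createSplat2ShuffleMask(v8i16, M, /*Lo=*/true)
//       -> <0, 0, 1, 1, 2, 2, 3, 3>
//
// The two agree on 128-bit types; with wider types the unpack builder
// restarts the pattern in every 128-bit lane, while createSplat2ShuffleMask,
// as its comment says, ignores lane boundaries entirely.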
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { @@ -6102,14 +6307,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } -static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { - if (!Load || !ISD::isNormalLoad(Load)) - return nullptr; - - SDValue Ptr = Load->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); +static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) { + if (Ptr.getOpcode() == X86ISD::Wrapper || + Ptr.getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr.getOperand(0); auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) @@ -6118,6 +6319,12 @@ static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { return CNode->getConstVal(); } +static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { + if (!Load || !ISD::isNormalLoad(Load)) + return nullptr; + return getTargetConstantFromBasePtr(Load->getBasePtr()); +} + static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)); @@ -6298,23 +6505,6 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } // Extract constant bits from a broadcasted constant pool scalar. - if (Op.getOpcode() == X86ISD::VBROADCAST && - EltSizeInBits <= VT.getScalarSizeInBits()) { - if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { - unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); - unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; - - APInt UndefSrcElts(NumSrcElts, 0); - SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); - if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { - if (UndefSrcElts[0]) - UndefSrcElts.setBits(0, NumSrcElts); - SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); - return CastBitData(UndefSrcElts, SrcEltBits); - } - } - } - if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && EltSizeInBits <= VT.getScalarSizeInBits()) { auto *MemIntr = cast<MemIntrinsicSDNode>(Op); @@ -6322,16 +6512,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; SDValue Ptr = MemIntr->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); - - auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); - if (!CNode || CNode->isMachineConstantPoolEntry() || - CNode->getOffset() != 0) - return false; - - if (const Constant *C = CNode->getConstVal()) { + if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) { unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; @@ -6375,8 +6556,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } // Insert constant bits from a base and sub vector sources. - if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && - isa<ConstantSDNode>(Op.getOperand(2))) { + if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) { // TODO - support insert_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; @@ -6398,8 +6578,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } // Extract constant bits from a subvector's source. 
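// A small runnable sketch (illustrative, not from the patch) of what the
// X86ISD::VBROADCAST_LOAD handling in getTargetConstantBitsFromNode above
// amounts to once the constant-pool scalar is recovered: one element's bits
// are replicated across the whole vector and then re-bucketed by the caller
// into the requested element size.
#include <cstdint>
#include <vector>
std::vector<uint8_t> splatBytes(uint64_t Scalar, unsigned ScalarBytes,
                                unsigned VecBytes) {
  std::vector<uint8_t> Bits(VecBytes);
  for (unsigned i = 0; i != VecBytes; ++i)  // little-endian byte replication
    Bits[i] = uint8_t(Scalar >> (8 * (i % ScalarBytes)));
  return Bits;
}
// splatBytes(0x3f800000, 4, 16) yields the v4f32 constant {1.0f x 4} that a
// 16-byte broadcast of the float 1.0 from the constant pool produces.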
- if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && - isa<ConstantSDNode>(Op.getOperand(1))) { + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { // TODO - support extract_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; @@ -6468,11 +6647,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, namespace llvm { namespace X86 { -bool isConstantSplat(SDValue Op, APInt &SplatVal) { +bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { APInt UndefElts; SmallVector<APInt, 16> EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), - UndefElts, EltBits, true, false)) { + UndefElts, EltBits, true, + AllowPartialUndefs)) { int SplatIndex = -1; for (int i = 0, e = EltBits.size(); i != e; ++i) { if (UndefElts[i]) @@ -6513,20 +6693,26 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. +/// A multi-stage pack shuffle mask is created by specifying NumStages > 1. /// Note: This ignores saturation, so inputs must be checked first. static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, - bool Unary) { + bool Unary, unsigned NumStages = 1) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 0 : NumElts; + unsigned Repetitions = 1u << (NumStages - 1); + unsigned Increment = 1u << NumStages; + assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction"); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) - Mask.push_back(Elt + (Lane * NumEltsPerLane)); - for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) - Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); + for (unsigned Stage = 0; Stage != Repetitions; ++Stage) { + for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) + Mask.push_back(Elt + (Lane * NumEltsPerLane)); + for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) + Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); + } } } @@ -6597,7 +6783,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector<uint64_t, 32> RawMask; APInt RawUndefs; - SDValue ImmN; + uint64_t ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); @@ -6608,23 +6794,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::BLENDI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeBLENDMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeSHUFPMask(NumElems, MaskEltSize, - cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = 
N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeINSERTPSMask(ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::EXTRQI: @@ -6672,13 +6857,23 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::VALIGN: + assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && + "Only 32-bit and 64-bit elements are supported!"); + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeVALIGNMask(NumElems, ImmN, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(0)); + break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePALIGNRMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(0)); @@ -6686,39 +6881,34 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSLLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSRLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSHUFMask(NumElems, MaskEltSize, - cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), 
- Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSHUFHWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSHUFLWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: @@ -6770,8 +6960,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, } case X86ISD::VPERMI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeVPERMMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::MOVSS: @@ -6783,17 +6973,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::VPERM2X128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeVPERM2X128Mask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUF128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, - cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: @@ -6875,9 +7063,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; // Check if we're getting a shuffle mask with zero'd elements. - if (!AllowSentinelZero) - if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) - return false; + if (!AllowSentinelZero && isAnyZero(Mask)) + return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point @@ -7060,6 +7247,20 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, continue; } + // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF + // base vectors. + if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue Vec = V.getOperand(0); + int NumVecElts = Vec.getValueType().getVectorNumElements(); + if (Vec.isUndef() && Size == NumVecElts) { + int Idx = V.getConstantOperandVal(2); + int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements(); + if (M < Idx || (Idx + NumSubElts) <= M) + KnownUndef.setBit(i); + } + continue; + } + // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) @@ -7111,7 +7312,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask, // TODO: Use DemandedElts variant. 
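// For reference: the combined shuffle masks used throughout this file mix
// ordinary element indices with two negative sentinels, defined alongside
// the decoders in X86ShuffleDecode.h. This is what isAnyZero and the
// KnownUndef/KnownZero bookkeeping above are tracking.
enum : int {
  SM_SentinelUndef = -1, // lane may hold any value
  SM_SentinelZero = -2   // lane is known to be zero
};
// e.g. <0, SM_SentinelZero, 2, SM_SentinelUndef> keeps elements 0 and 2,
// forces element 1 to zero, and leaves element 3 unspecified; masks
// containing SM_SentinelZero are only produced when a caller passes
// AllowSentinelZero to getTargetShuffleMask.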
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - SelectionDAG &DAG, unsigned Depth, + const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts); // Attempt to decode ops that could be represented as a shuffle mask. @@ -7120,7 +7321,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, - SelectionDAG &DAG, unsigned Depth, + const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { Mask.clear(); Ops.clear(); @@ -7132,6 +7333,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) return false; assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); + unsigned NumSizeInBytes = NumSizeInBits / 8; + unsigned NumBytesPerElt = NumBitsPerElt / 8; unsigned Opcode = N.getOpcode(); switch (Opcode) { @@ -7179,8 +7382,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1); if (Known0.One.isNullValue() && Known1.One.isNullValue()) { bool IsByteMask = true; - unsigned NumSizeInBytes = NumSizeInBits / 8; - unsigned NumBytesPerElt = NumBitsPerElt / 8; APInt ZeroMask = APInt::getNullValue(NumBytesPerElt); APInt SelectMask = APInt::getNullValue(NumBytesPerElt); for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) { @@ -7220,10 +7421,21 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, true)) return false; + + // Shuffle inputs must be the same size as the result. + if (llvm::any_of(SrcInputs0, [VT](SDValue Op) { + return VT.getSizeInBits() != Op.getValueSizeInBits(); + })) + return false; + if (llvm::any_of(SrcInputs1, [VT](SDValue Op) { + return VT.getSizeInBits() != Op.getValueSizeInBits(); + })) + return false; + size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector<int, 64> Mask0, Mask1; - scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0); - scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1); + narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0); + narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1); for (size_t i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); @@ -7245,14 +7457,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); - if (!isa<ConstantSDNode>(N.getOperand(2)) || - !N->isOnlyUserOf(Sub.getNode())) + if (!N->isOnlyUserOf(Sub.getNode())) return false; uint64_t InsertIdx = N.getConstantOperandVal(2); // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). 
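// Worked example (illustrative, not from the patch) of the byte-level
// blend rule used for ISD::OR above: if at every byte position at least
// one operand is known to be zero, the OR only ever selects whole bytes
// and can be modelled as a shuffle.
//
//   X = <0xAA, 0x00, 0xCC, 0x00>   (bytes 1 and 3 known zero)
//   Y = <0x00, 0xBB, 0x00, 0xDD>   (bytes 0 and 2 known zero)
//   or(X, Y) = <0xAA, 0xBB, 0xCC, 0xDD>
//            = shuffle(X, Y, <0, 5, 2, 7>)   // indices 4..7 select from Y
//
// When some byte may be nonzero in both operands the OR genuinely mixes
// bits, no shuffle mask exists, and the known-bits checks above bail out.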
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && - Sub.getOperand(0).getValueType() == VT && - isa<ConstantSDNode>(Sub.getOperand(1))) { + Sub.getOperand(0).getValueType() == VT) { uint64_t ExtractIdx = Sub.getConstantOperandVal(1); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); @@ -7268,13 +7478,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, SubMask, DAG, Depth + 1, ResolveKnownElts)) return false; + + // Subvector shuffle inputs must not be larger than the subvector. + if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) { + return SubVT.getSizeInBits() < SubInput.getValueSizeInBits(); + })) + return false; + if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); if ((NumSubElts % SubMask.size()) == 0) { int Scale = NumSubElts / SubMask.size(); SmallVector<int,64> ScaledSubMask; - scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask); + narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask); SubMask = ScaledSubMask; } else { int Scale = SubMask.size() / NumSubElts; @@ -7284,14 +7501,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } } Ops.push_back(Src); - for (SDValue &SubInput : SubInputs) { - EVT SubSVT = SubInput.getValueType().getScalarType(); - EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT, - NumSizeInBits / SubSVT.getSizeInBits()); - Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT, - DAG.getUNDEF(AltVT), SubInput, - DAG.getIntPtrConstant(0, SDLoc(N)))); - } + Ops.append(SubInputs.begin(), SubInputs.end()); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { @@ -7304,75 +7514,83 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } return true; } - case ISD::SCALAR_TO_VECTOR: { - // Match against a scalar_to_vector of an extract from a vector, - // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. - SDValue N0 = N.getOperand(0); - SDValue SrcExtract; + case X86ISD::PINSRB: + case X86ISD::PINSRW: + case ISD::SCALAR_TO_VECTOR: + case ISD::INSERT_VECTOR_ELT: { + // Match against a insert_vector_elt/scalar_to_vector of an extract from a + // vector, for matching src/dst vector types. + SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1); + + unsigned DstIdx = 0; + if (Opcode != ISD::SCALAR_TO_VECTOR) { + // Check we have an in-range constant insertion index. + if (!isa<ConstantSDNode>(N.getOperand(2)) || + N.getConstantOperandAPInt(2).uge(NumElts)) + return false; + DstIdx = N.getConstantOperandVal(2); - if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - N0.getOperand(0).getValueType() == VT) || - (N0.getOpcode() == X86ISD::PEXTRW && - N0.getOperand(0).getValueType() == MVT::v8i16) || - (N0.getOpcode() == X86ISD::PEXTRB && - N0.getOperand(0).getValueType() == MVT::v16i8)) { - SrcExtract = N0; + // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern. + if (X86::isZeroNode(Scl)) { + Ops.push_back(N.getOperand(0)); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i); + return true; + } } + // Peek through trunc/aext/zext. + // TODO: aext shouldn't require SM_SentinelZero padding. + // TODO: handle shift of scalars. 
+ unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits(); + while (Scl.getOpcode() == ISD::TRUNCATE || + Scl.getOpcode() == ISD::ANY_EXTEND || + Scl.getOpcode() == ISD::ZERO_EXTEND) { + Scl = Scl.getOperand(0); + MinBitsPerElt = + std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits()); + } + if ((MinBitsPerElt % 8) != 0) + return false; + + // Attempt to find the source vector the scalar was extracted from. + SDValue SrcExtract; + if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Scl.getOpcode() == X86ISD::PEXTRW || + Scl.getOpcode() == X86ISD::PEXTRB) && + Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) { + SrcExtract = Scl; + } if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1))) return false; SDValue SrcVec = SrcExtract.getOperand(0); EVT SrcVT = SrcVec.getValueType(); - unsigned NumSrcElts = SrcVT.getVectorNumElements(); - unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; - - unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); - if (NumSrcElts <= SrcIdx) + if (!SrcVT.getScalarType().isByteSized()) return false; - - Ops.push_back(SrcVec); - Mask.push_back(SrcIdx); - Mask.append(NumZeros, SM_SentinelZero); - Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); - return true; - } - case X86ISD::PINSRB: - case X86ISD::PINSRW: { - SDValue InVec = N.getOperand(0); - SDValue InScl = N.getOperand(1); - SDValue InIndex = N.getOperand(2); - if (!isa<ConstantSDNode>(InIndex) || - cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts)) - return false; - uint64_t InIdx = N.getConstantOperandVal(2); - - // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. - if (X86::isZeroNode(InScl)) { - Ops.push_back(InVec); - for (unsigned i = 0; i != NumElts; ++i) - Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); - return true; + unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); + unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8); + unsigned DstByte = DstIdx * NumBytesPerElt; + MinBitsPerElt = + std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits()); + + // Create 'identity' byte level shuffle mask and then add inserted bytes. + if (Opcode == ISD::SCALAR_TO_VECTOR) { + Ops.push_back(SrcVec); + Mask.append(NumSizeInBytes, SM_SentinelUndef); + } else { + Ops.push_back(SrcVec); + Ops.push_back(N.getOperand(0)); + for (int i = 0; i != (int)NumSizeInBytes; ++i) + Mask.push_back(NumSizeInBytes + i); } - // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern. - // TODO: Expand this to support INSERT_VECTOR_ELT/etc. - unsigned ExOp = - (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); - if (InScl.getOpcode() != ExOp) - return false; - - SDValue ExVec = InScl.getOperand(0); - SDValue ExIndex = InScl.getOperand(1); - if (!isa<ConstantSDNode>(ExIndex) || - cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts)) - return false; - uint64_t ExIdx = InScl.getConstantOperandVal(1); - - Ops.push_back(InVec); - Ops.push_back(ExVec); - for (unsigned i = 0; i != NumElts; ++i) - Mask.push_back(i == InIdx ? 
NumElts + ExIdx : i); + unsigned MinBytesPerElts = MinBitsPerElt / 8; + MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt); + for (unsigned i = 0; i != MinBytesPerElts; ++i) + Mask[DstByte + i] = SrcByte + i; + for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i) + Mask[DstByte + i] = SM_SentinelZero; return true; } case X86ISD::PACKSS: @@ -7412,6 +7630,23 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, createPackShuffleMask(VT, Mask, IsUnary); return true; } + case X86ISD::VTRUNC: { + SDValue Src = N.getOperand(0); + EVT SrcVT = Src.getValueType(); + // Truncated source must be a simple vector. + if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || + (SrcVT.getScalarSizeInBits() % 8) != 0) + return false; + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits(); + unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt; + assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation"); + for (unsigned i = 0; i != NumSrcElts; ++i) + Mask.push_back(i * Scale); + Mask.append(NumElts - NumSrcElts, SM_SentinelZero); + Ops.push_back(Src); + return true; + } case X86ISD::VSHLI: case X86ISD::VSRLI: { uint64_t ShiftVal = N.getConstantOperandVal(1); @@ -7426,40 +7661,43 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, break; uint64_t ByteShift = ShiftVal / 8; - unsigned NumBytes = NumSizeInBits / 8; - unsigned NumBytesPerElt = NumBitsPerElt / 8; Ops.push_back(N.getOperand(0)); // Clear mask to all zeros and insert the shifted byte indices. - Mask.append(NumBytes, SM_SentinelZero); + Mask.append(NumSizeInBytes, SM_SentinelZero); if (X86ISD::VSHLI == Opcode) { - for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) + for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j] = i + j - ByteShift; } else { - for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) + for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j - ByteShift] = i + j; } return true; } + case X86ISD::VROTLI: + case X86ISD::VROTRI: { + // We can only decode 'whole byte' bit rotates as shuffles. + uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt); + if ((RotateVal % 8) != 0) + return false; + Ops.push_back(N.getOperand(0)); + int Offset = RotateVal / 8; + Offset = (X86ISD::VROTLI == Opcode ? 
NumBytesPerElt - Offset : Offset); + for (int i = 0; i != (int)NumElts; ++i) { + int BaseIdx = i * NumBytesPerElt; + for (int j = 0; j != (int)NumBytesPerElt; ++j) { + Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt)); + } + } + return true; + } case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); - MVT SrcVT = Src.getSimpleValueType(); - if (!SrcVT.isVector()) + if (!Src.getSimpleValueType().isVector()) return false; - - if (NumSizeInBits != SrcVT.getSizeInBits()) { - assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && - "Illegal broadcast type"); - SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), - NumSizeInBits / SrcVT.getScalarSizeInBits()); - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, - DAG.getUNDEF(SrcVT), Src, - DAG.getIntPtrConstant(0, SDLoc(N))); - } - Ops.push_back(Src); Mask.append(NumElts, 0); return true; @@ -7476,22 +7714,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, (SrcVT.getScalarSizeInBits() % 8) != 0) return false; - unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits(); bool IsAnyExtend = (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); - DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend, - Mask); - - if (NumSizeInBits != SrcVT.getSizeInBits()) { - assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && - "Illegal zero-extension type"); - SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(), - NumSizeInBits / NumSrcBitsPerElt); - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, - DAG.getUNDEF(SrcVT), Src, - DAG.getIntPtrConstant(0, SDLoc(N))); - } - + DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, + IsAnyExtend, Mask); Ops.push_back(Src); return true; } @@ -7549,7 +7775,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, APInt &KnownUndef, APInt &KnownZero, - SelectionDAG &DAG, unsigned Depth, + const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) @@ -7570,7 +7796,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - SelectionDAG &DAG, unsigned Depth = 0, + const SelectionDAG &DAG, unsigned Depth = 0, bool ResolveKnownElts = true) { EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) @@ -7583,93 +7809,107 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, KnownZero, DAG, Depth, ResolveKnownElts); } -/// Returns the scalar element that will make up the ith +/// Returns the scalar element that will make up the i'th /// element of the result of the vector shuffle. -static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, - unsigned Depth) { - if (Depth == 6) - return SDValue(); // Limit search depth. +static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, + SelectionDAG &DAG, unsigned Depth) { + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); // Limit search depth. - SDValue V = SDValue(N, 0); - EVT VT = V.getValueType(); - unsigned Opcode = V.getOpcode(); + EVT VT = Op.getValueType(); + unsigned Opcode = Op.getOpcode(); + unsigned NumElems = VT.getVectorNumElements(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 
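// Worked example (illustrative, not from the patch) for the VROTLI
// decoding earlier in getFauxShuffleMask: a v2i64 rotate-left by 16 bits
// moves whole bytes, so NumBytesPerElt == 8, Offset == 16 / 8 == 2 and,
// for VROTLI, Offset becomes 8 - 2 == 6. Result byte j of each element
// then reads source byte (6 + j) % 8, i.e. (j - 2) mod 8:
//
//   element 0: <6, 7, 0, 1, 2, 3, 4, 5>
//   element 1: <14, 15, 8, 9, 10, 11, 12, 13>
//
// A rotate by 4 bits, by contrast, fails the (RotateVal % 8) == 0 test:
// it splits bytes and cannot be expressed as a byte shuffle.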
- if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { + if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - unsigned NumElems = VT.getVectorNumElements(); - SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) - : SV->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); + SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); + return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { - MVT ShufVT = V.getSimpleValueType(); + MVT ShufVT = VT.getSimpleVT(); MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; SmallVector<SDValue, 16> ShuffleOps; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps, + ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelZero) - return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) - : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); + return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT) + : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT); if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufSVT); - assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); - SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; - return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, - Depth+1); + assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range"); + SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; + return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); } // Recurse into insert_subvector base/sub vector to find scalars. - if (Opcode == ISD::INSERT_SUBVECTOR && - isa<ConstantSDNode>(N->getOperand(2))) { - SDValue Vec = N->getOperand(0); - SDValue Sub = N->getOperand(1); - EVT SubVT = Sub.getValueType(); - unsigned NumSubElts = SubVT.getVectorNumElements(); - uint64_t SubIdx = N->getConstantOperandVal(2); + if (Opcode == ISD::INSERT_SUBVECTOR) { + SDValue Vec = Op.getOperand(0); + SDValue Sub = Op.getOperand(1); + uint64_t SubIdx = Op.getConstantOperandVal(2); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) - return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1); - return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1); + return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1); + return getShuffleScalarElt(Vec, Index, DAG, Depth + 1); + } + + // Recurse into concat_vectors sub vector to find scalars. + if (Opcode == ISD::CONCAT_VECTORS) { + EVT SubVT = Op.getOperand(0).getValueType(); + unsigned NumSubElts = SubVT.getVectorNumElements(); + uint64_t SubIdx = Index / NumSubElts; + uint64_t SubElt = Index % NumSubElts; + return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1); } // Recurse into extract_subvector src vector to find scalars. 
- if (Opcode == ISD::EXTRACT_SUBVECTOR && - isa<ConstantSDNode>(N->getOperand(1))) { - SDValue Src = N->getOperand(0); - uint64_t SrcIdx = N->getConstantOperandVal(1); - return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1); + if (Opcode == ISD::EXTRACT_SUBVECTOR) { + SDValue Src = Op.getOperand(0); + uint64_t SrcIdx = Op.getConstantOperandVal(1); + return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1); } - // Actual nodes that may contain scalar elements + // We only peek through bitcasts of the same vector width. if (Opcode == ISD::BITCAST) { - V = V.getOperand(0); - EVT SrcVT = V.getValueType(); - unsigned NumElems = VT.getVectorNumElements(); + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems) + return getShuffleScalarElt(Src, Index, DAG, Depth + 1); + return SDValue(); + } - if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) - return SDValue(); + // Actual nodes that may contain scalar elements + + // For insert_vector_elt - either return the index matching scalar or recurse + // into the base vector. + if (Opcode == ISD::INSERT_VECTOR_ELT && + isa<ConstantSDNode>(Op.getOperand(2))) { + if (Op.getConstantOperandAPInt(2) == Index) + return Op.getOperand(1); + return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1); } - if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) - return (Index == 0) ? V.getOperand(0) + if (Opcode == ISD::SCALAR_TO_VECTOR) + return (Index == 0) ? Op.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); - if (V.getOpcode() == ISD::BUILD_VECTOR) - return V.getOperand(Index); + if (Opcode == ISD::BUILD_VECTOR) + return Op.getOperand(Index); return SDValue(); } @@ -7762,10 +8002,11 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, Elt = NextElt; } - // If our first insertion is not the first index then insert into zero - // vector to break any register dependency else use SCALAR_TO_VECTOR. + // If our first insertion is not the first index or zeros are needed, then + // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high + // elements undefined). if (!V) { - if (i != 0) + if (i != 0 || NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); @@ -7964,11 +8205,12 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. - unsigned RequiredAlign = VT.getSizeInBits()/8; + Align RequiredAlign(VT.getSizeInBits() / 8); SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { + MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr); + if (!InferredAlign || *InferredAlign < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. @@ -7983,9 +8225,9 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // Ptr + (Offset & ~15). 
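The `Offset & ~15` pattern mentioned above is the 16-byte case of a general power-of-two round-down, which the Align-based code below performs via RequiredAlign.value(). As a self-contained sketch (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Round a non-negative byte offset down to the previous RequiredAlign
    // boundary; RequiredAlign must be a power of two (16 or 32 here).
    static int64_t alignDown(int64_t Offset, uint64_t RequiredAlign) {
      assert(RequiredAlign && (RequiredAlign & (RequiredAlign - 1)) == 0 &&
             "alignment must be a power of two");
      return Offset & ~int64_t(RequiredAlign - 1);
    }
    // alignDown(23, 16) == 16: the load is rebased to Ptr + 16 and the splat
    // index is adjusted to pick the element inside the aligned chunk.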
if (Offset < 0) return SDValue(); - if ((Offset % RequiredAlign) & 3) + if ((Offset % RequiredAlign.value()) & 3) return SDValue(); - int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -8024,8 +8266,8 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { case ISD::SCALAR_TO_VECTOR: return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); case ISD::SRL: - if (isa<ConstantSDNode>(Elt.getOperand(1))) { - uint64_t Idx = Elt.getConstantOperandVal(1); + if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) { + uint64_t Idx = IdxC->getZExtValue(); if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { ByteOffset += Idx / 8; return true; @@ -8033,13 +8275,13 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { } break; case ISD::EXTRACT_VECTOR_ELT: - if (isa<ConstantSDNode>(Elt.getOperand(1))) { + if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) { SDValue Src = Elt.getOperand(0); unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && findEltLoadSrc(Src, Ld, ByteOffset)) { - uint64_t Idx = Elt.getConstantOperandVal(1); + uint64_t Idx = IdxC->getZExtValue(); ByteOffset += Idx * (SrcSizeInBits / 8); return true; } @@ -8169,7 +8411,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); + LDBase->getPointerInfo(), LDBase->getOriginalAlign(), + MMOFlags); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, NewLd); @@ -8247,14 +8490,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); + // Allow v4f32 on SSE1 only targets. + // FIXME: Add more isel patterns so we can just use VT directly. + if (!Subtarget.hasSSE2() && VT == MVT::v4f32) + VecVT = MVT::v4f32; if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, - LDBase->getPointerInfo(), - LDBase->getAlignment(), - MachineMemOperand::MOLoad); + SDValue ResNode = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), + LDBase->getOriginalAlign(), MachineMemOperand::MOLoad); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, ResNode); @@ -8318,13 +8563,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, // Combine a vector ops (shuffles etc.) that is equal to build_vector load1, // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses // are consecutive, non-overlapping, and in the right order. 
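Before the combine below merges anything, each vector element must resolve (via getShuffleScalarElt) to a load, and the per-element byte addresses must line up. A sketch of the consecutiveness test implied by the comment above, over plain byte addresses (names are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Loads merge into one wide load iff element i's address is exactly
    // i * EltBytes past element 0's: consecutive, non-overlapping, ordered.
    static bool areConsecutive(const std::vector<int64_t> &ByteAddrs,
                               int64_t EltBytes) {
      for (std::size_t i = 0; i != ByteAddrs.size(); ++i)
        if (ByteAddrs[i] != ByteAddrs[0] + (int64_t)i * EltBytes)
          return false;
      return true;
    }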
-static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, +static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { SmallVector<SDValue, 64> Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) { Elts.push_back(Elt); continue; } @@ -8439,7 +8684,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SDValue Ld = BVOp->getSplatValue(&UndefElements); // Attempt to use VBROADCASTM - // From this paterrn: + // From this pattern: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // @@ -8486,8 +8731,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (Subtarget.hasAVX()) { - if (SplatBitSize <= 64 && Subtarget.hasAVX2() && - !(SplatBitSize == 64 && Subtarget.is32Bit())) { + if (SplatBitSize == 32 || SplatBitSize == 64 || + (SplatBitSize < 32 && Subtarget.hasAVX2())) { // Splatted value can fit in one INTEGER constant in constant pool. // Load the constant and broadcast it. MVT CVT = MVT::getIntegerVT(SplatBitSize); @@ -8496,46 +8741,25 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad( - CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - Alignment); - SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, - MVT::getVectorVT(CVT, Repeat), Ld); - return DAG.getBitcast(VT, Brdcst); - } else if (SplatBitSize == 32 || SplatBitSize == 64) { - // Splatted value can fit in one FLOAT constant in constant pool. - // Load the constant and broadcast it. - // AVX have support for 32 and 64 bit broadcast for floats only. - // No 64bit integer in 32bit subtarget. - MVT CVT = MVT::getFloatingPointVT(SplatBitSize); - // Lower the splat via APFloat directly, to avoid any conversion. - Constant *C = - SplatBitSize == 32 - ? ConstantFP::get(*Ctx, - APFloat(APFloat::IEEEsingle(), SplatValue)) - : ConstantFP::get(*Ctx, - APFloat(APFloat::IEEEdouble(), SplatValue)); - SDValue CP = DAG.getConstantPool(C, PVT); - unsigned Repeat = VT.getSizeInBits() / SplatBitSize; - - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad( - CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - Alignment); - SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, - MVT::getVectorVT(CVT, Repeat), Ld); + Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign(); + SDVTList Tys = + DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), CP}; + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + SDValue Brdcst = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment, + MachineMemOperand::MOLoad); return DAG.getBitcast(VT, Brdcst); - } else if (SplatBitSize > 64) { + } + if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. 
MVT CVT = VT.getScalarType(); Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); - unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment(); + Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign(); Ld = DAG.getLoad( MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -8560,10 +8784,12 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); + bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. - if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) + // FIXME: Is the use count needed for non-constant, non-load case? + if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); @@ -8603,18 +8829,17 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad( - CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - Alignment); + Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign(); - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), CP}; + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, + MPI, Alignment, MachineMemOperand::MOLoad); } } - bool IsLoad = ISD::isNormalLoad(Ld.getNode()); - // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) @@ -8624,15 +8849,34 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, if (!IsLoad) return SDValue(); + // Make sure the non-chain result is only used by this build vector. 
+ if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0)) + return SDValue(); + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || - (Subtarget.hasVLX() && ScalarSize == 64)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + (Subtarget.hasVLX() && ScalarSize == 64)) { + auto *LN = cast<LoadSDNode>(Ld); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + SDValue BCast = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); + return BCast; + } // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm - if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { - if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + if (Subtarget.hasInt256() && Ld.getValueType().isInteger() && + (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) { + auto *LN = cast<LoadSDNode>(Ld); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + SDValue BCast = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); + return BCast; } // Unsupported broadcast. @@ -8746,20 +8990,6 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } -static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { - assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && - Op.getScalarValueSizeInBits() == 1 && - "Can not convert non-constant vector"); - uint64_t Immediate = 0; - for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { - SDValue In = Op.getOperand(idx); - if (!In.isUndef()) - Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; - } - SDLoc dl(Op); - MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); - return DAG.getConstant(Immediate, dl, VT); -} // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8782,11 +9012,11 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; - if (!isa<ConstantSDNode>(In)) - NonConstIdx.push_back(idx); - else { - Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; + if (auto *InC = dyn_cast<ConstantSDNode>(In)) { + Immediate |= (InC->getZExtValue() & 0x1) << idx; HasConstElts = true; + } else { + NonConstIdx.push_back(idx); } if (SplatIdx < 0) SplatIdx = idx; @@ -8805,9 +9035,24 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, if (Cond.getOpcode() != ISD::SETCC) Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, DAG.getConstant(1, dl, MVT::i8)); - return DAG.getSelect(dl, VT, Cond, - DAG.getConstant(1, dl, VT), - DAG.getConstant(0, dl, VT)); + + // Perform the select in the scalar domain so we can use cmov. 
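Two scalar-domain tricks meet in this hunk. First, the rewritten constant scan above folds every constant i1 element into one integer immediate, one bit per lane; a standalone model (plain ints stand in for the SDValue operands, -1 meaning undef or non-constant):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Pack constant i1 lanes into an immediate, skipping undef/non-constant
    // lanes exactly as the loop above does.
    static uint64_t packMaskImmediate(const std::vector<int> &Elts) {
      uint64_t Immediate = 0;
      for (std::size_t idx = 0; idx != Elts.size(); ++idx)
        if (Elts[idx] >= 0)
          Immediate |= (uint64_t(Elts[idx]) & 0x1) << idx;
      return Immediate;
    }
    // packMaskImmediate({1, 0, 1, 1}) == 0b1101 for a v4i1 constant mask.

Second, the splat path that continues below materializes select(Cond, all-ones, 0) in an integer register precisely so that it becomes a cmov rather than a masked vector select.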
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { + SDValue Select = DAG.getSelect(dl, MVT::i32, Cond, + DAG.getAllOnesConstant(dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + Select = DAG.getBitcast(MVT::v32i1, Select); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select); + } else { + MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); + SDValue Select = DAG.getSelect(dl, ImmVT, Cond, + DAG.getAllOnesConstant(dl, ImmVT), + DAG.getConstant(0, dl, ImmVT)); + MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; + Select = DAG.getBitcast(VecVT, Select); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select, + DAG.getIntPtrConstant(0, dl)); + } } // insert elements one by one @@ -8907,8 +9152,8 @@ static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, if (!CanFold) break; - unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); - unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); + unsigned I0 = Op0.getConstantOperandVal(1); + unsigned I1 = Op1.getConstantOperandVal(1); if (i * 2 < NumElts) { if (V0.isUndef()) { @@ -9056,11 +9301,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa<ConstantSDNode>(Op0.getOperand(1)) || - !isa<ConstantSDNode>(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return false; - unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + unsigned I0 = Op0.getConstantOperandVal(1); if (I0 != i) return false; @@ -9445,6 +9689,9 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, return SDValue(); } +static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG); + /// If a BUILD_VECTOR's source elements all apply the same bit operation and /// one of their operands is constant, lower to a pair of BUILD_VECTOR and /// just apply the bit to the vectors. @@ -9452,6 +9699,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); @@ -9515,7 +9763,14 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); - return DAG.getNode(Opcode, DL, VT, LHS, RHS); + SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS); + + if (!IsShift) + return Res; + + // Immediately lower the shift to ensure the constant build vector doesn't + // get converted to a constant pool before the shift is lowered. + return LowerShift(Res, Subtarget, DAG); } /// Create a vector constant without a load. 
SSE/AVX provide the bare minimum @@ -9571,9 +9826,11 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); - return extractSubVector( - createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0, - DAG, DL, SizeInBits); + SDValue NewSrcVec = + createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); + if (NewSrcVec) + return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits); + return SDValue(); } else if (SrcVec.getValueSizeInBits() < SizeInBits) { // Widen smaller SrcVec to match VT. SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); @@ -9869,7 +10126,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) return Broadcast; - if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) + if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) return BitOp; unsigned EVTBits = EltVT.getSizeInBits(); @@ -9929,7 +10186,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; - InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout())); + InsIndex = DAG.getVectorIdxConstant(i, dl); } } Constant *CV = ConstantVector::get(ConstVecOps); @@ -10929,6 +11186,71 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, return SDValue(); } +/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) +/// followed by unpack 256-bit. +static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + SmallVector<int, 32> Unpckl, Unpckh; + createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true); + createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false); + + unsigned UnpackOpcode; + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + UnpackOpcode = X86ISD::UNPCKL; + else if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + UnpackOpcode = X86ISD::UNPCKH; + else + return SDValue(); + + // This is a "natural" unpack operation (rather than the 128-bit sectored + // operation implemented by AVX). We need to rearrange 64-bit chunks of the + // input in order to use the x86 instruction. + V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1), + DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3}); + V1 = DAG.getBitcast(VT, V1); + return DAG.getNode(UnpackOpcode, DL, VT, V1, V1); +} + +// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the +// source into the lower elements and zeroing the upper elements. +// TODO: Merge with matchShuffleAsVPMOV. 
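The matcher that follows walks candidate scales 2, 4, ..., 64/EltSize and, for each, asks two questions: do the low NumElts/Scale lanes read elements 0, Scale, 2*Scale, ..., and are the remaining lanes zeroable? A standalone sketch of that per-scale test (Zeroable modeled as a plain bit vector; names illustrative):

    #include <vector>

    // True if Mask compacts every Scale'th element into the low lanes and
    // the rest of the vector may be zeroed; undef lanes (-1) are tolerated
    // in the low part, mirroring isSequentialOrUndefInRange.
    static bool isTruncCompaction(const std::vector<int> &Mask,
                                  const std::vector<bool> &Zeroable,
                                  unsigned Scale) {
      unsigned NumElts = Mask.size();
      unsigned NumSrcElts = NumElts / Scale;
      for (unsigned i = 0; i != NumSrcElts; ++i)
        if (Mask[i] >= 0 && Mask[i] != (int)(i * Scale))
          return false;
      for (unsigned i = NumSrcElts; i != NumElts; ++i)
        if (!Zeroable[i])
          return false;
      return true;
    }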
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, + ArrayRef<int> Mask, const APInt &Zeroable, + const X86Subtarget &Subtarget) { + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + return false; + + unsigned NumElts = Mask.size(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + unsigned MaxScale = 64 / EltSizeInBits; + + for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { + unsigned SrcEltBits = EltSizeInBits * Scale; + if (SrcEltBits < 32 && !Subtarget.hasBWI()) + continue; + unsigned NumSrcElts = NumElts / Scale; + if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale)) + continue; + unsigned UpperElts = NumElts - NumSrcElts; + if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue()) + continue; + SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale); + SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts); + DstVT = MVT::getIntegerVT(EltSizeInBits); + if ((NumSrcElts * EltSizeInBits) >= 128) { + // ISD::TRUNCATE + DstVT = MVT::getVectorVT(DstVT, NumSrcElts); + } else { + // X86ISD::VTRUNC + DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits); + } + return true; + } + + return false; +} + static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps, int Delta) { int Size = (int)Mask.size(); @@ -11022,22 +11344,93 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src); } +/// Check whether a compaction lowering can be done by dropping even +/// elements and compute how many times even elements must be dropped. +/// +/// This handles shuffles which take every Nth element where N is a power of +/// two. Example shuffle masks: +/// +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 +/// +/// Any of these lanes can of course be undef. +/// +/// This routine only supports N <= 3. +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here +/// for larger N. +/// +/// \returns N above, or the number of times even elements must be dropped if +/// there is such a number. Otherwise returns zero. +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, + bool IsSingleInput) { + // The modulus for the shuffle vector entries is based on whether this is + // a single input or not. + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); + assert(isPowerOf2_32((uint32_t)ShuffleModulus) && + "We should only be called with masks with a power-of-2 size!"); + + uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, + // and 2^3 simultaneously. This is because we may have ambiguity with + // partially undef inputs. + bool ViableForN[3] = {true, true, true}; + + for (int i = 0, e = Mask.size(); i < e; ++i) { + // Ignore undef lanes, we'll optimistically collapse them to the pattern we + // want. + if (Mask[i] < 0) + continue; + + bool IsAnyViable = false; + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) { + uint64_t N = j + 1; + + // The shuffle mask must be equal to (i * 2^N) % M. 
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + IsAnyViable = true; + else + ViableForN[j] = false; + } + // Early exit if we exhaust the possible powers of two. + if (!IsAnyViable) + break; + } + + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) + return j + 1; + + // Return 0 as there is no viable power of two. + return 0; +} + // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. +// Checks for compaction shuffle masks if MaxStages > 1. +// TODO: Add support for matching multiple PACKSS/PACKUS stages. static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef<int> TargetMask, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + const X86Subtarget &Subtarget, + unsigned MaxStages = 1) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); - MVT PackSVT = MVT::getIntegerVT(BitSize * 2); - MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); + assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 && + "Illegal maximum compaction"); - auto MatchPACK = [&](SDValue N1, SDValue N2) { + auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) { + unsigned NumSrcBits = PackVT.getScalarSizeInBits(); + unsigned NumPackedBits = NumSrcBits - BitSize; SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); - if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { - APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); + if (Subtarget.hasSSE41() || BitSize == 8) { + APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; @@ -11047,8 +11440,8 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, return true; } } - if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && - (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { + if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) && + (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; @@ -11058,19 +11451,25 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, return false; }; - // Try binary shuffle. - SmallVector<int, 32> BinaryMask; - createPackShuffleMask(VT, BinaryMask, false); - if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) - if (MatchPACK(V1, V2)) - return true; + // Attempt to match against wider and wider compaction patterns. + for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) { + MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages); + MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages); - // Try unary shuffle. - SmallVector<int, 32> UnaryMask; - createPackShuffleMask(VT, UnaryMask, true); - if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) - if (MatchPACK(V1, V1)) - return true; + // Try binary shuffle. + SmallVector<int, 32> BinaryMask; + createPackShuffleMask(VT, BinaryMask, false, NumStages); + if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) + if (MatchPACK(V1, V2, PackVT)) + return true; + + // Try unary shuffle. 
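The `(i << N) & ModMask` test above is easier to read with numbers: for a single-input v16i8 mask, M = 16, so N = 1 demands lane i read element (2*i) mod 16, which is exactly the first N = 1 example in the doc comment. A single-input reduction of the whole routine (name is illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Smallest N in [1,3] such that every defined lane i reads element
    // (i * 2^N) mod Mask.size(); 0 if none. Mask.size() is a power of two.
    static int droppingEvenElementsFactor(const std::vector<int> &Mask) {
      uint64_t ModMask = (uint64_t)Mask.size() - 1;
      for (int N = 1; N <= 3; ++N) {
        bool Viable = true;
        for (std::size_t i = 0; i != Mask.size() && Viable; ++i)
          if (Mask[i] >= 0 &&
              (uint64_t)Mask[i] != (((uint64_t)i << N) & ModMask))
            Viable = false;
        if (Viable)
          return N;
      }
      return 0;
    }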
+ SmallVector<int, 32> UnaryMask; + createPackShuffleMask(VT, UnaryMask, true, NumStages); + if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) + if (MatchPACK(V1, V1, PackVT)) + return true; + } return false; } @@ -11080,12 +11479,44 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; - if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, - Subtarget)) - return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), - DAG.getBitcast(PackVT, V2)); + unsigned SizeBits = VT.getSizeInBits(); + unsigned EltBits = VT.getScalarSizeInBits(); + unsigned MaxStages = Log2_32(64 / EltBits); + if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, + Subtarget, MaxStages)) + return SDValue(); - return SDValue(); + unsigned CurrentEltBits = PackVT.getScalarSizeInBits(); + unsigned NumStages = Log2_32(CurrentEltBits / EltBits); + + // Don't lower multi-stage packs on AVX512, truncation is better. + if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX()) + return SDValue(); + + // Pack to the largest type possible: + // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. + unsigned MaxPackBits = 16; + if (CurrentEltBits > 16 && + (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41())) + MaxPackBits = 32; + + // Repeatedly pack down to the target size. + SDValue Res; + for (unsigned i = 0; i != NumStages; ++i) { + unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits); + unsigned NumSrcElts = SizeBits / SrcEltBits; + MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); + MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2); + MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); + MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2); + Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1), + DAG.getBitcast(SrcVT, V2)); + V1 = V2 = Res; + CurrentEltBits /= 2; + } + assert(Res && Res.getValueType() == VT && + "Failed to lower compaction shuffle"); + return Res; } /// Try to emit a bitmask instruction for a shuffle. @@ -11109,8 +11540,9 @@ static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, MVT LogicVT = VT; if (EltVT == MVT::f32 || EltVT == MVT::f64) { Zero = DAG.getConstantFP(0.0, DL, EltVT); - AllOnes = DAG.getConstantFP( - APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT); + APFloat AllOnesValue = APFloat::getAllOnesValue( + SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits()); + AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT); LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); } else { @@ -11312,6 +11744,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } + // If we have VPTERNLOG, we can use that as a bit blend. + if (Subtarget.hasVLX()) + if (SDValue BitBlend = + lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + return BitBlend; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -11622,10 +12060,101 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend( return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } -/// Try to lower a vector shuffle as a rotation. +/// Try to lower a vector shuffle as a bit rotation. +/// +/// Look for a repeated rotation pattern in each sub group. +/// Returns a ISD::ROTL element rotation amount or -1 if failed. 
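For the rotation matcher that follows, the key line is Offset = (NumSubElts - (M - (i + j))) % NumSubElts, which converts a mask displacement into an ISD::ROTL amount measured in sub-elements. Worked through for the single v4i8-in-i32 group mask {1, 2, 3, 0}: every lane yields Offset 3, so the group is a 3-byte left rotate, i.e. ROTL by 24 bits. A small standalone check of that arithmetic (i is 0 throughout since there is one group):

    #include <cassert>

    int main() {
      const int Mask[4] = {1, 2, 3, 0}; // one 4-sub-element group
      const int NumSubElts = 4;
      int RotateAmt = -1;
      for (int j = 0; j != NumSubElts; ++j) {
        int Offset = (NumSubElts - (Mask[j] - j)) % NumSubElts;
        assert((RotateAmt < 0 || Offset == RotateAmt) && "inconsistent");
        RotateAmt = Offset;
      }
      assert(RotateAmt == 3); // 3 sub-elements * 8 bits = ROTL by 24
      return 0;
    }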
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) { + int NumElts = Mask.size(); + assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask"); + + int RotateAmt = -1; + for (int i = 0; i != NumElts; i += NumSubElts) { + for (int j = 0; j != NumSubElts; ++j) { + int M = Mask[i + j]; + if (M < 0) + continue; + if (!isInRange(M, i, i + NumSubElts)) + return -1; + int Offset = (NumSubElts - (M - (i + j))) % NumSubElts; + if (0 <= RotateAmt && Offset != RotateAmt) + return -1; + RotateAmt = Offset; + } + } + return RotateAmt; +} + +static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits, + const X86Subtarget &Subtarget, + ArrayRef<int> Mask) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers"); + + // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size. + int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2; + int MaxSubElts = 64 / EltSizeInBits; + for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) { + int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts); + if (RotateAmt < 0) + continue; + + int NumElts = Mask.size(); + MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); + RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); + return RotateAmt * EltSizeInBits; + } + + return -1; +} + +/// Lower shuffle using X86ISD::VROTLI rotations. +static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, + ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + // Only XOP + AVX512 targets have bit rotation instructions. + // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this. + bool IsLegal = + (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512(); + if (!IsLegal && Subtarget.hasSSE3()) + return SDValue(); + + MVT RotateVT; + int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(), + Subtarget, Mask); + if (RotateAmt < 0) + return SDValue(); + + // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL, + // expanded to OR(SRL,SHL), will be more efficient, but if they can + // widen to vXi16 or more then existing lowering should will be better. + if (!IsLegal) { + if ((RotateAmt % 16) == 0) + return SDValue(); + // TODO: Use getTargetVShiftByConstNode. + unsigned ShlAmt = RotateAmt; + unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt; + V1 = DAG.getBitcast(RotateVT, V1); + SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1, + DAG.getTargetConstant(ShlAmt, DL, MVT::i8)); + SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1, + DAG.getTargetConstant(SrlAmt, DL, MVT::i8)); + SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL); + return DAG.getBitcast(VT, Rot); + } + + SDValue Rot = + DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1), + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); + return DAG.getBitcast(VT, Rot); +} + +/// Try to match a vector shuffle as an element rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. 
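The pre-SSSE3 fallback in the lowering above expands the rotate with the standard identity rotl(x, r) = (x << r) | (x >> (w - r)), applied lane-wise by the VSHLI/VSRLI/OR nodes. Scalar form for one 16-bit lane (well-defined because the multiple-of-16 amounts were already filtered out, so 0 < r < 16):

    #include <cstdint>

    static uint16_t rotl16(uint16_t V, unsigned RotateAmt) {
      // RotateAmt is in (0, 16), so neither shift is by zero or the width.
      return (uint16_t)((V << RotateAmt) | (V >> (16 - RotateAmt)));
    }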
-static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { +static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, + ArrayRef<int> Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: @@ -11712,7 +12241,7 @@ static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { // Don't accept any shuffles with zero elements. - if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) + if (isAnyZero(Mask)) return -1; // PALIGNR works on 128-bit lanes. @@ -11720,7 +12249,7 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; - int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); + int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; @@ -11788,7 +12317,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, +static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11800,7 +12329,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; - int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); + int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); @@ -12566,13 +13095,13 @@ static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); - EVT EltVT = VT.getVectorElementType(); - EVT V0VT = V0.getValueType(); + MVT EltVT = VT.getVectorElementType(); + MVT V0VT = V0.getSimpleValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); - EVT V0EltVT = V0VT.getVectorElementType(); + MVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); @@ -12636,7 +13165,7 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef<int> Mask, SelectionDAG &DAG) { - EVT VT = N0.getValueType(); + MVT VT = N0.getSimpleValueType(); assert((VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"); @@ -12649,9 +13178,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, return SDValue(); SDValue WideVec = N0.getOperand(0); - EVT WideVT = WideVec.getValueType(); - if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) || - !isa<ConstantSDNode>(N1.getOperand(1))) + MVT WideVT = WideVec.getSimpleValueType(); + if (!WideVT.is256BitVector()) return SDValue(); // Match extracts of each half of the wide source vector. 
Commute the shuffle @@ -12699,7 +13227,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. - unsigned NumElts = Mask.size(); unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP @@ -12707,15 +13234,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. - int BroadcastIdx = -1; - for (int i = 0; i != (int)NumElts; ++i) { - SmallVector<int, 8> BroadcastMask(NumElts, i); - if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { - BroadcastIdx = i; - break; - } - } - + int BroadcastIdx = getSplatIndex(Mask); if (BroadcastIdx < 0) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " @@ -12724,6 +13243,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. + // TODO: Combine this logic with findEltLoadSrc() used by + // EltsFromConsecutiveLoads(). int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { @@ -12739,14 +13260,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, BitOffset %= OpBitWidth; continue; } + case ISD::EXTRACT_SUBVECTOR: { + // The extraction index adds to the existing offset. + unsigned EltBitWidth = V.getScalarValueSizeInBits(); + unsigned Idx = V.getConstantOperandVal(1); + unsigned BeginOffset = Idx * EltBitWidth; + BitOffset += BeginOffset; + V = V.getOperand(0); + continue; + } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); - auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); - if (!ConstantIdx) - break; - int EltBitWidth = VOuter.getScalarValueSizeInBits(); - int Idx = (int)ConstantIdx->getZExtValue(); + int Idx = (int)V.getConstantOperandVal(2); int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); int BeginOffset = Idx * EltBitWidth; int EndOffset = BeginOffset + NumSubElts * EltBitWidth; @@ -12777,8 +13303,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; - MVT BroadcastVT = VT; - // Also check the simpler case, where we can directly reuse the scalar. if (!BitCastSrc && ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || @@ -12788,23 +13312,34 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) { - // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { - BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); - Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2()) - ? 
X86ISD::MOVDDUP - : Opcode; - } + } else if (ISD::isNormalLoad(V.getNode()) && + cast<LoadSDNode>(V)->isSimple()) { + // We do not check for one-use of the vector load because a broadcast load + // is expected to be a win for code size, register pressure, and possibly + // uops even if the original vector load is not eliminated. - // If we are broadcasting a load that is only used by the shuffle - // then we can reduce the vector load to the broadcasted scalar load. + // Reduce the vector load and shuffle to a broadcasted scalar load. LoadSDNode *Ld = cast<LoadSDNode>(V); SDValue BaseAddr = Ld->getOperand(1); - EVT SVT = BroadcastVT.getScalarType(); + MVT SVT = VT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); + + // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather + // than MOVDDUP. + // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX? + if (Opcode == X86ISD::VBROADCAST) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {Ld->getChain(), NewAddr}; + V = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); + DAG.makeEquivalentMemoryOrdering(Ld, V); + return DAG.getBitcast(VT, V); + } + assert(SVT == MVT::f64 && "Unexpected VT!"); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); @@ -12839,38 +13374,26 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, DAG.getBitcast(MVT::f64, V)); - // Bitcast back to the same scalar type as BroadcastVT. - if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) { - assert(NumEltBits == BroadcastVT.getScalarSizeInBits() && - "Unexpected vector element size"); - MVT ExtVT; - if (V.getValueType().isVector()) { - unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; - ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); - } else { - ExtVT = BroadcastVT.getScalarType(); - } - V = DAG.getBitcast(ExtVT, V); - } - - // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) { - V = DAG.getBitcast(MVT::f64, V); - unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); - BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); + // If this is a scalar, do the broadcast on this type and bitcast. + if (!V.getValueType().isVector()) { + assert(V.getScalarValueSizeInBits() == NumEltBits && + "Unexpected scalar size"); + MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(), + VT.getVectorNumElements()); + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. 
- if (V.getValueSizeInBits() > 128) { - MVT ExtVT = V.getSimpleValueType().getScalarType(); - ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits()); + if (V.getValueSizeInBits() > 128) V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); - V = DAG.getBitcast(ExtVT, V); - } - return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); + // Otherwise cast V to a vector with the same element type as VT, but + // possibly narrower than VT. Then perform the broadcast. + unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts); + return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -13259,7 +13782,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -13293,8 +13816,7 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; - int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; - + SmallVector<int, 4> NewMask(Mask.begin(), Mask.end()); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { @@ -13548,7 +14070,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -14186,6 +14708,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Mask, Subtarget, DAG)) return Broadcast; + // Try to use bit rotation instructions. + if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, + Subtarget, DAG)) + return Rotate; + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; @@ -14262,6 +14789,29 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return V; + // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW. + // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to + // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain. + int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false); + if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() && + !Subtarget.hasVLX()) { + SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32)); + for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1)) + DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32); + SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps); + V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1), + DWordClearMask); + V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2), + DWordClearMask); + // Now pack things back together. 
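The pack that follows is exact only because of the AND just above: PACKUSDW saturates each signed dword to the unsigned word range, and masking with 0x0000FFFF guarantees the input is already in that range, so the instruction degenerates to "keep the low word of every dword", which is precisely the even-element compaction. Per-lane model:

    #include <cstdint>

    // PACKUSDW on one lane: signed dword -> unsigned-saturated word.
    static uint16_t packusdwLane(int32_t DWord) {
      if (DWord < 0)
        return 0;
      return DWord > 0xFFFF ? (uint16_t)0xFFFF : (uint16_t)DWord;
    }
    // packusdwLane(DWord & 0xFFFF) always equals the low word, so the
    // AND + PACKUS pair above drops the odd words losslessly.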
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2); + if (NumEvenDrops == 2) { + Result = DAG.getBitcast(MVT::v4i32, Result); + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result); + } + return Result; + } + // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) @@ -14281,72 +14831,6 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Mask, Subtarget, DAG); } -/// Check whether a compaction lowering can be done by dropping even -/// elements and compute how many times even elements must be dropped. -/// -/// This handles shuffles which take every Nth element where N is a power of -/// two. Example shuffle masks: -/// -/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 -/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 -/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 -/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 -/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 -/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 -/// -/// Any of these lanes can of course be undef. -/// -/// This routine only supports N <= 3. -/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here -/// for larger N. -/// -/// \returns N above, or the number of times even elements must be dropped if -/// there is such a number. Otherwise returns zero. -static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, - bool IsSingleInput) { - // The modulus for the shuffle vector entries is based on whether this is - // a single input or not. - int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); - assert(isPowerOf2_32((uint32_t)ShuffleModulus) && - "We should only be called with masks with a power-of-2 size!"); - - uint64_t ModMask = (uint64_t)ShuffleModulus - 1; - - // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, - // and 2^3 simultaneously. This is because we may have ambiguity with - // partially undef inputs. - bool ViableForN[3] = {true, true, true}; - - for (int i = 0, e = Mask.size(); i < e; ++i) { - // Ignore undef lanes, we'll optimistically collapse them to the pattern we - // want. - if (Mask[i] < 0) - continue; - - bool IsAnyViable = false; - for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) - if (ViableForN[j]) { - uint64_t N = j + 1; - - // The shuffle mask must be equal to (i * 2^N) % M. - if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) - IsAnyViable = true; - else - ViableForN[j] = false; - } - // Early exit if we exhaust the possible powers of two. - if (!IsAnyViable) - break; - } - - for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) - if (ViableForN[j]) - return j + 1; - - // Return 0 as there is no viable power of two. - return 0; -} - static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { @@ -14410,6 +14894,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Mask, Subtarget, DAG)) return Broadcast; + // Try to use bit rotation instructions. 
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, + Subtarget, DAG)) + return Rotate; + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; @@ -14524,6 +15013,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return V; + // Check for compaction patterns. + bool IsSingleInput = V2.isUndef(); + int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput); + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input @@ -14534,10 +15027,13 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // + // If the mask is a binary compaction, we can more efficiently perform this + // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()). + // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. - if (Subtarget.hasSSSE3()) { + if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) { bool V1InUse = false; bool V2InUse = false; @@ -14595,8 +15091,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. - bool IsSingleInput = V2.isUndef(); - if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { + if (NumEvenDrops) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. @@ -14604,23 +15099,23 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); - SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8)); - for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops) - ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8); - SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps); - V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); + SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16)); + for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1)) + WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16); + SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps); + V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1), + WordClearMask); if (!IsSingleInput) - V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); + V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2), + WordClearMask); // Now pack things back together. - V1 = DAG.getBitcast(MVT::v8i16, V1); - V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); - SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, + IsSingleInput ? 
V1 : V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } - return Result; } @@ -14725,37 +15220,13 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); - MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); + MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements); - // Rather than splitting build-vectors, just build two narrower build - // vectors. This helps shuffling with splats and zeros. + // Use splitVector/extractSubVector so that split build-vectors just build two + // narrower build vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { - V = peekThroughBitcasts(V); - - MVT OrigVT = V.getSimpleValueType(); - int OrigNumElements = OrigVT.getVectorNumElements(); - int OrigSplitNumElements = OrigNumElements / 2; - MVT OrigScalarVT = OrigVT.getVectorElementType(); - MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); - SDValue LoV, HiV; - - auto *BV = dyn_cast<BuildVectorSDNode>(V); - if (!BV) { - LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, - DAG.getIntPtrConstant(0, DL)); - HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, - DAG.getIntPtrConstant(OrigSplitNumElements, DL)); - } else { - - SmallVector<SDValue, 16> LoOps, HiOps; - for (int i = 0; i < OrigSplitNumElements; ++i) { - LoOps.push_back(BV->getOperand(i)); - HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); - } - LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); - HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); - } + std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL); return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; @@ -15963,7 +16434,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, SmallVector<int, 2> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector<int, 4> PSHUFDMask; - scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask); + narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, @@ -15984,7 +16455,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -16085,13 +16556,14 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (V2.isUndef()) { - SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); - if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) { + SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); - - if (Subtarget.hasAVX2()) + } + if (Subtarget.hasAVX2()) { + SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); - + } // Otherwise, fall back. 
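The PACKUS(AND(),AND()) compaction used above can be sanity-checked with a scalar model. This is a sketch only, assuming x86's little-endian layout; it models the DAG nodes rather than being them:

```cpp
// Model of the WordClearMask + PACKUSWB sequence: AND each i16 lane with
// 0x00FF so every value is already in u8 range, then saturating-pack i16
// to u8. The surviving bytes are exactly elements 0,2,4,... of the input.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint8_t In[16];
  for (int i = 0; i < 16; ++i)
    In[i] = (uint8_t)(i * 16 + 1);

  uint16_t Words[8];
  std::memcpy(Words, In, 16);        // reinterpret v16i8 as v8i16
  for (int i = 0; i < 8; ++i)
    Words[i] &= 0x00FF;              // the WordClearMask AND

  uint8_t Out[8];                    // low half of PACKUSWB(Words, Words)
  for (int i = 0; i < 8; ++i)
    Out[i] = Words[i] > 255 ? 255 : (uint8_t)Words[i]; // saturation is a no-op here
  for (int i = 0; i < 8; ++i)
    std::printf("%u ", Out[i]);      // 1 33 65 97 129 161 193 225
}
```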
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG, Subtarget); @@ -16190,7 +16662,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -16210,9 +16682,14 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; - // If the shuffle patterns aren't repeated but it is a single input, directly - // generate a cross-lane VPERMD instruction. if (V2.isUndef()) { + // Try to produce a fixed cross-128-bit lane permute followed by unpack + // because that should be faster than the variable permute alternatives. + if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; + + // If the shuffle patterns aren't repeated but it's a single input, directly + // generate a cross-lane VPERMD instruction. SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); } @@ -16294,6 +16771,16 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; if (V2.isUndef()) { + // Try to use bit rotation instructions. + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG)) + return Rotate; + + // Try to produce a fixed cross-128-bit lane permute followed by unpack + // because that should be faster than the variable permute alternatives. + if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; + // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { @@ -16379,7 +16866,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -16387,6 +16874,12 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget, DAG)) return Rotate; + // Try to use bit rotation instructions. + if (V2.isUndef()) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG)) + return Rotate; + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -16396,6 +16889,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { + // Try to produce a fixed cross-128-bit lane permute followed by unpack + // because that should be faster than the variable permute alternatives. 
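lowerShuffleAsBitRotate, tried in several of these hunks, matches byte shuffles that are really per-element bit rotates. A minimal sketch of the equivalence (little-endian assumed, plain scalars instead of vectors):

```cpp
// A byte shuffle mask {1,0,3,2} swaps the bytes of each 16-bit element,
// which is the same as rotating each i16 lane by 8 bits.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t rotl16(uint16_t V, unsigned R) {
  return (uint16_t)((V << R) | (V >> (16 - R)));
}

int main() {
  uint8_t Bytes[4] = {0x11, 0x22, 0x33, 0x44};
  uint8_t Shuf[4] = {Bytes[1], Bytes[0], Bytes[3], Bytes[2]}; // mask {1,0,3,2}

  uint16_t Lanes[2];
  std::memcpy(Lanes, Bytes, 4);
  for (auto &L : Lanes)
    L = rotl16(L, 8);                // the bit-rotate form
  uint8_t ViaRot[4];
  std::memcpy(ViaRot, Lanes, 4);

  std::printf("%d\n", std::memcmp(Shuf, ViaRot, 4) == 0); // prints 1
}
```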
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V;
@@ -16518,13 +17016,14 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); // TODO - use Zeroable like we do for lowerV2X128VectorShuffle? - SmallVector<int, 4> WidenedMask; - if (!canWidenShuffleElements(Mask, WidenedMask)) + SmallVector<int, 4> Widened128Mask; + if (!canWidenShuffleElements(Mask, Widened128Mask)) return SDValue(); + assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch"); // Try to use an insert into a zero vector. - if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && - (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { + if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && + (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4; MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
@@ -16536,37 +17035,34 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, // Check for patterns which can be matched with a single insert of a 256-bit // subvector. - bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, - {0, 1, 2, 3, 0, 1, 2, 3}); - if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, - {0, 1, 2, 3, 8, 9, 10, 11})) { + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3}); + if (OnlyUsesV1 || + isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); - SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - OnlyUsesV1 ? V1 : V2, - DAG.getIntPtrConstant(0, DL)); + SDValue SubVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(4, DL)); } - assert(WidenedMask.size() == 4); - // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 4; ++i) { - assert(WidenedMask[i] >= -1); - if (WidenedMask[i] < 0) + assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"); + if (Widened128Mask[i] < 0) continue; // Make sure all V1 subvectors are in place. - if (WidenedMask[i] < 4) { - if (WidenedMask[i] != i) { + if (Widened128Mask[i] < 4) { + if (Widened128Mask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and it's the lowest 128-bits. - if (V2Index >= 0 || WidenedMask[i] != 4) { + if (V2Index >= 0 || Widened128Mask[i] != 4) { IsInsert = false; break; }
@@ -16580,16 +17076,26 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } + // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane + // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where + // possible we at least ensure the lanes stay sequential to help later + // combines. + SmallVector<int, 2> Widened256Mask; + if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) { + Widened128Mask.clear(); + narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask); + } + + // Try to lower to vshuf64x2/vshuf32x4.
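For reference, the immediate built in the loop below follows the VSHUF64x2 encoding: two selector bits per 128-bit result lane. A standalone sketch with a made-up lane mask:

```cpp
// Lane mask {0, 2, 5, 7}: result lanes 0/1 come from V1 (its lanes 0 and 2),
// lanes 2/3 from V2 (its lanes 1 and 3, i.e. mask values 5 and 7 mod 4).
#include <cstdio>

int main() {
  int Widened128Mask[4] = {0, 2, 5, 7}; // values >= 4 select from V2
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i)
    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
  std::printf("imm8 = 0x%X\n", PermMask); // imm8 = 0xD8
}
```

Note this sketch skips the operand-pairing legality check done in the real loop (each pair of result lanes must read from a single source operand).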
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; unsigned PermMask = 0; // Ensure elements came from the same Op. for (int i = 0; i < 4; ++i) { - assert(WidenedMask[i] >= -1); - if (WidenedMask[i] < 0) + assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"); + if (Widened128Mask[i] < 0) continue; - SDValue Op = WidenedMask[i] >= 4 ? V2 : V1; + SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op;
@@ -16598,7 +17104,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, // Convert the 128-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. - PermMask |= (WidenedMask[i] % 4) << (i * 2); + PermMask |= (Widened128Mask[i] % 4) << (i * 2); } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
@@ -16696,6 +17202,12 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + // Try to create an in-lane repeating shuffle mask and then shuffle the + // results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. if (V2.isUndef() &&
@@ -16728,7 +17240,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, SmallVector<int, 2> Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector<int, 4> PSHUFDMask; - scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask); + narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
@@ -16752,7 +17264,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate;
@@ -16814,7 +17326,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate;
@@ -16833,6 +17345,13 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } + + // Try to create an in-lane repeating shuffle mask and then shuffle the + // results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) @@ -16841,6 +17360,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; + return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } @@ -16865,6 +17385,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = + lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -16876,18 +17401,23 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Rotate; if (V2.isUndef()) { + // Try to use bit rotation instructions. + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG)) + return Rotate; + SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. - return lowerV8I16GeneralSingleInputShuffle( - DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1, + RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, @@ -16933,6 +17463,17 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget, DAG)) return Rotate; + // Try to use bit rotation instructions. + if (V2.isUndef()) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG)) + return Rotate; + + // Lower as AND if possible. + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Masked; + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; @@ -16995,6 +17536,18 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget, DAG)) return Broadcast; + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) { + // Try using bit ops for masking and blending before falling back to + // splitting. + if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + return V; + + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); + } + // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that @@ -17477,6 +18030,10 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); + // Expand v32i16/v64i8 without BWI. 
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return SDValue(); + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions.
@@ -17532,14 +18089,24 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); + SDValue Vec = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + assert(isa<ConstantSDNode>(Idx) && "Constant index expected"); SDLoc dl(Op); - if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) + if (!Vec.getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { - SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless + // we're going to zero extend the register or fold the store. + if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) && + !MayFoldIntoStore(Op)) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), Idx)); + + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); }
@@ -17552,22 +18119,17 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); - if ((User->getOpcode() != ISD::STORE || - isNullConstant(Op.getOperand(1))) && + if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), - Op.getOperand(1)); + DAG.getBitcast(MVT::v4i32, Vec), Idx); return DAG.getBitcast(MVT::f32, Extract); } - if (VT == MVT::i32 || VT == MVT::i64) { - // ExtractPS/pextrq works with constant index. - if (isa<ConstantSDNode>(Op.getOperand(1))) + if (VT == MVT::i32 || VT == MVT::i64) return Op; - } return SDValue(); }
@@ -17580,6 +18142,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); + auto* IdxC = dyn_cast<ConstantSDNode>(Idx); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && // variable index can't be handled in mask registers, // extend vector to VR512/128 - if (!isa<ConstantSDNode>(Idx)) { + if (!IdxC) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512 bits gets better performance on KNL // than extending to 128/256-bit.
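The variable-index mask-extract path here (widen the i1 vector, extract an element, truncate) can be modeled in scalar code. A sketch, with an arbitrary mask value:

```cpp
// Each mask bit becomes a full (sign-extended) element; the wanted bit is
// then an ordinary indexed element access followed by truncation to i1.
#include <cstdio>

int main() {
  unsigned short Mask = 0xA5A5; // stands in for a v16i1 value
  int Idx = 10;                 // not a compile-time constant in general
  int Wide[16];                 // the widened v16i32 form
  for (int i = 0; i < 16; ++i)
    Wide[i] = ((Mask >> i) & 1) ? -1 : 0;
  std::printf("bit %d = %d\n", Idx, Wide[Idx] & 1); // bit 10 = 1
}
```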
@@ -17598,7 +18161,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned IdxVal = IdxC->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op;
@@ -17627,11 +18190,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); + auto* IdxC = dyn_cast<ConstantSDNode>(Idx); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); - if (!isa<ConstantSDNode>(Idx)) { + if (!IdxC) { // It's more profitable to go through memory (1 cycle throughput) // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput) // IACA tool was used to get performance estimation
@@ -17665,7 +18229,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned IdxVal = IdxC->getZExtValue(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector.
@@ -17697,9 +18261,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); - // Transform it so it match pextrw which produces a 32-bit result. - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); }
@@ -17789,9 +18351,7 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, // Copy into a k-register, extract to v1i1 and insert_subvector. SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt); - - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, - Op.getOperand(2)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -17864,11 +18424,22 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); // This will be just movd/movq/movss/movsd. - if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) && - (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || - EltVT == MVT::i64)) { - N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); - return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) { + if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || + EltVT == MVT::i64) { + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + } + + // We can't directly insert an i8 or i16 into a vector, so zero extend + // it to i32 first.
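A scalar sketch of the zero-extend special case added just below (little-endian assumed; this models the DAG nodes, it is not them):

```cpp
// Inserting an i8 at index 0 of an all-zeros v16i8: zero-extend to i32,
// then a 32-bit move into lane 0 of a zeroed register does the insert.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint8_t Elt = 0xAB;
  uint32_t Lane0 = Elt;        // ISD::ZERO_EXTEND i8 -> i32
  uint8_t Vec[16] = {};        // the all-zeros build vector
  std::memcpy(Vec, &Lane0, 4); // SCALAR_TO_VECTOR + zeroing shuffle
  std::printf("%u %u %u\n", Vec[0], Vec[1], Vec[3]); // 171 0 0
}
```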
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) { + N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1); + MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1); + N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + return DAG.getBitcast(VT, N1); + } } // Transform it so it match pinsr{b,w} which expects a GR32 as its second @@ -17981,12 +18552,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); SDValue Vec = Op.getOperand(0); - SDValue Idx = Op.getOperand(1); - - if (!isa<ConstantSDNode>(Idx)) - return SDValue(); + uint64_t IdxVal = Op.getConstantOperandVal(1); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; @@ -18045,7 +18612,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); + CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. @@ -18554,25 +19121,47 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, Op0, Op1, Amt); } - - assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && - "Unexpected funnel shift type!"); + assert( + (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && + "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. bool OptForSize = DAG.shouldOptForSize(); - if (!OptForSize && Subtarget.isSHLDSlow()) - return SDValue(); + bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow(); - if (IsFSHR) - std::swap(Op0, Op1); + // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. + // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). + if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) && + !isa<ConstantSDNode>(Amt)) { + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType()); + SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType()); + Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32); + Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32); + Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask); + SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1); + if (IsFSHR) { + Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt); + } else { + Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt); + Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift); + } + return DAG.getZExtOrTrunc(Res, DL, VT); + } + + if (VT == MVT::i8 || ExpandFunnel) + return SDValue(); // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. - if (VT == MVT::i16) + if (VT == MVT::i16) { Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, DAG.getConstant(15, DL, Amt.getValueType())); + unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL); + return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt); + } - unsigned SHDOp = (IsFSHR ? 
X86ISD::SHRD : X86ISD::SHLD); - return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt); + return Op; } // Try to use a packed vector operation to handle i64 on 32-bit targets when @@ -18682,6 +19271,56 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, DAG.getIntPtrConstant(0, DL)); } +/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), +/// try to vectorize the cast ops. This will avoid an expensive round-trip +/// between XMM and GPR. +static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // TODO: Allow FP_TO_UINT. + SDValue CastToInt = CastToFP.getOperand(0); + MVT VT = CastToFP.getSimpleValueType(); + if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector()) + return SDValue(); + + MVT IntVT = CastToInt.getSimpleValueType(); + SDValue X = CastToInt.getOperand(0); + MVT SrcVT = X.getSimpleValueType(); + if (SrcVT != MVT::f32 && SrcVT != MVT::f64) + return SDValue(); + + // See if we have 128-bit vector cast instructions for this type of cast. + // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd. + if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) || + IntVT != MVT::i32) + return SDValue(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned IntSize = IntVT.getSizeInBits(); + unsigned VTSize = VT.getSizeInBits(); + MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize); + MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize); + MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize); + + // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64. + unsigned ToIntOpcode = + SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT; + unsigned ToFPOpcode = + IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP; + + // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0 + // + // We are not defining the high elements (for example, zero them) because + // that could nullify any performance advantage that we hoped to gain from + // this vector op hack. We do not expect any adverse effects (like denorm + // penalties) with cast ops. 
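The i8 funnel-shift expansion quoted in the hunk above, (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw, can be exhaustively checked against the generic fshl definition. A standalone sketch, illustrative only:

```cpp
#include <cstdint>
#include <cstdio>

static uint8_t fshl8Ref(uint8_t X, uint8_t Y, unsigned Z) {
  unsigned S = Z & 7; // i8 funnel shifts take the amount modulo 8
  return S ? (uint8_t)((X << S) | (Y >> (8 - S))) : X;
}

static uint8_t fshl8Expanded(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Res = ((uint32_t)X << 8) | Y; // (aext(x) << bw) | zext(y)
  Res <<= (Z & 7);                       // << (z & (bw-1))
  return (uint8_t)(Res >> 8);            // >> bw, then truncate
}

int main() {
  int Mismatches = 0;
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y)
      for (unsigned Z = 0; Z < 16; ++Z)
        Mismatches += fshl8Ref(X, Y, Z) != fshl8Expanded(X, Y, Z);
  std::printf("mismatches: %d\n", Mismatches); // mismatches: 0
}
```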
+ SDLoc DL(CastToFP); + SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL); + SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X); + SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX); + SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx); +} + static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op);
@@ -18739,15 +19378,15 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, SmallVector<SDValue, 4> SignCvts(4); SmallVector<SDValue, 4> Chains(4); for (int i = 0; i != 4; ++i) { - SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, DAG.getIntPtrConstant(i, DL)); if (IsStrict) { SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other}, - {Op.getOperand(0), Src}); + {Op.getOperand(0), Elt}); Chains[i] = SignCvts[i].getValue(1); } else { - SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src); + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt); } } SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
@@ -18784,6 +19423,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; + if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget)) + return R; + if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { // Note: Since v2f64 is a legal type, we don't need to zero extend the
@@ -18832,21 +19474,23 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); SDValue ValueToStore = Src; - if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit()) + if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); - unsigned Size = SrcVT.getSizeInBits()/8; + unsigned Size = SrcVT.getStoreSize(); + Align Alignment(Size); MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); - int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); + int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - Chain = DAG.getStore( - Chain, dl, ValueToStore, StackSlot, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); - std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); + Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment); + std::pair<SDValue, SDValue> Tmp = + BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); @@ -18854,58 +19498,40 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Tmp.first; } -std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, - SDValue StackSlot, - SelectionDAG &DAG) const { +std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD( + EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, + MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const { // Build the FILD - SDLoc DL(Op); SDVTList Tys; - bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); + bool useSSE = isScalarFPTypeInSSEReg(DstVT); if (useSSE) - Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); + Tys = DAG.getVTList(MVT::f80, MVT::Other); else - Tys = DAG.getVTList(Op.getValueType(), MVT::Other); + Tys = DAG.getVTList(DstVT, MVT::Other); - unsigned ByteSize = SrcVT.getSizeInBits() / 8; - - FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); - MachineMemOperand *LoadMMO; - if (FI) { - int SSFI = FI->getIndex(); - LoadMMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); - } else { - LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); - StackSlot = StackSlot.getOperand(1); - } - SDValue FILDOps[] = {Chain, StackSlot}; + SDValue FILDOps[] = {Chain, Pointer}; SDValue Result = - DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, - Tys, FILDOps, SrcVT, LoadMMO); + DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo, + Alignment, MachineMemOperand::MOLoad); Chain = Result.getValue(1); if (useSSE) { - SDValue InFlag = Result.getValue(2); - - // FIXME: Currently the FST is glued to the FILD_FLAG. This - // shouldn't be necessary except that RFP cannot be live across - // multiple blocks. When stackifier is fixed, they can be uncoupled. 
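The FILD-based path that these hunks feed through BuildFILD exists because only the x87 unit can convert a full i64 exactly on targets without 64-bit SSE conversions. A standalone illustration of the precision argument, assuming long double maps to the x87 80-bit format as it does on x86 Linux/BSD:

```cpp
// FILD converts a 64-bit integer with the x87's 64-bit mantissa, so it is
// exact for every i64; a direct f64 conversion has only 53 mantissa bits.
#include <cstdio>

int main() {
  long long I = (1LL << 62) + 1;   // not exactly representable in f64
  long double X = (long double)I;  // what FILD produces (f80)
  double D = (double)I;            // direct f64 conversion loses the +1
  std::printf("%d %d\n", (long long)X == I, (long long)D == I); // 1 0
}
```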
MachineFunction &MF = DAG.getMachineFunction(); - unsigned SSFISize = Op.getValueSizeInBits() / 8; - int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); + unsigned SSFISize = DstVT.getStoreSize(); + int SSFI = + MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); - SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag}; + SDValue FSTOps[] = {Chain, Result, StackSlot}; MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), - MachineMemOperand::MOStore, SSFISize, SSFISize); + MachineMemOperand::MOStore, SSFISize, Align(SSFISize)); - Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, - Op.getValueType(), StoreMMO); + Chain = + DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO); Result = DAG.getLoad( - Op.getValueType(), DL, Chain, StackSlot, + DstVT, DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); Chain = Result.getValue(1); } @@ -18948,7 +19574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); + SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16)); SmallVector<Constant*,2> CV1; CV1.push_back( @@ -18958,7 +19584,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); - SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); + SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16)); // Load the 64-bit value into an XMM register. SDValue XR1 = @@ -19163,13 +19789,13 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, *DAG.getContext(), APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL))); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8); + SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8)); SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDValue VBias = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /*Alignment*/ 8, MachineMemOperand::MOLoad); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8), + MachineMemOperand::MOLoad); SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, DAG.getBitcast(MVT::v4i64, VBias)); @@ -19337,15 +19963,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. 
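The CV0/CV1 constants in LowerUINT_TO_FP_i64 above implement the classic exponent-bias trick. A standalone sketch of the arithmetic (the helper name is invented):

```cpp
// 0x43300000'00000000 is 2^52 and 0x45300000'00000000 is 2^84. ORing a
// 32-bit half into the mantissa of each and subtracting the same constant
// as a double leaves lo32 and hi32*2^32 exactly; one final add rounds
// once, matching a real u64 -> f64 conversion.
#include <cstdint>
#include <cstdio>
#include <cstring>

static double bitsToDouble(uint64_t B) {
  double D;
  std::memcpy(&D, &B, 8);
  return D;
}

int main() {
  uint64_t X = 0xFEDCBA9876543210ULL;
  double Lo = bitsToDouble(0x4330000000000000ULL | (X & 0xFFFFFFFFULL)) -
              bitsToDouble(0x4330000000000000ULL);
  double Hi = bitsToDouble(0x4530000000000000ULL | (X >> 32)) -
              bitsToDouble(0x4530000000000000ULL);
  std::printf("%.17g\n%.17g\n", Lo + Hi, (double)X); // identical values
}
```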
- SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); + SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8); + int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = - DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo()); + DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), - OffsetSlot, MachinePointerInfo()); + OffsetSlot, MPI.getWithOffset(4), 4); std::pair<SDValue, SDValue> Tmp = - BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -19361,21 +19990,17 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); } SDValue Store = - DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo()); + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8)); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) - int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), - MachineMemOperand::MOLoad, 8, 8); - SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot }; - SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, - MVT::i64, MMO); + SDValue Fild = + DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI, + Align(8), MachineMemOperand::MOLoad); Chain = Fild.getValue(1);
@@ -19388,6 +20013,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, APInt FF(64, 0x5F80000000000000ULL); SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF), PtrVT); + Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign(); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl);
@@ -19399,7 +20025,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, - /* Alignment = */ 4); + CPAlignment); Chain = Fudge.getValue(1); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here?
@@ -19462,7 +20088,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getStoreSize(); - int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); + int SSFI = + MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Chain = IsStrict ?
Op.getOperand(0) : DAG.getEntryNode(); @@ -19537,20 +20164,20 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); - SDVTList Tys = DAG.getVTList(TheVT, MVT::Other); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Chain, StackSlot }; unsigned FLDSize = TheVT.getStoreSize(); assert(FLDSize <= MemSize && "Stack slot not big enough"); MachineMemOperand *MMO = MF.getMachineMemOperand( - MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize); + MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize)); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); Chain = Value.getValue(1); } // Build the FP_TO_INT*_IN_MEM MachineMemOperand *MMO = MF.getMachineMemOperand( - MPI, MachineMemOperand::MOStore, MemSize, MemSize); + MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize)); SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), @@ -19590,14 +20217,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc); - // Custom legalize v8i8->v8i64 on CPUs without avx512bw. - if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) - return SDValue(); - - In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), - MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); - return DAG.getNode(ExtendInVecOpc, dl, VT, In); + if (VT == MVT::v32i16 && !Subtarget.hasBWI()) { + assert(InVT == MVT::v32i8 && "Unexpected VT!"); + return splitVectorIntUnary(Op, DAG); } if (Subtarget.hasInt256()) @@ -19729,7 +20351,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, "Unexpected PACK opcode"); assert(DstVT.isVector() && "VT not a vector?"); - // Requires SSE2 but AVX512 has fast vector truncate. + // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below). if (!Subtarget.hasSSE2()) return SDValue(); @@ -19770,15 +20392,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits()); In = DAG.getBitcast(InVT, In); - SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In); + SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT)); Res = extractSubVector(Res, 0, DAG, DL, 64); return DAG.getBitcast(DstVT, Res); } - // Extract lower/upper subvectors. - unsigned NumSubElts = NumElems / 2; - SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); - SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); + // Split lower/upper subvectors. + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(In, DAG, DL); unsigned SubSizeInBits = SrcSizeInBits / 2; InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); @@ -19804,7 +20425,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. 
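The narrowShuffleMaskElts call on the next lines turns the wide {0, 2, 1, 3} permute into its narrow-element form. A standalone sketch of the scaling rule (the function name here is invented):

```cpp
// Each wide index M expands into Scale consecutive narrow indices
// M*Scale .. M*Scale+Scale-1; undef (-1) lanes stay undef.
#include <cstdio>
#include <vector>

static std::vector<int> narrowMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (int i = 0; i < Scale; ++i)
      Out.push_back(M < 0 ? M : M * Scale + i);
  return Out;
}

int main() {
  for (int M : narrowMask(2, {0, 2, 1, 3}))
    std::printf("%d ", M); // 0 1 4 5 2 3 6 7
}
```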
SmallVector<int, 64> Mask; int Scale = 64 / OutVT.getScalarSizeInBits(); - scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask); + narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask); Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector())
@@ -19818,7 +20439,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // Recursively pack lower/upper subvectors, concat result and pack again. assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater"); - EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); + EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2); Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
@@ -19865,17 +20486,22 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors // we need to split into two 8 element vectors which we can extend to v8i32, // truncate and concat the results. There's an additional complication if - // the original type is v16i8. In that case we can't split the v16i8 so - // first we pre-extend it to v16i16 which we can split to v8i16, then extend - // to v8i32, truncate that to v8i1 and concat the two halves. + // the original type is v16i8. In that case we can't split the v16i8 + // directly, so we need to shuffle high elements to low and use + // sign_extend_vector_inreg. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) { + SDValue Lo, Hi; if (InVT == MVT::v16i8) { - // First we need to sign extend up to 256-bits so we can split that. - InVT = MVT::v16i16; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In); + Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In); + Hi = DAG.getVectorShuffle( + InVT, DL, In, In, + {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}); + Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi); + } else { + assert(InVT == MVT::v16i16 && "Unexpected VT!"); + Lo = extract128BitVector(In, 0, DAG, DL); + Hi = extract128BitVector(In, 8, DAG, DL); } - SDValue Lo = extract128BitVector(In, 0, DAG, DL); - SDValue Hi = extract128BitVector(In, 8, DAG, DL); // We're split now, just emit two truncates and a concat. The two // truncates will trigger legalization to come back to this function. Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
@@ -19918,7 +20544,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (!TLI.isTypeLegal(InVT)) { if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && VT.is128BitVector()) { - assert(Subtarget.hasVLX() && "Unexpected subtarget!"); + assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) && + "Unexpected subtarget!"); // The default behavior is to truncate one step, concatenate, and then // truncate the remainder. We'd rather produce two 64-bit results and // concatenate those.
@@ -19942,6 +20569,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { + if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) { + assert(VT == MVT::v32i8 && "Unexpected VT!"); + return splitVectorIntUnary(Op, DAG); + } + // word to byte only under BWI. Otherwise we have to promote to v16i32 // and then truncate that. But we should only do that if we haven't been // asked to avoid 512-bit vectors.
The actual promotion to v16i32 will be
@@ -20174,6 +20806,25 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { + if (!Subtarget.hasVLX()) { + // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type + // legalizer and then widened again by vector op legalization. + if (!IsStrict) + return SDValue(); + + SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32); + SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32, + {Src, Zero, Zero, Zero}); + Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, + {Op->getOperand(0), Tmp}); + SDValue Chain = Tmp.getValue(1); + Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Tmp, Chain}, dl); + return Tmp; + } + assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL"); SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32));
@@ -20281,6 +20932,62 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } +SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + // If the source is in an SSE register, the node is Legal. + if (isScalarFPTypeInSSEReg(SrcVT)) + return Op; + + return LRINT_LLRINTHelper(Op.getNode(), DAG); +} + +SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, + SelectionDAG &DAG) const { + EVT DstVT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return SDValue(); + } + + SDLoc DL(N); + SDValue Chain = DAG.getEntryNode(); + + bool UseSSE = isScalarFPTypeInSSEReg(SrcVT); + + // If we're converting from SSE, the stack slot needs to hold both types. + // Otherwise it only needs to hold the DstVT. + EVT OtherVT = UseSSE ? SrcVT : DstVT; + SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + + if (UseSSE) { + assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!"); + Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue Ops[] = { Chain, StackPtr }; + + Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI, + /*Align*/ None, MachineMemOperand::MOLoad); + Chain = Src.getValue(1); + } + + SDValue StoreOps[] = { Chain, Src, StackPtr }; + Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other), + StoreOps, DstVT, MPI, /*Align*/ None, + MachineMemOperand::MOStore); + + return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI); +} + SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode();
@@ -20333,6 +21040,67 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return Tmp.first; } +static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ?
1 : 0); + assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 && + "Unexpected VT!"); + + SDLoc dl(Op); + SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, + DAG.getConstant(0, dl, MVT::v8i16), Src, + DAG.getIntPtrConstant(0, dl)); + + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other}, + {Op.getOperand(0), Res}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; +} + +static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 && + "Unexpected VT!"); + + SDLoc dl(Op); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32, + DAG.getConstantFP(0, dl, MVT::v4f32), Src, + DAG.getIntPtrConstant(0, dl)); + Res = DAG.getNode( + X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, + {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)}); + Chain = Res.getValue(1); + } else { + // FIXME: Should we use zeros for upper elements for non-strict? + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src); + Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, + DAG.getTargetConstant(4, dl, MVT::i32)); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; +} + /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
@@ -20413,6 +21181,30 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); } +/// ISD::FROUND is defined to round to nearest with ties rounding away from 0. +/// This mode isn't supported in hardware on X86. But as long as we aren't +/// compiling with trapping math, we can emulate this with +/// trunc(X + copysign(nextafter(0.5, 0.0), X)). +static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) { + SDValue N0 = Op.getOperand(0); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // N0 += copysign(nextafter(0.5, 0.0), N0) + const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); + bool Ignored; + APFloat Point5Pred = APFloat(0.5f); + Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored); + Point5Pred.next(/*nextDown*/true); + + SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT, + DAG.getConstantFP(Point5Pred, dl, VT), N0); + N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder); + + // Truncate the result to remove fraction. + return DAG.getNode(ISD::FTRUNC, dl, VT, N0); +} + /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -20568,9 +21360,12 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, } /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) -/// style scalarized (associative) reduction patterns.
Partial reductions +/// are supported when the pointer SrcMask is non-null. +/// TODO - move this to SelectionDAG? static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, - SmallVectorImpl<SDValue> &SrcOps) { + SmallVectorImpl<SDValue> &SrcOps, + SmallVectorImpl<APInt> *SrcMask = nullptr) { SmallVector<SDValue, 8> Opnds; DenseMap<SDValue, APInt> SrcOpMap; EVT VT = MVT::Other; @@ -20598,8 +21393,8 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, return false; // Quit if without a constant index. - SDValue Idx = I->getOperand(1); - if (!isa<ConstantSDNode>(Idx)) + auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1)); + if (!Idx) return false; SDValue Src = I->getOperand(0); @@ -20615,61 +21410,167 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; SrcOps.push_back(Src); } + // Quit if element already used. - unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned CIdx = Idx->getZExtValue(); if (M->second[CIdx]) return false; M->second.setBit(CIdx); } - // Quit if not all elements are used. - for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(), - E = SrcOpMap.end(); - I != E; ++I) { - if (!I->second.isAllOnesValue()) - return false; + if (SrcMask) { + // Collect the source partial masks. + for (SDValue &SrcOp : SrcOps) + SrcMask->push_back(SrcOpMap[SrcOp]); + } else { + // Quit if not all elements are used. + for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(), + E = SrcOpMap.end(); + I != E; ++I) { + if (!I->second.isAllOnesValue()) + return false; + } } return true; } -// Check whether an OR'd tree is PTEST-able. -static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, +// Helper function for comparing all bits of a vector against zero. +static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, + const APInt &Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG, X86::CondCode &X86CC) { + EVT VT = V.getValueType(); + assert(Mask.getBitWidth() == VT.getScalarSizeInBits() && + "Element Mask vs Vector bitwidth mismatch"); + + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); + X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE); + + auto MaskBits = [&](SDValue Src) { + if (Mask.isAllOnesValue()) + return Src; + EVT SrcVT = Src.getValueType(); + SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT); + return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue); + }; + + // For sub-128-bit vector, cast to (legal) integer and compare with zero. + if (VT.getSizeInBits() < 128) { + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) + return SDValue(); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, + DAG.getBitcast(IntVT, MaskBits(V)), + DAG.getConstant(0, DL, IntVT)); + } + + // Quit if not splittable to 128/256-bit vector. + if (!isPowerOf2_32(VT.getSizeInBits())) + return SDValue(); + + // Split down to 128/256-bit vector. + unsigned TestSize = Subtarget.hasAVX() ? 256 : 128; + while (VT.getSizeInBits() > TestSize) { + auto Split = DAG.SplitVector(V, DL); + VT = Split.first.getValueType(); + V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); + } + + bool UsePTEST = Subtarget.hasSSE41(); + if (UsePTEST) { + MVT TestVT = VT.is128BitVector() ? 
MVT::v2i64 : MVT::v4i64; + V = DAG.getBitcast(TestVT, MaskBits(V)); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V); + } + + // Without PTEST, a masked v2i64 or-reduction is not faster than + // scalarization. + if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32) + return SDValue(); + + V = DAG.getBitcast(MVT::v16i8, MaskBits(V)); + V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V, + getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V, + DAG.getConstant(0xFFFF, DL, MVT::i32)); +} + +// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to +// CMP(MOVMSK(PCMPEQB(X,0))). +static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC, + const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &X86CC) { - assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); - if (!Subtarget.hasSSE41() || !Op->hasOneUse()) + if (!Subtarget.hasSSE2() || !Op->hasOneUse()) return SDValue(); - SmallVector<SDValue, 8> VecIns; - if (!matchScalarReduction(Op, ISD::OR, VecIns)) - return SDValue(); + // Check whether we're masking/truncating an OR-reduction result, in which + // case track the masked bits. + APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits()); + switch (Op.getOpcode()) { + case ISD::TRUNCATE: { + SDValue Src = Op.getOperand(0); + Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(), + Op.getScalarValueSizeInBits()); + Op = Src; + break; + } + case ISD::AND: { + if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Mask = Cst->getAPIntValue(); + Op = Op.getOperand(0); + } + break; + } + } - // Quit if not 128/256-bit vector. - EVT VT = VecIns[0].getValueType(); - if (!VT.is128BitVector() && !VT.is256BitVector()) - return SDValue(); + SmallVector<SDValue, 8> VecIns; + if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) { + EVT VT = VecIns[0].getValueType(); + assert(llvm::all_of(VecIns, + [VT](SDValue V) { return VT == V.getValueType(); }) && + "Reduction source vector mismatch"); + + // Quit if less than 128-bits or not splittable to 128/256-bit vector. + if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits())) + return SDValue(); - SDLoc DL(Op); - MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + // If more than one full vector is evaluated, OR them first before PTEST. + for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; + Slot += 2, e += 1) { + // Each iteration will OR 2 nodes and append the result until there is + // only 1 node left, i.e. the final OR'd value of all vectors. + SDValue LHS = VecIns[Slot]; + SDValue RHS = VecIns[Slot + 1]; + VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS)); + } - // Cast all vectors into TestVT for PTEST. - for (unsigned i = 0, e = VecIns.size(); i < e; ++i) - VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); + X86::CondCode CCode; + if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget, + DAG, CCode)) { + X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8); + return V; + } + } - // If more than one full vector is evaluated, OR them first before PTEST. - for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { - // Each iteration will OR 2 nodes and append the result until there is only - // 1 node left, i.e. the final OR'd value of all vectors.
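A scalar model of the non-PTEST fallback in LowerVectorAllZero above (PCMPEQB against zero, MOVMSK, compare with 0xFFFF); illustrative only:

```cpp
#include <cstdint>
#include <cstdio>

static bool allZero16(const uint8_t V[16]) {
  unsigned Msk = 0;
  for (int i = 0; i < 16; ++i)
    Msk |= (V[i] == 0 ? 1u : 0u) << i; // PCMPEQB byte results -> MOVMSK bits
  return Msk == 0xFFFF;                // CMP against the all-ones mask
}

int main() {
  uint8_t A[16] = {};
  uint8_t B[16] = {};
  B[7] = 1;
  std::printf("%d %d\n", allZero16(A), allZero16(B)); // 1 0
}
```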
- SDValue LHS = VecIns[Slot]; - SDValue RHS = VecIns[Slot + 1]; - VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); + if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + ISD::NodeType BinOp; + if (SDValue Match = + DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) { + X86::CondCode CCode; + if (SDValue V = + LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) { + X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8); + return V; + } + } } - X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, - DL, MVT::i8); - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); + return SDValue(); } /// return true if \c Op has a use that doesn't just read flags. @@ -20814,27 +21715,14 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. -static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1, - unsigned X86CC, const SDLoc &dl, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SDValue Chain, bool IsSignaling) { +static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (isNullConstant(Op1)) - return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain); + return EmitTest(Op0, X86CC, dl, DAG, Subtarget); EVT CmpVT = Op0.getValueType(); - if (CmpVT.isFloatingPoint()) { - if (Chain) { - SDValue Res = - DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, - dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); - return std::make_pair(Res, Res.getValue(1)); - } - return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1), - SDValue()); - } - assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); @@ -20884,40 +21772,28 @@ static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1, Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); } + // 0-x == y --> x+y == 0 + // 0-x != y --> x+y != 0 + if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) && + Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); + SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1); + return Add.getValue(1); + } + + // x == 0-y --> x+y == 0 + // x != 0-y --> x+y != 0 + if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) && + Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); + SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1)); + return Add.getValue(1); + } + // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return std::make_pair(Sub.getValue(1), SDValue()); -} - -/// Convert a comparison if required by the subtarget. -SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, - SelectionDAG &DAG) const { - // If the subtarget does not support the FUCOMI instruction, floating-point - // comparisons have to be converted. - bool IsCmp = Cmp.getOpcode() == X86ISD::CMP; - bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP || - Cmp.getOpcode() == X86ISD::STRICT_FCMPS; - - if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) || - !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() || - !Cmp.getOperand(IsStrictCmp ? 
2 : 1).getValueType().isFloatingPoint()) - return Cmp; - - // The instruction selector will select an FUCOM instruction instead of - // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence - // build an SDNode sequence that transfers the result from FPSW into EFLAGS: - // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8)))) - SDLoc dl(Cmp); - SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); - SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); - SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, - DAG.getConstant(8, dl, MVT::i8)); - SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); - - // Some 64-bit targets lack SAHF support, but they do support FCOMI. - assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); - return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); + return Sub.getValue(1); } /// Check if replacement of SQRT with RSQRT should be disabled. @@ -21056,7 +21932,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, // Divide by pow2. SDValue SRA = - DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64)); + DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. @@ -21211,32 +22087,30 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then /// concatenate the result back. -static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); +static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); - assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && - "Unsupported value type for operation"); + assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation"); + assert(Op.getOperand(0).getValueType().isInteger() && + VT == Op.getOperand(0).getValueType() && "Unsupported VTs!"); - unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); + // Extract the LHS Lo/Hi vectors + SDValue LHS1, LHS2; + std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl); - // Extract the RHS vectors - SDValue RHS = Op.getOperand(1); - SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); + // Extract the RHS Lo/Hi vectors + SDValue RHS1, RHS2; + std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl); // Issue the operation on the smaller types and concatenate the result back - MVT EltVT = VT.getVectorElementType(); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); + DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC), + DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC)); } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { @@ -21369,8 +22243,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; SDValue Chain = IsStrict ? 
Op.getOperand(0) : SDValue(); + // If we have a strict compare with a vXi1 result and the input is 128/256 + // bits we can't use a masked compare unless we have VLX. If we use a wider + // compare like we do for non-strict, we might trigger spurious exceptions + // from the upper elements. Instead emit a AVX compare and convert to mask. unsigned Opc; - if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { + if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 && + (!IsStrict || Subtarget.hasVLX() || + Op0.getSimpleValueType().is512BitVector())) { assert(VT.getVectorNumElements() <= 16); Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { @@ -21466,10 +22346,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } - // If this is SSE/AVX CMPP, bitcast the result back to integer to match the - // result type of SETCC. The bitcast is expected to be optimized away - // during combining/isel. - Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) { + // We emitted a compare with an XMM/YMM result. Finish converting to a + // mask register using a vptestm. + EVT CastVT = EVT(VT).changeVectorElementTypeToInteger(); + Cmp = DAG.getBitcast(CastVT, Cmp); + Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp, + DAG.getConstant(0, dl, CastVT), ISD::SETNE); + } else { + // If this is SSE/AVX CMPP, bitcast the result back to integer to match + // the result type of SETCC. The bitcast is expected to be optimized + // away during combining/isel. + Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + } if (IsStrict) return DAG.getMergeValues({Cmp, Chain}, dl); @@ -21563,7 +22452,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntVSETCC(Op, DAG); + return splitIntVSETCC(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) { + assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!"); + return splitIntVSETCC(Op, DAG); + } // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. @@ -21812,9 +22706,8 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, /// corresponding X86 condition code constant in X86CC. SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, - SelectionDAG &DAG, SDValue &X86CC, - SDValue &Chain, - bool IsSignaling) const { + SelectionDAG &DAG, + SDValue &X86CC) const { // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). @@ -21825,13 +22718,12 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, return BT; } - // Try to use PTEST for a tree ORs equality compared with 0. + // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0. // TODO: We could do AND tree with all 1s as well by using the C flag. 
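The BT patterns mentioned in emitFlagsForSetcc correspond to source shapes like the following sketch (illustrative names; with a variable bit index both typically compile to a single BT feeding a SETcc or Jcc):

#include <cstdint>

bool bitIsClear(uint32_t X, unsigned N) {
  return (X & (1u << N)) == 0;   // (X & (1 << N)) == 0  ->  bt; setae
}

bool bitIsSet(uint32_t X, unsigned N) {
  return ((X >> N) & 1u) != 0;   // ((X >>u N) & 1) != 0  ->  bt; setc
}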
- if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC)) - return PTEST; - } + if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) + if (SDValue CmpZ = + MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC)) + return CmpZ; // Try to lower using KORTEST or KTEST. if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) @@ -21873,17 +22765,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, } } - bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); - X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); - if (CondCode == X86::COND_INVALID) - return SDValue(); + X86::CondCode CondCode = + TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG); + assert(CondCode != X86::COND_INVALID && "Unexpected condition code!"); - std::pair<SDValue, SDValue> Tmp = - EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling); - SDValue EFLAGS = Tmp.first; - if (Chain) - Chain = Tmp.second; - EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget); X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; } @@ -21920,18 +22806,32 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } - SDValue X86CC; - SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain, - Op.getOpcode() == ISD::STRICT_FSETCCS); - if (!EFLAGS) - return SDValue(); + if (Op0.getSimpleValueType().isInteger()) { + SDValue X86CC; + SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); + SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; + } - SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + // Handle floating point. + X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG); + if (CondCode == X86::COND_INVALID) + return SDValue(); - if (IsStrict) - return DAG.getMergeValues({Res, Chain}, dl); + SDValue EFLAGS; + if (IsStrict) { + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + EFLAGS = + DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, + dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); + Chain = EFLAGS.getValue(1); + } else { + EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1); + } - return Res; + SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); + SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { @@ -21946,9 +22846,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const // Recreate the carry if needed. 
EVT CarryVT = Carry.getValueType(); - APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), - Carry, DAG.getConstant(NegOne, DL, CarryVT)); + Carry, DAG.getAllOnesConstant(DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); @@ -22024,7 +22923,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || - Opc == X86ISD::SAHF) + Opc == X86ISD::FCMP) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || @@ -22057,9 +22956,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. - if (Cond.getOpcode() == ISD::SETCC && - ((Subtarget.hasSSE2() && VT == MVT::f64) || - (Subtarget.hasSSE1() && VT == MVT::f32)) && + if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); bool IsAlwaysSignaling; @@ -22115,45 +23012,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } // AVX512 fallback is to lower selects of scalar floats to masked moves. - if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { + if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } - // For v64i1 without 64-bit support we need to split and rejoin. 
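Back on the scalar-FP select path: the CMP/AND/ANDN/OR sequence and its BLENDV upgrade can be pictured with packed-float intrinsics, as in this rough sketch (the lowering itself works on scalars kept in XMM registers):

#include <immintrin.h>

// select (a < b) ? x : y, pre-SSE4.1: the compare yields an all-ones or
// all-zero lane mask that feeds AND/ANDN/OR.
__m128 selectLT(__m128 a, __m128 b, __m128 x, __m128 y) {
  __m128 m = _mm_cmplt_ps(a, b);
  return _mm_or_ps(_mm_and_ps(m, x), _mm_andnot_ps(m, y));
}

// Same select on SSE4.1: BLENDVPS keys each lane off the mask's sign bit.
__m128 selectLT41(__m128 a, __m128 b, __m128 x, __m128 y) {
  return _mm_blendv_ps(y, x, _mm_cmplt_ps(a, b));
}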
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { - assert(Subtarget.hasBWI() && "Expected BWI to be legal"); - SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32); - SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32); - SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32); - SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32); - SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo); - SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - } - - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { - SDValue Op1Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) - Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); - else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) - Op1Scalar = Op1.getOperand(0); - SDValue Op2Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) - Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); - else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) - Op2Scalar = Op2.getOperand(0); - if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, - Op1Scalar, Op2Scalar); - if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, newSelect); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, - DAG.getIntPtrConstant(0, DL)); - } - } - if (Cond.getOpcode() == ISD::SETCC) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; @@ -22175,12 +23038,28 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); + SDValue CmpOp0 = Cmp.getOperand(0); unsigned CondCode = Cond.getConstantOperandVal(0); - if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + // Special handling for __builtin_ffs(X) - 1 pattern which looks like + // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special + // handle to keep the CMP with 0. This should be removed by + // optimizeCompareInst by using the flags from the BSR/TZCNT used for the + // cttz_zero_undef. + auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) { + return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() && + Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2)); + }; + if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) && + ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) || + (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) { + // Keep Cmp. + } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? 
Op1 : Op2; - SDValue CmpOp0 = Cmp.getOperand(0); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb @@ -22188,31 +23067,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); - SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0); Zero = DAG.getConstant(0, DL, Op.getValueType()); - return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero); + return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1)); } - Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, + Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); - Cmp = ConvertCmpIfNecessary(Cmp, DAG); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, Op.getValueType()); SDValue Res = // Res = 0 or -1. - DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); + DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1)); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - if (!isNullConstant(Op2)) - Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); - return Res; + return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { - SDValue CmpOp0 = Cmp.getOperand(0); SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 @@ -22265,7 +23138,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && - !isScalarFPTypeInSSEReg(VT)) // FPStack? + !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack? IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || @@ -22311,7 +23184,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { - Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && @@ -22333,7 +23205,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && - // Blacklist CopyFromReg to avoid partial register stalls. + // Exclude CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); @@ -22570,14 +23442,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); - // Custom legalize v8i8->v8i64 on CPUs without avx512bw. 
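Stepping back to the NEG/SBB select rewrite a few hunks up: NEG sets CF exactly when its operand is non-zero, and SBB of a register with itself then materializes 0 - 0 - CF = -CF. In source form (illustrative helper names):

#include <cstdint>

uint64_t allOnesIfNonZero(uint64_t x) {
  return x != 0 ? ~0ull : 0ull;   // typically: neg rdi; sbb rax, rax
}

uint64_t allOnesIfZero(uint64_t x) {
  return x == 0 ? ~0ull : 0ull;   // same idiom with the SBB result NOTed
}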
- if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) - return SDValue(); - - In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), - MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); - return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In); + if (VT == MVT::v32i16 && !Subtarget.hasBWI()) { + assert(InVT == MVT::v32i8 && "Unexpected VT!"); + return splitVectorIntUnary(Op, DAG); } if (Subtarget.hasInt256()) @@ -22620,23 +23487,19 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { if (!Store->isSimple()) return SDValue(); - EVT StoreVT = StoredVal.getValueType(); - unsigned NumElems = StoreVT.getVectorNumElements(); - unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; - unsigned HalfAlign = (128 == HalfSize ? 16 : 32); - SDLoc DL(Store); - SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize); - SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize); + SDValue Value0, Value1; + std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL); + unsigned HalfOffset = Value0.getValueType().getStoreSize(); SDValue Ptr0 = Store->getBasePtr(); - SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL); - unsigned Alignment = Store->getAlignment(); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL); SDValue Ch0 = DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), - Alignment, Store->getMemOperand()->getFlags()); + Store->getOriginalAlign(), + Store->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, - Store->getPointerInfo().getWithOffset(HalfAlign), - MinAlign(Alignment, HalfAlign), + Store->getPointerInfo().getWithOffset(HalfOffset), + Store->getOriginalAlign(), Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); } @@ -22659,7 +23522,6 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, MVT StoreSVT = StoreVT.getScalarType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned ScalarSize = StoreSVT.getStoreSize(); - unsigned Alignment = Store->getAlignment(); SDLoc DL(Store); SmallVector<SDValue, 4> Stores; @@ -22670,7 +23532,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, DAG.getIntPtrConstant(i, DL)); SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, Store->getPointerInfo().getWithOffset(Offset), - MinAlign(Alignment, Offset), + Store->getOriginalAlign(), Store->getMemOperand()->getFlags()); Stores.push_back(Ch); } @@ -22699,7 +23561,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -22711,7 +23573,9 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // and each half can execute independently. Some cores would split the op into // halves anyway, so the concat (vinsertf128) is purely an extra op. 
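What splitVectorStore effectively emits for one unaligned 256-bit store, sketched with AVX intrinsics; the 16-byte offset mirrors HalfOffset above, and the names are illustrative:

#include <immintrin.h>

void store256AsTwoHalves(__m256i V, char *Ptr) {
  __m128i Lo = _mm256_castsi256_si128(V);        // low 128 bits, no shuffle
  __m128i Hi = _mm256_extractf128_si256(V, 1);   // high 128 bits
  _mm_storeu_si128((__m128i *)Ptr, Lo);
  _mm_storeu_si128((__m128i *)(Ptr + 16), Hi);
}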
MVT StoreVT = StoredVal.getSimpleValueType(); - if (StoreVT.is256BitVector()) { + if (StoreVT.is256BitVector() || + ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) && + !Subtarget.hasBWI())) { SmallVector<SDValue, 4> CatOps; if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) return splitVectorStore(St, DAG); @@ -22738,7 +23602,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } assert(Subtarget.hasSSE1() && "Expected SSE"); @@ -22773,7 +23637,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, "Expected AVX512F without AVX512DQI"); SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); // Replace chain users with the new chain. @@ -22801,163 +23665,44 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Op.getOperand(1).hasOneUse()); } -/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the -/// SETCC node has a single use. -static bool isXor1OfSetCC(SDValue Op) { - if (Op.getOpcode() != ISD::XOR) - return false; - if (isOneConstant(Op.getOperand(1))) - return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - return false; -} - SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); - SDValue CC; - bool Inverted = false; - if (Cond.getOpcode() == ISD::SETCC) { - // Check for setcc([su]{add,sub,mul}o == 0). - if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && - isNullConstant(Cond.getOperand(1)) && - Cond.getOperand(0).getResNo() == 1 && - (Cond.getOperand(0).getOpcode() == ISD::SADDO || - Cond.getOperand(0).getOpcode() == ISD::UADDO || - Cond.getOperand(0).getOpcode() == ISD::SSUBO || - Cond.getOperand(0).getOpcode() == ISD::USUBO || - Cond.getOperand(0).getOpcode() == ISD::SMULO || - Cond.getOperand(0).getOpcode() == ISD::UMULO)) { - Inverted = true; - Cond = Cond.getOperand(0); - } else { - if (SDValue NewCond = LowerSETCC(Cond, DAG)) - Cond = NewCond; - } - } -#if 0 - // FIXME: LowerXALUO doesn't handle these!! - else if (Cond.getOpcode() == X86ISD::ADD || - Cond.getOpcode() == X86ISD::SUB || - Cond.getOpcode() == X86ISD::SMUL || - Cond.getOpcode() == X86ISD::UMUL) - Cond = LowerXALUO(Cond, DAG); -#endif + if (Cond.getOpcode() == ISD::SETCC && + Cond.getOperand(0).getValueType() != MVT::f128) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - // Look pass (and (setcc_carry (cmp ...)), 1). 
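The overflow-intrinsic branches that the rewritten LowerBRCOND below recognizes come directly from sources like this (Clang/GCC spelling; the comparison of the overflow bit against 0 or 1 folds into a plain JO/JNO):

#include <cstdint>

extern void overflowed();

void addChecked(int32_t a, int32_t b, int32_t *out) {
  int32_t sum;
  if (__builtin_add_overflow(a, b, &sum))   // setcc(saddo) != 0  ->  add; jo
    overflowed();
  else
    *out = sum;
}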
- if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && - isOneConstant(Cond.getOperand(1))) - Cond = Cond.getOperand(0); + // Special case for + // setcc([su]{add,sub,mul}o == 0) + // setcc([su]{add,sub,mul}o != 1) + if (ISD::isOverflowIntrOpRes(LHS) && + (CC == ISD::SETEQ || CC == ISD::SETNE) && + (isNullConstant(RHS) || isOneConstant(RHS))) { + SDValue Value, Overflow; + X86::CondCode X86Cond; + std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG); - // If condition flag is set by a X86ISD::CMP, then use it as the condition - // setting operand in place of the X86ISD::SETCC. - unsigned CondOpcode = Cond.getOpcode(); - if (CondOpcode == X86ISD::SETCC || - CondOpcode == X86ISD::SETCC_CARRY) { - CC = Cond.getOperand(0); + if ((CC == ISD::SETEQ) == isNullConstant(RHS)) + X86Cond = X86::GetOppositeBranchCondition(X86Cond); - SDValue Cmp = Cond.getOperand(1); - unsigned Opc = Cmp.getOpcode(); - // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? - if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { - Cond = Cmp; - addTest = false; - } else { - switch (cast<ConstantSDNode>(CC)->getZExtValue()) { - default: break; - case X86::COND_O: - case X86::COND_B: - // These can only come from an arithmetic instruction with overflow, - // e.g. SADDO, UADDO. - Cond = Cond.getOperand(1); - addTest = false; - break; - } + SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); } - } - CondOpcode = Cond.getOpcode(); - if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || - CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || - CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { - SDValue Value; - X86::CondCode X86Cond; - std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); - if (Inverted) - X86Cond = X86::GetOppositeBranchCondition(X86Cond); + if (LHS.getSimpleValueType().isInteger()) { + SDValue CCVal; + SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + EFLAGS); + } - CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); - addTest = false; - } else { - unsigned CondOpc; - if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { - SDValue Cmp = Cond.getOperand(0).getOperand(1); - if (CondOpc == ISD::OR) { - // Also, recognize the pattern generated by an FCMP_UNE. We can emit - // two branches instead of an explicit OR instruction with a - // separate test. - if (Cmp == Cond.getOperand(1).getOperand(1) && - isX86LogicalCmp(Cmp)) { - CC = Cond.getOperand(0).getOperand(0); - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - CC = Cond.getOperand(1).getOperand(0); - Cond = Cmp; - addTest = false; - } - } else { // ISD::AND - // Also, recognize the pattern generated by an FCMP_OEQ. We can emit - // two branches instead of an explicit AND instruction with a - // separate test. However, we only do this if this block doesn't - // have a fall-through edge, because this requires an explicit - // jmp when the condition is false. 
- if (Cmp == Cond.getOperand(1).getOperand(1) && - isX86LogicalCmp(Cmp) && - Op.getNode()->hasOneUse()) { - X86::CondCode CCode0 = - (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); - CCode0 = X86::GetOppositeBranchCondition(CCode0); - CC = DAG.getTargetConstant(CCode0, dl, MVT::i8); - SDNode *User = *Op.getNode()->use_begin(); - // Look for an unconditional branch following this conditional branch. - // We need this because we need to reverse the successors in order - // to implement FCMP_OEQ. - if (User->getOpcode() == ISD::BR) { - SDValue FalseBB = User->getOperand(1); - SDNode *NewBR = - DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); - assert(NewBR == User); - (void)NewBR; - Dest = FalseBB; - - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, - Dest, CC, Cmp); - X86::CondCode CCode1 = - (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); - CCode1 = X86::GetOppositeBranchCondition(CCode1); - CC = DAG.getTargetConstant(CCode1, dl, MVT::i8); - Cond = Cmp; - addTest = false; - } - } - } - } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { - // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. - // It should be transformed during dag combiner except when the condition - // is set by a arithmetics with overflow node. - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getTargetConstant(CCode, dl, MVT::i8); - Cond = Cond.getOperand(0).getOperand(1); - addTest = false; - } else if (Cond.getOpcode() == ISD::SETCC && - cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + if (CC == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't @@ -22976,59 +23721,65 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { (void)NewBR; Dest = FalseBB; - SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, - Cond.getOperand(0), Cond.getOperand(1)); - Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); - Cond = Cmp; - addTest = false; + SDValue Cmp = + DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); + SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, + CCVal, Cmp); + CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); } } - } else if (Cond.getOpcode() == ISD::SETCC && - cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + } else if (CC == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit OR instruction with a // separate test. 
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, - Cond.getOperand(0), Cond.getOperand(1)); - Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); - Cond = Cmp; - addTest = false; + SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); + SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); + Chain = + DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); + CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } else { + X86::CondCode X86Cond = + TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG); + SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); + SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); } } - if (addTest) { - // Look pass the truncate if the high bits are known zero. - if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + if (ISD::isOverflowIntrOpRes(Cond)) { + SDValue Value, Overflow; + X86::CondCode X86Cond; + std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); - // We know the result of AND is compared against zero. Try to match - // it to BT. - if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue BTCC; - if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) { - CC = BTCC; - Cond = BT; - addTest = false; - } - } + SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); } - if (addTest) { - X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; - CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); - Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget); - } - Cond = ConvertCmpIfNecessary(Cond, DAG); - return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cond); + // Look past the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + EVT CondVT = Cond.getValueType(); + + // Add an AND with 1 if we don't already have one. + if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))) + Cond = + DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT)); + + SDValue LHS = Cond; + SDValue RHS = DAG.getConstant(0, dl, CondVT); + + SDValue CCVal; + SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + EFLAGS); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. @@ -23041,9 +23792,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); + bool EmitStackProbeCall = hasStackProbeSymbol(MF); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack || EmitStackProbe; + SplitStack || EmitStackProbeCall; SDLoc dl(Op); // Get the inputs. 
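The two-branch FP lowering above follows from the UCOMISS flag encoding: ZF=0 means the operands compared not-equal, PF=1 means they were unordered (a NaN was present), so SETUNE must branch when either flag fires while SETOEQ may pass only when neither does. A scalar model:

#include <cmath>

extern void taken();

void branchUNE(float x, float y) {
  if (x != y)        // lowers to roughly: ucomiss; jne taken; jp taken
    taken();
}

bool isOEQ(float x, float y) {
  return !std::isunordered(x, y) && x == y;   // ordered *and* equal
}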
@@ -23067,12 +23818,22 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); - SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); - Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Align StackAlign(TFI.getStackAlignment()); - Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value - if (Alignment && Alignment > StackAlign) + const Align StackAlign = TFI.getStackAlign(); + if (hasInlineStackProbe(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); + Register Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + } else { + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + } + if (Alignment && *Alignment > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); @@ -23203,14 +23964,13 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. - if (ArgVT == MVT::f80) { - llvm_unreachable("va_arg for f80 not yet implemented"); - } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { + assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented"); + if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. - } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { - ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { - llvm_unreachable("Unhandled argument type in LowerVAARG"); + assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ && + "Unhandled argument type in LowerVAARG"); + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 
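For context on gp_offset and fp_offset: LowerVAARG is walking the SysV x86-64 va_list record, which per the psABI has the layout below. Integer arguments come out of the register save area via gp_offset, FP arguments via fp_offset, and both fall back to overflow_arg_area once the save area is exhausted.

struct X86_64VaList {        // layout of __builtin_va_list on SysV x86-64
  unsigned gp_offset;        // next GPR slot: 0..48 in steps of 8
  unsigned fp_offset;        // next XMM slot: 48..176 in steps of 16
  void *overflow_arg_area;   // arguments passed on the stack
  void *reg_save_area;       // spilled argument registers
};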
} if (ArgMode == 2) { @@ -23227,11 +23987,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( - X86ISD::VAARG_64, dl, - VTs, InstOps, MVT::i64, - MachinePointerInfo(SV), - /*Align=*/0, - MachineMemOperand::MOLoad | MachineMemOperand::MOStore); + X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), + /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it @@ -23255,9 +24012,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); SDLoc DL(Op); - return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, - DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, - false, false, + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), + Align(8), /*isVolatile*/ false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } @@ -23319,7 +24075,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { - Elts.push_back(CurrentOp); + // Must produce 0s in the correct bits. + Elts.push_back(DAG.getConstant(0, dl, ElementType)); continue; } auto *ND = cast<ConstantSDNode>(CurrentOp); @@ -23331,7 +24088,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { - Elts.push_back(CurrentOp); + // Must produce 0s in the correct bits. + Elts.push_back(DAG.getConstant(0, dl, ElementType)); continue; } auto *ND = cast<ConstantSDNode>(CurrentOp); @@ -23343,7 +24101,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { - Elts.push_back(CurrentOp); + // All shifted in bits must be the same so use 0. + Elts.push_back(DAG.getConstant(0, dl, ElementType)); continue; } auto *ND = cast<ConstantSDNode>(CurrentOp); @@ -24001,8 +24760,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); + // Some conditions require the operands to be swapped. + if (CC == ISD::SETLT || CC == ISD::SETLE) + std::swap(LHS, RHS); + SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); - SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) @@ -24018,18 +24780,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, break; } case ISD::SETGT: // (CF = 0 and ZF = 0) + case ISD::SETLT: { // Condition opposite to GT. Operands swapped above. SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; - case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. - SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); - break; } case ISD::SETGE: // CF = 0 + case ISD::SETLE: // Condition opposite to GE. Operands swapped above. SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; - case ISD::SETLE: // The condition is opposite to GE. Swap the operands. 
- SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); - break; default: llvm_unreachable("Unexpected illegal condition!"); } @@ -24478,6 +25236,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Clamp out of bounds shift amounts since they will otherwise be masked // to 8-bits which may make it no longer out of bounds. unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255); + if (ShiftAmount == 0) + return Op.getOperand(1); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), Op.getOperand(0), Op.getOperand(1), DAG.getTargetConstant(ShiftAmount, DL, MVT::i32)); @@ -24537,19 +25298,23 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, TLI.getPointerTy(DAG.getDataLayout())); EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + // Cast mask to an integer type. + Mask = DAG.getBitcast(MaskVT, Mask); + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); - return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); + SDValue Res = + DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, + MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({Res, Res.getValue(1)}, dl); } static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, @@ -24574,7 +25339,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? 
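A user-level view of the zeroed pass-through trick noted in the TODO above: with an all-ones mask every lane is gathered anyway, so a zero source operand exists only to break the false dependency on the destination register (illustrative function name):

#include <immintrin.h>

__m256i gatherAll(const int *Base, __m256i Idx) {
  __m256i Zero = _mm256_setzero_si256();
  __m256i Mask = _mm256_set1_epi32(-1);   // gather every lane
  return _mm256_mask_i32gather_epi32(Zero, Base, Idx, Mask, /*Scale=*/4);
}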
@@ -24584,9 +25349,10 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); - return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); + SDValue Res = + DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, + MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({Res, Res.getValue(1)}, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -24612,11 +25378,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); - SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); - return Res.getValue(1); + SDValue Res = + DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, + MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return Res; } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -24775,13 +25542,11 @@ static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; - return SignedSat ? - DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : - DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); + unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS; + return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO); } /// Emit Masked Truncating Store with signed or unsigned saturation. @@ -24789,12 +25554,10 @@ static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Val, Ptr, Mask }; - return SignedSat ? - DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : - DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); + unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS; + return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, @@ -25144,7 +25907,7 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } -unsigned X86TargetLowering::getExceptionPointerRegister( +Register X86TargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; @@ -25152,7 +25915,7 @@ unsigned X86TargetLowering::getExceptionPointerRegister( return Subtarget.isTarget64BitLP64() ? 
X86::RAX : X86::EAX; } -unsigned X86TargetLowering::getExceptionSelectorRegister( +Register X86TargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Funclet personalities don't use selectors (the runtime does the selection). assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); @@ -25176,7 +25939,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); - unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; + Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize(), @@ -25390,93 +26153,51 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 2 Round to +inf 3 Round to -inf - To perform the conversion, we do: - (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) + To perform the conversion, we use a packed lookup table of the four 2-bit + values that we can index by FPSR[11:10] + 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10] + + (0x2d >> ((FPSR & 0xc00) >> 9)) & 3 */ MachineFunction &MF = DAG.getMachineFunction(); - const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Align StackAlignment(TFI.getStackAlignment()); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot - int SSFI = - MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false); + int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOStore, 2, 2); + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); - SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; - SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, - DAG.getVTList(MVT::Other), - Ops, MVT::i16, MMO); + SDValue Chain = Op.getOperand(0); + SDValue Ops[] = {Chain, StackSlot}; + Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, + DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI, + Align(2), MachineMemOperand::MOStore); // Load FP Control Word from stack slot - SDValue CWD = - DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo()); + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2)); + Chain = CWD.getValue(1); - // Transform as necessary - SDValue CWD1 = - DAG.getNode(ISD::SRL, DL, MVT::i16, - DAG.getNode(ISD::AND, DL, MVT::i16, - CWD, DAG.getConstant(0x800, DL, MVT::i16)), - DAG.getConstant(11, DL, MVT::i8)); - SDValue CWD2 = + // Mask and turn the control bits into a shift for the lookup table. + SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, - CWD, DAG.getConstant(0x400, DL, MVT::i16)), + CWD, DAG.getConstant(0xc00, DL, MVT::i16)), DAG.getConstant(9, DL, MVT::i8)); + Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift); + SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32); SDValue RetVal = - DAG.getNode(ISD::AND, DL, MVT::i16, - DAG.getNode(ISD::ADD, DL, MVT::i16, - DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), - DAG.getConstant(1, DL, MVT::i16)), - DAG.getConstant(3, DL, MVT::i16)); - - return DAG.getNode((VT.getSizeInBits() < 16 ? 
- ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); -} - -// Split an unary integer op into 2 half sized ops. -static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - unsigned NumElems = VT.getVectorNumElements(); - unsigned SizeInBits = VT.getSizeInBits(); - MVT EltVT = VT.getVectorElementType(); - SDValue Src = Op.getOperand(0); - assert(EltVT == Src.getSimpleValueType().getVectorElementType() && - "Src and Op should have the same element type!"); + DAG.getNode(ISD::AND, DL, MVT::i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift), + DAG.getConstant(3, DL, MVT::i32)); - // Extract the Lo/Hi vectors - SDLoc dl(Op); - SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2); - SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2); + RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, Lo), - DAG.getNode(Op.getOpcode(), dl, NewVT, Hi)); -} - -// Decompose 256-bit ops into smaller 128-bit ops. -static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) { - assert(Op.getSimpleValueType().is256BitVector() && - Op.getSimpleValueType().isInteger() && - "Only handle AVX 256-bit vector integer operation"); - return LowerVectorIntUnary(Op, DAG); -} - -// Decompose 512-bit ops into smaller 256-bit ops. -static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) { - assert(Op.getSimpleValueType().is512BitVector() && - Op.getSimpleValueType().isInteger() && - "Only handle AVX 512-bit vector integer operation"); - return LowerVectorIntUnary(Op, DAG); + return DAG.getMergeValues({RetVal, Chain}, DL); } /// Lower a vector CTLZ using native supported vector CTLZ instruction. @@ -25499,7 +26220,7 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, // Split vector, it's Lo and Hi parts will be handled in next iteration. if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ())) - return LowerVectorIntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && @@ -25609,11 +26330,11 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) - return Lower512IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); @@ -25679,64 +26400,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } -/// Break a 256-bit integer operation into two new 128-bit ones and then -/// concatenate the result back. 
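As a quick sanity check of the rounding-mode lookup table introduced in LowerFLT_ROUNDS_ above: RC is control-word bits 11:10, and (0x2d >> (RC * 2)) & 3 maps {nearest, -inf, +inf, zero} to the FLT_ROUNDS values {1, 3, 2, 0}:

#include <cassert>

int fltRoundsFromCW(unsigned CW) {
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}

int main() {
  assert(fltRoundsFromCW(0x000) == 1);   // RC=00 round-to-nearest -> 1
  assert(fltRoundsFromCW(0x400) == 3);   // RC=01 round-to--inf    -> 3
  assert(fltRoundsFromCW(0x800) == 2);   // RC=10 round-to-+inf    -> 2
  assert(fltRoundsFromCW(0xc00) == 0);   // RC=11 round-to-zero    -> 0
}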
-static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - - assert(VT.is256BitVector() && VT.isInteger() && - "Unsupported value type for operation"); - - unsigned NumElems = VT.getVectorNumElements(); - SDLoc dl(Op); - - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); - - // Extract the RHS vectors - SDValue RHS = Op.getOperand(1); - SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); - - MVT EltVT = VT.getVectorElementType(); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); -} - -/// Break a 512-bit integer operation into two new 256-bit ones and then -/// concatenate the result back. -static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - - assert(VT.is512BitVector() && VT.isInteger() && - "Unsupported value type for operation"); - - unsigned NumElems = VT.getVectorNumElements(); - SDLoc dl(Op); - - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); - - // Extract the RHS vectors - SDValue RHS = Op.getOperand(1); - SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); - - MVT EltVT = VT.getVectorElementType(); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); -} - static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -25747,10 +26410,13 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); } static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, @@ -25795,10 +26461,13 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, return SDValue(); } + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); } static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, @@ -25828,9 +26497,12 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, if (VT.is256BitVector() && !Subtarget.hasInt256()) { assert(VT.isInteger() && "Only handle AVX 256-bit vector integer operation"); - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); } + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntUnary(Op, DAG); + // Default to expand. 
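The splitVectorIntBinary recipe that replaces these helpers, pictured with AVX intrinsics: on an AVX1-only target a 256-bit integer add becomes two 128-bit adds plus a reassembling VINSERTF128 (a rough sketch, not the DAG code):

#include <immintrin.h>

__m256i add256OnAVX1(__m256i A, __m256i B) {
  __m128i Lo = _mm_add_epi32(_mm256_castsi256_si128(A),
                             _mm256_castsi256_si128(B));
  __m128i Hi = _mm_add_epi32(_mm256_extractf128_si256(A, 1),
                             _mm256_extractf128_si256(B, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}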
return SDValue(); } @@ -25840,7 +26512,10 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { // For AVX1 cases, split to use legal ops (everything but v4i64). if (VT.getScalarType() != MVT::i64 && VT.is256BitVector()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); @@ -25884,7 +26559,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntBinary(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); @@ -26030,7 +26708,10 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntBinary(Op, DAG); if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) { assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || @@ -26119,41 +26800,9 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); } - // For signed 512-bit vectors, split into 256-bit vectors to allow the - // sign-extension to occur. - if (VT == MVT::v64i8 && IsSigned) - return split512IntArith(Op, DAG); - - // Signed AVX2 implementation - extend xmm subvectors to ymm. - if (VT == MVT::v32i8 && IsSigned) { - MVT ExVT = MVT::v16i16; - SDValue ALo = extract128BitVector(A, 0, DAG, dl); - SDValue BLo = extract128BitVector(B, 0, DAG, dl); - SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl); - SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl); - ALo = DAG.getNode(ExAVX, dl, ExVT, ALo); - BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); - AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); - BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); - SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); - SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); - Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); - Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); - - // Bitcast back to VT and then pack all the even elements from Lo and Hi. - // Shuffle lowering should turn this into PACKUS+PERMQ - Lo = DAG.getBitcast(VT, Lo); - Hi = DAG.getBitcast(VT, Hi); - return DAG.getVectorShuffle(VT, dl, Lo, Hi, - { 0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30, - 32, 34, 36, 38, 40, 42, 44, 46, - 48, 50, 52, 54, 56, 58, 60, 62}); - } - - // For signed v16i8 and all unsigned vXi8 we will unpack the low and high - // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies, - // shift the results and pack the half lane results back together. + // For vXi8 we will unpack the low and high half of each 128 bit lane to widen + // to a vXi16 type. Do the multiplies, shift the results and pack the half + // lane results back together. 
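As a scalar picture of the widen/multiply/shift/pack scheme the rewritten comment above describes (illustrative only; MulHiU8 is a hypothetical name):

#include <cstdint>

// Per lane, the vector code widens i8 to i16, multiplies, and keeps the high
// byte; for the signed form the inputs are sign-extended instead.
static inline uint8_t MulHiU8(uint8_t A, uint8_t B) {
  return (uint8_t)(((uint16_t)A * (uint16_t)B) >> 8);
}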
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -26267,9 +26916,12 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, - MachinePointerInfo(), /* Alignment = */ 16); + MPI, /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; @@ -26410,7 +27062,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || - VT == MVT::v64i8) { + (Subtarget.hasBWI() && VT == MVT::v64i8)) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -26856,8 +27508,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI. if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) && - (VT == MVT::v16i8 || VT == MVT::v64i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256())) && + (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); @@ -26920,12 +27572,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, ISD::SETGT); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. + // On SSE41 targets we can use PBLENDVB which selects bytes based just + // on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, + DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true @@ -27035,14 +27688,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. + // On SSE41 targets we can use PBLENDVB which selects bytes based just on + // the sign bit. 
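PBLENDVB, which the code below now reaches directly through X86ISD::BLENDV rather than the generic VSELECT, picks each byte purely on the sign bit of the corresponding selector byte. One lane, modelled after the _mm_blendv_epi8(A, B, Mask) convention (illustrative; BlendVLane is a hypothetical name):

#include <cstdint>

// A result byte comes from B when the matching Mask byte has its sign bit
// set, and from A otherwise -- no other mask bits matter.
static inline uint8_t BlendVLane(uint8_t A, uint8_t B, uint8_t Mask) {
  return (Mask & 0x80) ? B : A;
}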
if (UseSSE41) { MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); - return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in @@ -27093,7 +27747,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit shifts into 128-bit shifts. if (VT.is256BitVector()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); return SDValue(); } @@ -27111,28 +27768,21 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, int NumElts = VT.getVectorNumElements(); // Check for constant splat rotation amount. - APInt UndefElts; - SmallVector<APInt, 32> EltBits; - int CstSplatIndex = -1; - if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) - for (int i = 0; i != NumElts; ++i) - if (!UndefElts[i]) { - if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) { - CstSplatIndex = i; - continue; - } - CstSplatIndex = -1; - break; - } + APInt CstSplatValue; + bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue); + + // Check for splat rotate by zero. + if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0) + return R; // AVX512 implicitly uses modulo rotation amounts. if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. - if (0 <= CstSplatIndex) { - unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); - uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); - return DAG.getNode(Op, DL, VT, R, - DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); + if (IsCstSplat) { + unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); + return DAG.getNode(RotOpc, DL, VT, R, + DAG.getTargetConstant(RotAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. @@ -27146,14 +27796,14 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { if (VT.is256BitVector()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. - if (0 <= CstSplatIndex) { - uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); + if (IsCstSplat) { + uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, - DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). @@ -27162,7 +27812,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // Split 256-bit integers on pre-AVX2 targets. 
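The rotate lowering above may reduce a constant splat amount with urem(EltSizeInBits) because rotation wraps modulo the element width, and a reduced amount of zero is simply the identity (hence the early `return R`). A 32-bit scalar model (illustrative; RotL32 is a hypothetical name):

#include <cstdint>

// Rotate left; amounts are taken modulo the width, and zero is the identity,
// mirroring the splat-amount handling above (hypothetical helper).
static inline uint32_t RotL32(uint32_t X, unsigned Amt) {
  Amt %= 32;
  return Amt == 0 ? X : (X << Amt) | (X >> (32 - Amt));
}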
if (VT.is256BitVector() && !Subtarget.hasAVX2()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && @@ -27170,7 +27820,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, "Only vXi32/vXi16/vXi8 vector rotates supported"); // Rotate by an uniform constant - expand back to shifts. - if (0 <= CstSplatIndex) + if (IsCstSplat) return SDValue(); bool IsSplatAmt = DAG.isSplatValue(Amt); @@ -27186,12 +27836,13 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. + // On SSE41 targets we can use PBLENDVB which selects bytes based just + // on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, + DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true @@ -27303,15 +27954,14 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { return false; } -// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? -// TODO: In 32-bit mode, use FISTP when X87 is available? bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { Type *MemType = SI->getValueOperand()->getType(); bool NoImplicitFloatOps = SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && - !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && + (Subtarget.hasSSE1() || Subtarget.hasX87())) return false; return needsCmpXchgNb(MemType); @@ -27330,7 +27980,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && !Subtarget.useSoftFloat() && !NoImplicitFloatOps && - (Subtarget.hasSSE2() || Subtarget.hasX87())) + (Subtarget.hasSSE1() || Subtarget.hasX87())) return AtomicExpansionKind::None; return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg @@ -27396,7 +28046,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { AI->use_empty()) return nullptr; - auto Builder = IRBuilder<>(AI); + IRBuilder<> Builder(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or @@ -27438,7 +28088,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // Finally we can emit the atomic load. 
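Two related relaxations land above: shouldExpandAtomicStoreInIR and shouldExpandAtomicLoadInIR now accept either SSE1 or X87 where they previously demanded SSE2, answering the TODO comments deleted in the same hunk -- on 32-bit targets a 64-bit atomic access can be carried out with MOVLPS (SSE1) or the X87 FILD/FISTP pair rather than falling back to a cmpxchg8b loop. The atomic load emitted just below then completes the idempotent-RMW rewrite: a read-modify-write that changes nothing is replaced, in effect, by ordering enforcement plus a plain atomic load of the unchanged value.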
LoadInst *Loaded = Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(), - AI->getType()->getPrimitiveSizeInBits()); + Align(AI->getType()->getPrimitiveSizeInBits())); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -27633,18 +28283,6 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } - // Custom splitting for BWI types when AVX512F is available but BWI isn't. - if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() && - DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) { - SDLoc dl(Op); - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); - MVT CastVT = DstVT.getHalfNumVectorElementsVT(); - Lo = DAG.getBitcast(CastVT, Lo); - Hi = DAG.getBitcast(CastVT, Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); - } - // Use MOVMSK for vector to scalar conversion to prevent scalarization. if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) { assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512"); @@ -27828,11 +28466,11 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) - return Lower512IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // For element types greater than i8, do vXi8 pop counts and a bytesum. if (VT.getScalarType() != MVT::i8) { @@ -27876,7 +28514,7 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); assert(VT.is128BitVector() && "Only 128-bit vector bitreverse lowering supported."); @@ -27913,12 +28551,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue In = Op.getOperand(0); SDLoc DL(Op); - // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB - // lowering. - if (VT == MVT::v8i64 || VT == MVT::v16i32) { - assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE"); - return Lower512IntUnary(Op, DAG); - } + // Split v64i8 without BWI so that we can still use the PSHUFB lowering. + if (VT == MVT::v64i8 && !Subtarget.hasBWI()) + return splitVectorIntUnary(Op, DAG); unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && @@ -27926,7 +28561,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // Perform BITREVERSE using PSHUFB lookups. Each byte is split into // two nibbles and a PSHUFB lookup to find the bitreverse of each @@ -28070,28 +28705,54 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, return Op; if (VT == MVT::i64 && !IsTypeLegal) { - // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled. - // FIXME: Use movlps with SSE1. - // FIXME: Use fist with X87. + // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE + // is enabled. 
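The PSHUFB scheme described above -- split each byte into two nibbles and look each one up in a 16-entry table -- has a direct scalar equivalent (illustrative only; ReverseByte is a hypothetical name):

#include <cstdint>

// Bit-reverse one byte: reverse each nibble through a 16-entry table, then
// swap the nibbles, the same shape the per-lane PSHUFB lookups take.
static inline uint8_t ReverseByte(uint8_t B) {
  static const uint8_t RevNib[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                     0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
  return (uint8_t)((RevNib[B & 0xF] << 4) | RevNib[B >> 4]);
}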
bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); - if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && - Subtarget.hasSSE2()) { - SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Node->getOperand(2)); - SDVTList Tys = DAG.getVTList(MVT::Other); - SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() }; - SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, - Ops, MVT::i64, - Node->getMemOperand()); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { + SDValue Chain; + if (Subtarget.hasSSE1()) { + SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Node->getOperand(2)); + MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32; + SclToVec = DAG.getBitcast(StVT, SclToVec); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()}; + Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, + MVT::i64, Node->getMemOperand()); + } else if (Subtarget.hasX87()) { + // First load this into an 80-bit X87 register using a stack temporary. + // This will put the whole integer into the significand. + SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + Chain = + DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr, + MPI, /*Align*/ 0, MachineMemOperand::MOStore); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue LdOps[] = {Chain, StackPtr}; + SDValue Value = + DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI, + /*Align*/ None, MachineMemOperand::MOLoad); + Chain = Value.getValue(1); + + // Now use an FIST to do the atomic store. + SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()}; + Chain = + DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other), + StoreOps, MVT::i64, Node->getMemOperand()); + } - // If this is a sequentially consistent store, also emit an appropriate - // barrier. - if (IsSeqCst) - Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); + if (Chain) { + // If this is a sequentially consistent store, also emit an appropriate + // barrier. + if (IsSeqCst) + Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); - return Chain; + return Chain; + } } } @@ -28120,9 +28781,8 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { // Set the carry flag. SDValue Carry = Op.getOperand(2); EVT CarryVT = Carry.getValueType(); - APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), - Carry, DAG.getConstant(NegOne, DL, CarryVT)); + Carry, DAG.getAllOnesConstant(DL, CarryVT)); unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB; SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0), @@ -28167,7 +28827,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? 
(Type *)StructType::get(ArgTy, ArgTy) - : (Type *)VectorType::get(ArgTy, 4); + : (Type *)FixedVectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) @@ -28264,17 +28924,15 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); - SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); + SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - return SDValue(NewScatter.getNode(), 1); + return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, + N->getMemoryVT(), N->getMemOperand()); } return SDValue(); } MVT IndexVT = Index.getSimpleValueType(); - MVT MaskVT = Mask.getSimpleValueType(); // If the index is v2i32, we're being called by type legalization and we // should just let the default handling take care of it. @@ -28292,18 +28950,17 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); - MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); Src = ExtendToType(Src, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } - SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - return SDValue(NewScatter.getNode(), 1); + return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, + N->getMemoryVT(), N->getMemOperand()); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, @@ -28329,8 +28986,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); // Emit a blend. 
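The blend emitted just below implements the masked-load contract lane by lane: the freshly loaded element where the mask bit is set, the pass-through element elsewhere (scalar model, illustrative; MaskedLoadLane is a hypothetical name):

// One lane of a masked load: the mask chooses between the loaded element and
// the corresponding pass-through element (hypothetical helper).
static inline int MaskedLoadLane(bool MaskBit, int Loaded, int PassThru) {
  return MaskBit ? Loaded : PassThru;
}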
- SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, - PassThru); + SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); } @@ -28366,10 +29022,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); - SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - NewLoad.getValue(0), - DAG.getIntPtrConstant(0, dl)); - SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + SDValue Extract = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Extract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } @@ -28427,7 +29083,6 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue Mask = N->getMask(); SDValue PassThru = N->getPassThru(); MVT IndexVT = Index.getSimpleValueType(); - MVT MaskVT = Mask.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); @@ -28448,7 +29103,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); - MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); PassThru = ExtendToType(PassThru, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); @@ -28457,12 +29112,12 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, N->getScale() }; - SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), + SDValue NewGather = DAG.getMemIntrinsicNode( + X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(), N->getMemOperand()); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather, DAG.getIntPtrConstant(0, dl)); - return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl); + return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl); } static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { @@ -28528,6 +29183,20 @@ SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, return Tmp.first; } +// Custom split CVTPS2PH with wide types. +static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + SDValue RC = Op.getOperand(1); + Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC); + Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); +} + /// Provide custom lowering hooks for some operations. 
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -28581,14 +29250,21 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::FP16_TO_FP: + case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG); + case ISD::FP_TO_FP16: + case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: case ISD::FSUB: return lowerFaddFsub(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); + case ISD::LRINT: + case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG); case ISD::SETCC: case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); @@ -28656,8 +29332,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); - case ISD::ADDRSPACECAST: - return LowerADDRSPACECAST(Op, DAG); + case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); + case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); } } @@ -28703,6 +29379,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, N->dump(&DAG); #endif llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::CVTPH2PS: { + EVT VT = N->getValueType(0); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo); + Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + return; + } + case X86ISD::STRICT_CVTPH2PS: { + EVT VT = N->getValueType(0); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other}, + {N->getOperand(0), Lo}); + Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other}, + {N->getOperand(0), Hi}); + SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Lo.getValue(1), Hi.getValue(1)); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + Results.push_back(Chain); + return; + } case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); // Use a v2i64 if possible. 
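The two custom splits above differ in one important respect: the strict variant threads a chain. Both halves consume the incoming chain, each produces its own output chain, and the two must be re-joined with a TokenFactor before being pushed alongside the concatenated value; otherwise one half's ordering dependences could be dropped. In outline (a sketch mirroring the STRICT_CVTPH2PS case above, not additional code from the patch):

// Lo = getNode(Opc, dl, {LoVT, MVT::Other}, {Chain, LoOp});
// Hi = getNode(Opc, dl, {HiVT, MVT::Other}, {Chain, HiOp});
// NewChain = getNode(ISD::TokenFactor, dl, MVT::Other,
//                    Lo.getValue(1), Hi.getValue(1));
// Result   = getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);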
@@ -28772,7 +29477,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::ABS: { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); MVT HalfT = MVT::i32; @@ -28785,15 +29489,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DAG.getConstant(1, dl, HalfT)); Tmp = DAG.getNode( ISD::SRA, dl, HalfT, Hi, - DAG.getConstant(HalfT.getSizeInBits() - 1, dl, - TLI.getShiftAmountTy(HalfT, DAG.getDataLayout()))); + DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl)); Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, SDValue(Lo.getNode(), 1)); Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); - Results.push_back(Lo); - Results.push_back(Hi); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi)); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. @@ -29145,6 +29847,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::LRINT: + case ISD::LLRINT: { + if (SDValue V = LRINT_LLRINTHelper(N, DAG)) + Results.push_back(V); + return; + } + case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: @@ -29182,14 +29891,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src); SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); for (int i = 0; i != 2; ++i) { - SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, SignSrc, DAG.getIntPtrConstant(i, dl)); if (IsStrict) SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, - {N->getOperand(0), Src}); + {N->getOperand(0), Elt}); else - SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src); + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt); }; SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts); SDValue Slow, Chain; @@ -29269,7 +29978,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V.getValue(1)); return; } - case ISD::FP_EXTEND: { + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && @@ -29391,15 +30101,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Attribute::NoImplicitFloat); if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { auto *Node = cast<AtomicSDNode>(N); - if (Subtarget.hasSSE2()) { - // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the - // lower 64-bits. - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + if (Subtarget.hasSSE1()) { + // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS. + // Then extract the lower 64-bits. + MVT LdVT = Subtarget.hasSSE2() ? 
MVT::v2i64 : MVT::v4f32; + SDVTList Tys = DAG.getVTList(LdVT, MVT::Other); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); - SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + if (Subtarget.hasSSE2()) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Ld.getValue(1)); + return; + } + // We use an alternative sequence for SSE1 that extracts as v2f32 and + // then casts to i64. This avoids a 128-bit stack temporary being + // created by type legalization if we were to cast v4f32->v2i64. + SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld, DAG.getIntPtrConstant(0, dl)); + Res = DAG.getBitcast(MVT::i64, Res); Results.push_back(Res); Results.push_back(Ld.getValue(1)); return; @@ -29407,14 +30129,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (Subtarget.hasX87()) { // First load this into an 80-bit X87 register. This will put the whole // integer into the significand. - // FIXME: Do we need to glue? See FIXME comment in BuildFILD. - SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; - SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG, + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); SDValue Chain = Result.getValue(1); - SDValue InFlag = Result.getValue(2); // Now store the X87 register to a stack temporary and convert to i64. // This store is not atomic and doesn't need to be. @@ -29424,11 +30144,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); - SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag }; - Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl, - DAG.getVTList(MVT::Other), StoreOps, - MVT::i64, MPI, 0 /*Align*/, - MachineMemOperand::MOStore); + SDValue StoreOps[] = { Chain, Result, StackPtr }; + Chain = DAG.getMemIntrinsicNode( + X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64, + MPI, None /*Align*/, MachineMemOperand::MOStore); // Finally load the value back from the stack temporary and return it. // This load is not atomic and doesn't need to be. @@ -29477,24 +30196,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - // Custom splitting for BWI types when AVX512F is available but BWI isn't. - if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) && - SrcVT.isVector() && isTypeLegal(SrcVT)) { - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); - MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8; - Lo = DAG.getBitcast(CastVT, Lo); - Hi = DAG.getBitcast(CastVT, Hi); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); - Results.push_back(Res); - return; - } - if (DstVT.isVector() && SrcVT == MVT::x86mmx) { + // FIXME: Use v4f32 for SSE1? 
+ assert(Subtarget.hasSSE2() && "Requires SSE2"); assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && "Unexpected type action!"); EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); - SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0)); + SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, + N->getOperand(0)); + Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); return; } @@ -29526,11 +30236,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl, - Gather->getMemoryVT(), Gather->getMemOperand()); + SDValue Res = DAG.getMemIntrinsicNode( + X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops, + Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); - Results.push_back(Res.getValue(2)); + Results.push_back(Res.getValue(1)); return; } return; @@ -29549,7 +30259,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (Subtarget.hasSSE2()) { MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); MVT VecVT = MVT::getVectorVT(LdVT, 2); @@ -29570,25 +30280,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::ADDRSPACECAST: { - SDValue Src = N->getOperand(0); - EVT DstVT = N->getValueType(0); - AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); - unsigned SrcAS = CastN->getSrcAddressSpace(); - - assert(SrcAS != CastN->getDestAddressSpace() && - "addrspacecast must be between different address spaces"); - - SDValue Res; - if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) - Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); - else if (DstVT == MVT::i64) - Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); - else if (DstVT == MVT::i32) - Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); - else - report_fatal_error("Unrecognized addrspacecast type legalization"); - - Results.push_back(Res); + SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG); + Results.push_back(V); return; } } @@ -29597,362 +30290,367 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { case X86ISD::FIRST_NUMBER: break; - case X86ISD::BSF: return "X86ISD::BSF"; - case X86ISD::BSR: return "X86ISD::BSR"; - case X86ISD::SHLD: return "X86ISD::SHLD"; - case X86ISD::SHRD: return "X86ISD::SHRD"; - case X86ISD::FAND: return "X86ISD::FAND"; - case X86ISD::FANDN: return "X86ISD::FANDN"; - case X86ISD::FOR: return "X86ISD::FOR"; - case X86ISD::FXOR: return "X86ISD::FXOR"; - case X86ISD::FILD: return "X86ISD::FILD"; - case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; - case X86ISD::FIST: return "X86ISD::FIST"; - case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM"; - case X86ISD::FLD: return "X86ISD::FLD"; - case X86ISD::FST: return "X86ISD::FST"; - case X86ISD::CALL: return "X86ISD::CALL"; - case X86ISD::BT: return "X86ISD::BT"; - case X86ISD::CMP: return "X86ISD::CMP"; - case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP"; - case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS"; - case X86ISD::COMI: 
return "X86ISD::COMI"; - case X86ISD::UCOMI: return "X86ISD::UCOMI"; - case X86ISD::CMPM: return "X86ISD::CMPM"; - case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM"; - case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; - case X86ISD::SETCC: return "X86ISD::SETCC"; - case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; - case X86ISD::FSETCC: return "X86ISD::FSETCC"; - case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; - case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE"; - case X86ISD::CMOV: return "X86ISD::CMOV"; - case X86ISD::BRCOND: return "X86ISD::BRCOND"; - case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; - case X86ISD::IRET: return "X86ISD::IRET"; - case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; - case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; - case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; - case X86ISD::Wrapper: return "X86ISD::Wrapper"; - case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; - case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ"; - case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; - case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; - case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; - case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; - case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; - case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; - case X86ISD::PINSRB: return "X86ISD::PINSRB"; - case X86ISD::PINSRW: return "X86ISD::PINSRW"; - case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; - case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::BLENDI: return "X86ISD::BLENDI"; - case X86ISD::BLENDV: return "X86ISD::BLENDV"; - case X86ISD::HADD: return "X86ISD::HADD"; - case X86ISD::HSUB: return "X86ISD::HSUB"; - case X86ISD::FHADD: return "X86ISD::FHADD"; - case X86ISD::FHSUB: return "X86ISD::FHSUB"; - case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; - case X86ISD::FMAX: return "X86ISD::FMAX"; - case X86ISD::FMAXS: return "X86ISD::FMAXS"; - case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE"; - case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE"; - case X86ISD::FMIN: return "X86ISD::FMIN"; - case X86ISD::FMINS: return "X86ISD::FMINS"; - case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE"; - case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE"; - case X86ISD::FMAXC: return "X86ISD::FMAXC"; - case X86ISD::FMINC: return "X86ISD::FMINC"; - case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; - case X86ISD::FRCP: return "X86ISD::FRCP"; - case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; - case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; - case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; - case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; - case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; - case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; - case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; - case X86ISD::EH_SJLJ_SETUP_DISPATCH: - return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; - case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; - case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; - case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; - case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; - case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; - case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; - case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; - case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: - return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; - case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: - return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; - case X86ISD::LADD: return "X86ISD::LADD"; - case X86ISD::LSUB: return "X86ISD::LSUB"; - case X86ISD::LOR: return 
"X86ISD::LOR"; - case X86ISD::LXOR: return "X86ISD::LXOR"; - case X86ISD::LAND: return "X86ISD::LAND"; - case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; - case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; - case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE"; - case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; - case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; - case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; - case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC"; - case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS"; - case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS"; - case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES"; - case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; - case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; - case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; - case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; - case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT"; - case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; - case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; - case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; - case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; - case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND"; - case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; - case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; - case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; - case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; - case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; - case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; - case X86ISD::VSHL: return "X86ISD::VSHL"; - case X86ISD::VSRL: return "X86ISD::VSRL"; - case X86ISD::VSRA: return "X86ISD::VSRA"; - case X86ISD::VSHLI: return "X86ISD::VSHLI"; - case X86ISD::VSRLI: return "X86ISD::VSRLI"; - case X86ISD::VSRAI: return "X86ISD::VSRAI"; - case X86ISD::VSHLV: return "X86ISD::VSHLV"; - case X86ISD::VSRLV: return "X86ISD::VSRLV"; - case X86ISD::VSRAV: return "X86ISD::VSRAV"; - case X86ISD::VROTLI: return "X86ISD::VROTLI"; - case X86ISD::VROTRI: return "X86ISD::VROTRI"; - case X86ISD::VPPERM: return "X86ISD::VPPERM"; - case X86ISD::CMPP: return "X86ISD::CMPP"; - case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP"; - case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; - case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; - case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; - case X86ISD::ADD: return "X86ISD::ADD"; - case X86ISD::SUB: return "X86ISD::SUB"; - case X86ISD::ADC: return "X86ISD::ADC"; - case X86ISD::SBB: return "X86ISD::SBB"; - case X86ISD::SMUL: return "X86ISD::SMUL"; - case X86ISD::UMUL: return "X86ISD::UMUL"; - case X86ISD::OR: return "X86ISD::OR"; - case X86ISD::XOR: return "X86ISD::XOR"; - case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::BEXTR: return "X86ISD::BEXTR"; - case X86ISD::BZHI: return "X86ISD::BZHI"; - case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; - case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; - case X86ISD::PTEST: return "X86ISD::PTEST"; - case X86ISD::TESTP: return "X86ISD::TESTP"; - case X86ISD::KORTEST: return "X86ISD::KORTEST"; - case X86ISD::KTEST: return "X86ISD::KTEST"; - case X86ISD::KADD: return "X86ISD::KADD"; - case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; - case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; - case X86ISD::PACKSS: return "X86ISD::PACKSS"; - case X86ISD::PACKUS: return "X86ISD::PACKUS"; - case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; - case X86ISD::VALIGN: return "X86ISD::VALIGN"; - case X86ISD::VSHLD: return "X86ISD::VSHLD"; - case X86ISD::VSHRD: return "X86ISD::VSHRD"; - case 
X86ISD::VSHLDV: return "X86ISD::VSHLDV"; - case X86ISD::VSHRDV: return "X86ISD::VSHRDV"; - case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; - case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; - case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; - case X86ISD::SHUFP: return "X86ISD::SHUFP"; - case X86ISD::SHUF128: return "X86ISD::SHUF128"; - case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; - case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; - case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; - case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; - case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; - case X86ISD::MOVSD: return "X86ISD::MOVSD"; - case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; - case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; - case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD"; - case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; - case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; - case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; - case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; - case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; - case X86ISD::VPERMV: return "X86ISD::VPERMV"; - case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; - case X86ISD::VPERMI: return "X86ISD::VPERMI"; - case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; - case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; - case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE"; - case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; - case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE"; - case X86ISD::VRANGE: return "X86ISD::VRANGE"; - case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE"; - case X86ISD::VRANGES: return "X86ISD::VRANGES"; - case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE"; - case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; - case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; - case X86ISD::PSADBW: return "X86ISD::PSADBW"; - case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; - case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; - case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; - case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; - case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; - case X86ISD::MFENCE: return "X86ISD::MFENCE"; - case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; - case X86ISD::SAHF: return "X86ISD::SAHF"; - case X86ISD::RDRAND: return "X86ISD::RDRAND"; - case X86ISD::RDSEED: return "X86ISD::RDSEED"; - case X86ISD::RDPKRU: return "X86ISD::RDPKRU"; - case X86ISD::WRPKRU: return "X86ISD::WRPKRU"; - case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; - case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; - case X86ISD::VPSHA: return "X86ISD::VPSHA"; - case X86ISD::VPSHL: return "X86ISD::VPSHL"; - case X86ISD::VPCOM: return "X86ISD::VPCOM"; - case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; - case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; - case X86ISD::FMSUB: return "X86ISD::FMSUB"; - case X86ISD::FNMADD: return "X86ISD::FNMADD"; - case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; - case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; - case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; - case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; - case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; - case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; - case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; - case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; - case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; - case 
X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; - case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; - case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; - case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE"; - case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; - case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; - case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE"; - case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; - case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE"; - case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; - case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE"; - case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; - case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE"; - case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; - case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE"; - case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; - case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; - case X86ISD::XTEST: return "X86ISD::XTEST"; - case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; - case X86ISD::EXPAND: return "X86ISD::EXPAND"; - case X86ISD::SELECTS: return "X86ISD::SELECTS"; - case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; - case X86ISD::RCP14: return "X86ISD::RCP14"; - case X86ISD::RCP14S: return "X86ISD::RCP14S"; - case X86ISD::RCP28: return "X86ISD::RCP28"; - case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE"; - case X86ISD::RCP28S: return "X86ISD::RCP28S"; - case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE"; - case X86ISD::EXP2: return "X86ISD::EXP2"; - case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE"; - case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; - case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; - case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; - case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE"; - case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; - case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE"; - case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; - case X86ISD::FADDS: return "X86ISD::FADDS"; - case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; - case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; - case X86ISD::FSUBS: return "X86ISD::FSUBS"; - case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; - case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; - case X86ISD::FMULS: return "X86ISD::FMULS"; - case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; - case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; - case X86ISD::FDIVS: return "X86ISD::FDIVS"; - case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; - case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; - case X86ISD::FSQRTS: return "X86ISD::FSQRTS"; - case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; - case X86ISD::FGETEXP: return "X86ISD::FGETEXP"; - case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE"; - case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS"; - case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE"; - case X86ISD::SCALEF: return "X86ISD::SCALEF"; - case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND"; - case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; - case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND"; - case X86ISD::AVG: return "X86ISD::AVG"; - case X86ISD::MULHRS: return "X86ISD::MULHRS"; - case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; - case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; - case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; - case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; - case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI"; - case X86ISD::STRICT_CVTTP2UI: return 
"X86ISD::STRICT_CVTTP2UI"; - case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; - case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; - case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; - case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE"; - case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI"; - case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI"; - case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE"; - case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; - case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; - case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; - case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P"; - case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P"; - case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; - case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; - case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; - case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; - case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; - case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP"; - case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; - case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP"; - case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; - case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; - case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH"; - case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; - case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE"; - case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; - case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; - case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI"; - case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI"; - case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; - case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; - case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI"; - case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; - case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; - case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; - case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16"; - case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16"; - case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16"; - case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS"; - case X86ISD::LWPINS: return "X86ISD::LWPINS"; - case X86ISD::MGATHER: return "X86ISD::MGATHER"; - case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; - case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD"; - case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS"; - case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD"; - case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS"; - case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB"; - case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB"; - case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB"; - case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB"; - case X86ISD::NT_CALL: return "X86ISD::NT_CALL"; - case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND"; - case X86ISD::UMWAIT: return "X86ISD::UMWAIT"; - case X86ISD::TPAUSE: return "X86ISD::TPAUSE"; - case X86ISD::ENQCMD: return "X86ISD:ENQCMD"; - case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS"; - case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT"; +#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE; + NODE_NAME_CASE(BSF) + NODE_NAME_CASE(BSR) + NODE_NAME_CASE(FSHL) + NODE_NAME_CASE(FSHR) + NODE_NAME_CASE(FAND) + NODE_NAME_CASE(FANDN) + NODE_NAME_CASE(FOR) + NODE_NAME_CASE(FXOR) + NODE_NAME_CASE(FILD) + NODE_NAME_CASE(FIST) + 
NODE_NAME_CASE(FP_TO_INT_IN_MEM) + NODE_NAME_CASE(FLD) + NODE_NAME_CASE(FST) + NODE_NAME_CASE(CALL) + NODE_NAME_CASE(BT) + NODE_NAME_CASE(CMP) + NODE_NAME_CASE(FCMP) + NODE_NAME_CASE(STRICT_FCMP) + NODE_NAME_CASE(STRICT_FCMPS) + NODE_NAME_CASE(COMI) + NODE_NAME_CASE(UCOMI) + NODE_NAME_CASE(CMPM) + NODE_NAME_CASE(STRICT_CMPM) + NODE_NAME_CASE(CMPM_SAE) + NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(SETCC_CARRY) + NODE_NAME_CASE(FSETCC) + NODE_NAME_CASE(FSETCCM) + NODE_NAME_CASE(FSETCCM_SAE) + NODE_NAME_CASE(CMOV) + NODE_NAME_CASE(BRCOND) + NODE_NAME_CASE(RET_FLAG) + NODE_NAME_CASE(IRET) + NODE_NAME_CASE(REP_STOS) + NODE_NAME_CASE(REP_MOVS) + NODE_NAME_CASE(GlobalBaseReg) + NODE_NAME_CASE(Wrapper) + NODE_NAME_CASE(WrapperRIP) + NODE_NAME_CASE(MOVQ2DQ) + NODE_NAME_CASE(MOVDQ2Q) + NODE_NAME_CASE(MMX_MOVD2W) + NODE_NAME_CASE(MMX_MOVW2D) + NODE_NAME_CASE(PEXTRB) + NODE_NAME_CASE(PEXTRW) + NODE_NAME_CASE(INSERTPS) + NODE_NAME_CASE(PINSRB) + NODE_NAME_CASE(PINSRW) + NODE_NAME_CASE(PSHUFB) + NODE_NAME_CASE(ANDNP) + NODE_NAME_CASE(BLENDI) + NODE_NAME_CASE(BLENDV) + NODE_NAME_CASE(HADD) + NODE_NAME_CASE(HSUB) + NODE_NAME_CASE(FHADD) + NODE_NAME_CASE(FHSUB) + NODE_NAME_CASE(CONFLICT) + NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(FMAXS) + NODE_NAME_CASE(FMAX_SAE) + NODE_NAME_CASE(FMAXS_SAE) + NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(FMINS) + NODE_NAME_CASE(FMIN_SAE) + NODE_NAME_CASE(FMINS_SAE) + NODE_NAME_CASE(FMAXC) + NODE_NAME_CASE(FMINC) + NODE_NAME_CASE(FRSQRT) + NODE_NAME_CASE(FRCP) + NODE_NAME_CASE(EXTRQI) + NODE_NAME_CASE(INSERTQI) + NODE_NAME_CASE(TLSADDR) + NODE_NAME_CASE(TLSBASEADDR) + NODE_NAME_CASE(TLSCALL) + NODE_NAME_CASE(EH_SJLJ_SETJMP) + NODE_NAME_CASE(EH_SJLJ_LONGJMP) + NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH) + NODE_NAME_CASE(EH_RETURN) + NODE_NAME_CASE(TC_RETURN) + NODE_NAME_CASE(FNSTCW16m) + NODE_NAME_CASE(LCMPXCHG_DAG) + NODE_NAME_CASE(LCMPXCHG8_DAG) + NODE_NAME_CASE(LCMPXCHG16_DAG) + NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG) + NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) + NODE_NAME_CASE(LADD) + NODE_NAME_CASE(LSUB) + NODE_NAME_CASE(LOR) + NODE_NAME_CASE(LXOR) + NODE_NAME_CASE(LAND) + NODE_NAME_CASE(VZEXT_MOVL) + NODE_NAME_CASE(VZEXT_LOAD) + NODE_NAME_CASE(VEXTRACT_STORE) + NODE_NAME_CASE(VTRUNC) + NODE_NAME_CASE(VTRUNCS) + NODE_NAME_CASE(VTRUNCUS) + NODE_NAME_CASE(VMTRUNC) + NODE_NAME_CASE(VMTRUNCS) + NODE_NAME_CASE(VMTRUNCUS) + NODE_NAME_CASE(VTRUNCSTORES) + NODE_NAME_CASE(VTRUNCSTOREUS) + NODE_NAME_CASE(VMTRUNCSTORES) + NODE_NAME_CASE(VMTRUNCSTOREUS) + NODE_NAME_CASE(VFPEXT) + NODE_NAME_CASE(STRICT_VFPEXT) + NODE_NAME_CASE(VFPEXT_SAE) + NODE_NAME_CASE(VFPEXTS) + NODE_NAME_CASE(VFPEXTS_SAE) + NODE_NAME_CASE(VFPROUND) + NODE_NAME_CASE(STRICT_VFPROUND) + NODE_NAME_CASE(VMFPROUND) + NODE_NAME_CASE(VFPROUND_RND) + NODE_NAME_CASE(VFPROUNDS) + NODE_NAME_CASE(VFPROUNDS_RND) + NODE_NAME_CASE(VSHLDQ) + NODE_NAME_CASE(VSRLDQ) + NODE_NAME_CASE(VSHL) + NODE_NAME_CASE(VSRL) + NODE_NAME_CASE(VSRA) + NODE_NAME_CASE(VSHLI) + NODE_NAME_CASE(VSRLI) + NODE_NAME_CASE(VSRAI) + NODE_NAME_CASE(VSHLV) + NODE_NAME_CASE(VSRLV) + NODE_NAME_CASE(VSRAV) + NODE_NAME_CASE(VROTLI) + NODE_NAME_CASE(VROTRI) + NODE_NAME_CASE(VPPERM) + NODE_NAME_CASE(CMPP) + NODE_NAME_CASE(STRICT_CMPP) + NODE_NAME_CASE(PCMPEQ) + NODE_NAME_CASE(PCMPGT) + NODE_NAME_CASE(PHMINPOS) + NODE_NAME_CASE(ADD) + NODE_NAME_CASE(SUB) + NODE_NAME_CASE(ADC) + NODE_NAME_CASE(SBB) + NODE_NAME_CASE(SMUL) + NODE_NAME_CASE(UMUL) + NODE_NAME_CASE(OR) + NODE_NAME_CASE(XOR) + NODE_NAME_CASE(AND) + NODE_NAME_CASE(BEXTR) + NODE_NAME_CASE(BZHI) + NODE_NAME_CASE(PDEP) + 
NODE_NAME_CASE(PEXT) + NODE_NAME_CASE(MUL_IMM) + NODE_NAME_CASE(MOVMSK) + NODE_NAME_CASE(PTEST) + NODE_NAME_CASE(TESTP) + NODE_NAME_CASE(KORTEST) + NODE_NAME_CASE(KTEST) + NODE_NAME_CASE(KADD) + NODE_NAME_CASE(KSHIFTL) + NODE_NAME_CASE(KSHIFTR) + NODE_NAME_CASE(PACKSS) + NODE_NAME_CASE(PACKUS) + NODE_NAME_CASE(PALIGNR) + NODE_NAME_CASE(VALIGN) + NODE_NAME_CASE(VSHLD) + NODE_NAME_CASE(VSHRD) + NODE_NAME_CASE(VSHLDV) + NODE_NAME_CASE(VSHRDV) + NODE_NAME_CASE(PSHUFD) + NODE_NAME_CASE(PSHUFHW) + NODE_NAME_CASE(PSHUFLW) + NODE_NAME_CASE(SHUFP) + NODE_NAME_CASE(SHUF128) + NODE_NAME_CASE(MOVLHPS) + NODE_NAME_CASE(MOVHLPS) + NODE_NAME_CASE(MOVDDUP) + NODE_NAME_CASE(MOVSHDUP) + NODE_NAME_CASE(MOVSLDUP) + NODE_NAME_CASE(MOVSD) + NODE_NAME_CASE(MOVSS) + NODE_NAME_CASE(UNPCKL) + NODE_NAME_CASE(UNPCKH) + NODE_NAME_CASE(VBROADCAST) + NODE_NAME_CASE(VBROADCAST_LOAD) + NODE_NAME_CASE(VBROADCASTM) + NODE_NAME_CASE(SUBV_BROADCAST) + NODE_NAME_CASE(VPERMILPV) + NODE_NAME_CASE(VPERMILPI) + NODE_NAME_CASE(VPERM2X128) + NODE_NAME_CASE(VPERMV) + NODE_NAME_CASE(VPERMV3) + NODE_NAME_CASE(VPERMI) + NODE_NAME_CASE(VPTERNLOG) + NODE_NAME_CASE(VFIXUPIMM) + NODE_NAME_CASE(VFIXUPIMM_SAE) + NODE_NAME_CASE(VFIXUPIMMS) + NODE_NAME_CASE(VFIXUPIMMS_SAE) + NODE_NAME_CASE(VRANGE) + NODE_NAME_CASE(VRANGE_SAE) + NODE_NAME_CASE(VRANGES) + NODE_NAME_CASE(VRANGES_SAE) + NODE_NAME_CASE(PMULUDQ) + NODE_NAME_CASE(PMULDQ) + NODE_NAME_CASE(PSADBW) + NODE_NAME_CASE(DBPSADBW) + NODE_NAME_CASE(VASTART_SAVE_XMM_REGS) + NODE_NAME_CASE(VAARG_64) + NODE_NAME_CASE(WIN_ALLOCA) + NODE_NAME_CASE(MEMBARRIER) + NODE_NAME_CASE(MFENCE) + NODE_NAME_CASE(SEG_ALLOCA) + NODE_NAME_CASE(PROBED_ALLOCA) + NODE_NAME_CASE(RDRAND) + NODE_NAME_CASE(RDSEED) + NODE_NAME_CASE(RDPKRU) + NODE_NAME_CASE(WRPKRU) + NODE_NAME_CASE(VPMADDUBSW) + NODE_NAME_CASE(VPMADDWD) + NODE_NAME_CASE(VPSHA) + NODE_NAME_CASE(VPSHL) + NODE_NAME_CASE(VPCOM) + NODE_NAME_CASE(VPCOMU) + NODE_NAME_CASE(VPERMIL2) + NODE_NAME_CASE(FMSUB) + NODE_NAME_CASE(STRICT_FMSUB) + NODE_NAME_CASE(FNMADD) + NODE_NAME_CASE(STRICT_FNMADD) + NODE_NAME_CASE(FNMSUB) + NODE_NAME_CASE(STRICT_FNMSUB) + NODE_NAME_CASE(FMADDSUB) + NODE_NAME_CASE(FMSUBADD) + NODE_NAME_CASE(FMADD_RND) + NODE_NAME_CASE(FNMADD_RND) + NODE_NAME_CASE(FMSUB_RND) + NODE_NAME_CASE(FNMSUB_RND) + NODE_NAME_CASE(FMADDSUB_RND) + NODE_NAME_CASE(FMSUBADD_RND) + NODE_NAME_CASE(VPMADD52H) + NODE_NAME_CASE(VPMADD52L) + NODE_NAME_CASE(VRNDSCALE) + NODE_NAME_CASE(STRICT_VRNDSCALE) + NODE_NAME_CASE(VRNDSCALE_SAE) + NODE_NAME_CASE(VRNDSCALES) + NODE_NAME_CASE(VRNDSCALES_SAE) + NODE_NAME_CASE(VREDUCE) + NODE_NAME_CASE(VREDUCE_SAE) + NODE_NAME_CASE(VREDUCES) + NODE_NAME_CASE(VREDUCES_SAE) + NODE_NAME_CASE(VGETMANT) + NODE_NAME_CASE(VGETMANT_SAE) + NODE_NAME_CASE(VGETMANTS) + NODE_NAME_CASE(VGETMANTS_SAE) + NODE_NAME_CASE(PCMPESTR) + NODE_NAME_CASE(PCMPISTR) + NODE_NAME_CASE(XTEST) + NODE_NAME_CASE(COMPRESS) + NODE_NAME_CASE(EXPAND) + NODE_NAME_CASE(SELECTS) + NODE_NAME_CASE(ADDSUB) + NODE_NAME_CASE(RCP14) + NODE_NAME_CASE(RCP14S) + NODE_NAME_CASE(RCP28) + NODE_NAME_CASE(RCP28_SAE) + NODE_NAME_CASE(RCP28S) + NODE_NAME_CASE(RCP28S_SAE) + NODE_NAME_CASE(EXP2) + NODE_NAME_CASE(EXP2_SAE) + NODE_NAME_CASE(RSQRT14) + NODE_NAME_CASE(RSQRT14S) + NODE_NAME_CASE(RSQRT28) + NODE_NAME_CASE(RSQRT28_SAE) + NODE_NAME_CASE(RSQRT28S) + NODE_NAME_CASE(RSQRT28S_SAE) + NODE_NAME_CASE(FADD_RND) + NODE_NAME_CASE(FADDS) + NODE_NAME_CASE(FADDS_RND) + NODE_NAME_CASE(FSUB_RND) + NODE_NAME_CASE(FSUBS) + NODE_NAME_CASE(FSUBS_RND) + NODE_NAME_CASE(FMUL_RND) + NODE_NAME_CASE(FMULS) + 
NODE_NAME_CASE(FMULS_RND) + NODE_NAME_CASE(FDIV_RND) + NODE_NAME_CASE(FDIVS) + NODE_NAME_CASE(FDIVS_RND) + NODE_NAME_CASE(FSQRT_RND) + NODE_NAME_CASE(FSQRTS) + NODE_NAME_CASE(FSQRTS_RND) + NODE_NAME_CASE(FGETEXP) + NODE_NAME_CASE(FGETEXP_SAE) + NODE_NAME_CASE(FGETEXPS) + NODE_NAME_CASE(FGETEXPS_SAE) + NODE_NAME_CASE(SCALEF) + NODE_NAME_CASE(SCALEF_RND) + NODE_NAME_CASE(SCALEFS) + NODE_NAME_CASE(SCALEFS_RND) + NODE_NAME_CASE(AVG) + NODE_NAME_CASE(MULHRS) + NODE_NAME_CASE(SINT_TO_FP_RND) + NODE_NAME_CASE(UINT_TO_FP_RND) + NODE_NAME_CASE(CVTTP2SI) + NODE_NAME_CASE(CVTTP2UI) + NODE_NAME_CASE(STRICT_CVTTP2SI) + NODE_NAME_CASE(STRICT_CVTTP2UI) + NODE_NAME_CASE(MCVTTP2SI) + NODE_NAME_CASE(MCVTTP2UI) + NODE_NAME_CASE(CVTTP2SI_SAE) + NODE_NAME_CASE(CVTTP2UI_SAE) + NODE_NAME_CASE(CVTTS2SI) + NODE_NAME_CASE(CVTTS2UI) + NODE_NAME_CASE(CVTTS2SI_SAE) + NODE_NAME_CASE(CVTTS2UI_SAE) + NODE_NAME_CASE(CVTSI2P) + NODE_NAME_CASE(CVTUI2P) + NODE_NAME_CASE(STRICT_CVTSI2P) + NODE_NAME_CASE(STRICT_CVTUI2P) + NODE_NAME_CASE(MCVTSI2P) + NODE_NAME_CASE(MCVTUI2P) + NODE_NAME_CASE(VFPCLASS) + NODE_NAME_CASE(VFPCLASSS) + NODE_NAME_CASE(MULTISHIFT) + NODE_NAME_CASE(SCALAR_SINT_TO_FP) + NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND) + NODE_NAME_CASE(SCALAR_UINT_TO_FP) + NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND) + NODE_NAME_CASE(CVTPS2PH) + NODE_NAME_CASE(STRICT_CVTPS2PH) + NODE_NAME_CASE(MCVTPS2PH) + NODE_NAME_CASE(CVTPH2PS) + NODE_NAME_CASE(STRICT_CVTPH2PS) + NODE_NAME_CASE(CVTPH2PS_SAE) + NODE_NAME_CASE(CVTP2SI) + NODE_NAME_CASE(CVTP2UI) + NODE_NAME_CASE(MCVTP2SI) + NODE_NAME_CASE(MCVTP2UI) + NODE_NAME_CASE(CVTP2SI_RND) + NODE_NAME_CASE(CVTP2UI_RND) + NODE_NAME_CASE(CVTS2SI) + NODE_NAME_CASE(CVTS2UI) + NODE_NAME_CASE(CVTS2SI_RND) + NODE_NAME_CASE(CVTS2UI_RND) + NODE_NAME_CASE(CVTNE2PS2BF16) + NODE_NAME_CASE(CVTNEPS2BF16) + NODE_NAME_CASE(MCVTNEPS2BF16) + NODE_NAME_CASE(DPBF16PS) + NODE_NAME_CASE(LWPINS) + NODE_NAME_CASE(MGATHER) + NODE_NAME_CASE(MSCATTER) + NODE_NAME_CASE(VPDPBUSD) + NODE_NAME_CASE(VPDPBUSDS) + NODE_NAME_CASE(VPDPWSSD) + NODE_NAME_CASE(VPDPWSSDS) + NODE_NAME_CASE(VPSHUFBITQMB) + NODE_NAME_CASE(GF2P8MULB) + NODE_NAME_CASE(GF2P8AFFINEQB) + NODE_NAME_CASE(GF2P8AFFINEINVQB) + NODE_NAME_CASE(NT_CALL) + NODE_NAME_CASE(NT_BRIND) + NODE_NAME_CASE(UMWAIT) + NODE_NAME_CASE(TPAUSE) + NODE_NAME_CASE(ENQCMD) + NODE_NAME_CASE(ENQCMDS) + NODE_NAME_CASE(VP2INTERSECT) } return nullptr; +#undef NODE_NAME_CASE } /// Return true if the addressing mode represented by AM is legal for this @@ -30018,7 +30716,8 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { return false; // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. - if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 && + // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. 
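The refactor above replaces several hundred hand-written `case X86ISD::FOO: return "X86ISD::FOO";` lines with a stringizing macro. For readers unfamiliar with the idiom, here is a minimal self-contained sketch of the same pattern, using a hypothetical Op enum rather than the real X86ISD opcodes:

  #include <cstdio>

  enum class Op { Add, Sub, Mul };

  static const char *opName(Op O) {
    switch (O) {
  // The # operator turns the macro argument into a string literal.
  #define NODE_NAME_CASE(NODE) case Op::NODE: return "Op::" #NODE;
    NODE_NAME_CASE(Add)
    NODE_NAME_CASE(Sub)
    NODE_NAME_CASE(Mul)
  #undef NODE_NAME_CASE
    }
    return nullptr;
  }

  int main() { std::printf("%s\n", opName(Op::Mul)); } // prints "Op::Mul"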
+ if (Subtarget.hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) return false; @@ -30104,7 +30803,7 @@ bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const { } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) + if (!VT1.isScalarInteger() || !VT2.isScalarInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); @@ -30145,6 +30844,39 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool X86TargetLowering::shouldSinkOperands(Instruction *I, + SmallVectorImpl<Use *> &Ops) const { + // A uniform shift amount in a vector shift or funnel shift may be much + // cheaper than a generic variable vector shift, so make that pattern visible + // to SDAG by sinking the shuffle instruction next to the shift. + int ShiftAmountOpNum = -1; + if (I->isShift()) + ShiftAmountOpNum = 1; + else if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::fshl || + II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmountOpNum = 2; + } + + if (ShiftAmountOpNum == -1) + return false; + + auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum)); + if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 && + isVectorShiftByScalarCheap(I->getType())) { + Ops.push_back(&I->getOperandUse(ShiftAmountOpNum)); + return true; + } + + return false; +} + +bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const { + if (!Subtarget.is64Bit()) + return false; + return TargetLowering::shouldConvertPhiType(From, To); +} + bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0))) return false; @@ -30188,7 +30920,7 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. -bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { +bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const { if (!VT.isSimple()) return false; @@ -30218,8 +30950,8 @@ bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask, } bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { - // If the subtarget is using retpolines, we need to not generate jump tables. - if (Subtarget.useRetpolineIndirectBranches()) + // If the subtarget is using thunks, we need to not generate jump tables. + if (Subtarget.useIndirectThunkBranches()) return false; // Otherwise, fallback on the generic logic. @@ -30333,7 +31065,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineOperand &Segment = MI.getOperand(5); unsigned ArgSize = MI.getOperand(6).getImm(); unsigned ArgMode = MI.getOperand(7).getImm(); - unsigned Align = MI.getOperand(8).getImm(); + Align Alignment = Align(MI.getOperand(8).getImm()); MachineFunction *MF = MBB->getParent(); @@ -30373,7 +31105,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; - bool NeedsAlign = (Align > 8); + bool NeedsAlign = (Alignment > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; @@ -30521,17 +31253,16 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // to OverflowDestReg. 
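The shouldSinkOperands hook introduced above is motivated by a cost cliff: a vector shift whose amount is a splatted scalar maps to a single shift-by-scalar-count instruction, while a genuinely per-lane variable shift is much more expensive on most x86 subtargets. A hedged illustration at the intrinsics level (plain SSE2 intrinsics, not LLVM APIs; the function name is made up):

  #include <immintrin.h>

  // All four i32 lanes are shifted by the same runtime amount: this lowers
  // to one PSLLD with the count held in an XMM register.
  __m128i shl_uniform(__m128i v, int amt) {
    return _mm_sll_epi32(v, _mm_cvtsi32_si128(amt));
  }

Sinking the splat next to the shift lets SelectionDAG, which works one basic block at a time, see this pattern even when the splat was originally hoisted into another block.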
if (NeedsAlign) { // Align the overflow address - assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); Register TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) - .addReg(OverflowAddrReg) - .addImm(Align-1); + .addReg(OverflowAddrReg) + .addImm(Alignment.value() - 1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) - .addReg(TmpReg) - .addImm(~(uint64_t)(Align-1)); + .addReg(TmpReg) + .addImm(~(uint64_t)(Alignment.value() - 1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); @@ -30627,7 +31358,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, - /*Size=*/16, /*Align=*/16); + /*Size=*/16, Align(16)); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) @@ -30694,11 +31425,13 @@ static bool isCMOVPseudo(MachineInstr &MI) { case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: + case X86::CMOV_VR64: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: + case X86::CMOV_VK1: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: @@ -30995,8 +31728,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; - ++NextMIIt; - NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end()); + NextMIIt = next_nodbg(NextMIIt, ThisMBB->end()); } } @@ -31068,6 +31800,112 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, return SinkMBB; } +static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); + DebugLoc DL = MI.getDebugLoc(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + + const unsigned ProbeSize = getStackProbeSize(*MF); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + MF->insert(MBBIter, testMBB); + MF->insert(MBBIter, blockMBB); + MF->insert(MBBIter, tailMBB); + + Register sizeVReg = MI.getOperand(1).getReg(); + + Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP; + + Register TmpStackPtr = MRI.createVirtualRegister( + TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); + Register FinalStackPtr = MRI.createVirtualRegister( + TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); + + BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr) + .addReg(physSPReg); + { + const unsigned Opc = TFI.Uses64BitFramePtr ? 
                                            X86::SUB64rr : X86::SUB32rr;
+    BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
+        .addReg(TmpStackPtr)
+        .addReg(sizeVReg);
+  }
+
+  // test rsp size
+
+  BuildMI(testMBB, DL,
+          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+      .addReg(FinalStackPtr)
+      .addReg(physSPReg);
+
+  BuildMI(testMBB, DL, TII->get(X86::JCC_1))
+      .addMBB(tailMBB)
+      .addImm(X86::COND_L);
+  testMBB->addSuccessor(blockMBB);
+  testMBB->addSuccessor(tailMBB);
+
+  // Touch the block then extend it. This is done on the opposite side of
+  // static probe where we allocate then touch, to avoid the need of probing the
+  // tail of the static alloca. Possible scenarios are:
+  //
+  //       + ---- <- ------------ <- ------------- <- ------------ +
+  //       |                                                       |
+  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
+  //                                                               |                                                                |
+  //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
+  //
+  // The property we want to enforce is to never have more than [page alloc] between two probes.
+
+  const unsigned MovMIOpc =
+      TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
+  addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
+      .addImm(0);
+
+  BuildMI(blockMBB, DL,
+          TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
+      .addReg(physSPReg)
+      .addImm(ProbeSize);
+
+
+  BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
+  blockMBB->addSuccessor(testMBB);
+
+  // Replace original instruction by the expected stack ptr
+  BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+      .addReg(FinalStackPtr);
+
+  tailMBB->splice(tailMBB->end(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(testMBB);
+
+  // Delete the original pseudo instruction.
+  MI.eraseFromParent();
+
+  // And we're done.
+  return tailMBB;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
@@ -31228,29 +32066,16 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
   BB->addSuccessor(RestoreMBB);
   MI.getOperand(0).setMBB(RestoreMBB);
+  // Marking this as an EH pad but not a funclet entry block causes PEI to
+  // restore stack pointers in the block.
+  RestoreMBB->setIsEHPad(true);
+
   auto RestoreMBBI = RestoreMBB->begin();
-  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
   return BB;
 }
 
 MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
-                                       MachineBasicBlock *BB) const {
-  MachineFunction *MF = BB->getParent();
-  const Constant *PerFn = MF->getFunction().getPersonalityFn();
-  bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
-  // Only 32-bit SEH requires special handling for catchpad.
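Returning to the probed-alloca lowering above: the three blocks it creates (testMBB, blockMBB, tailMBB) form a touch-then-extend loop. A plain-C++ model of the intended semantics, with hypothetical names standing in for the emitted CMP/JCC, MOV, SUB and JMP:

  #include <cstdint>

  // Models one MOV byte-store to the current stack page.
  inline void touch(uintptr_t addr) {
    *reinterpret_cast<volatile char *>(addr) = 0;
  }

  void probedAlloca(uintptr_t &sp, uint64_t size, uint64_t probeSize) {
    const uintptr_t finalSP = sp - size;  // FinalStackPtr = SP - sizeVReg
    while (finalSP < sp) {                // testMBB: stop once SP reaches it
      touch(sp);                          // blockMBB: touch the page first...
      sp -= probeSize;                    // ...then extend by one probe step
    }
    sp = finalSP;                         // tailMBB: expose the final SP
  }

Touching before extending means the gap between two consecutive probes never exceeds one probe interval, which is exactly the invariant the comment block above spells out.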
- if (IsSEH && Subtarget.is32Bit()) { - const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); - } - MI.eraseFromParent(); - return BB; -} - -MachineBasicBlock * X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: @@ -31342,22 +32167,22 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, return BB; } -static unsigned getOpcodeForRetpoline(unsigned RPOpc) { +static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) { switch (RPOpc) { - case X86::RETPOLINE_CALL32: + case X86::INDIRECT_THUNK_CALL32: return X86::CALLpcrel32; - case X86::RETPOLINE_CALL64: + case X86::INDIRECT_THUNK_CALL64: return X86::CALL64pcrel32; - case X86::RETPOLINE_TCRETURN32: + case X86::INDIRECT_THUNK_TCRETURN32: return X86::TCRETURNdi; - case X86::RETPOLINE_TCRETURN64: + case X86::INDIRECT_THUNK_TCRETURN64: return X86::TCRETURNdi64; } - llvm_unreachable("not retpoline opcode"); + llvm_unreachable("not indirect thunk opcode"); } -static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, - unsigned Reg) { +static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget, + unsigned Reg) { if (Subtarget.useRetpolineExternalThunk()) { // When using an external thunk for retpolines, we pick names that match the // names GCC happens to use as well. This helps simplify the implementation @@ -31389,39 +32214,48 @@ static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__x86_indirect_thunk_r11"; } + llvm_unreachable("unexpected reg for external indirect thunk"); + } + + if (Subtarget.useRetpolineIndirectCalls() || + Subtarget.useRetpolineIndirectBranches()) { + // When targeting an internal COMDAT thunk use an LLVM-specific name. + switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; + } llvm_unreachable("unexpected reg for retpoline"); } - // When targeting an internal COMDAT thunk use an LLVM-specific name. 
- switch (Reg) { - case X86::EAX: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_eax"; - case X86::ECX: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_ecx"; - case X86::EDX: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_edx"; - case X86::EDI: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_edi"; - case X86::R11: + if (Subtarget.useLVIControlFlowIntegrity()) { assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); - return "__llvm_retpoline_r11"; + return "__llvm_lvi_thunk_r11"; } - llvm_unreachable("unexpected reg for retpoline"); + llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature"); } MachineBasicBlock * -X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, - MachineBasicBlock *BB) const { +X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, + MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. DebugLoc DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); Register CalleeVReg = MI.getOperand(0).getReg(); - unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); + unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can // just use R11, but we scan for uses anyway to ensure we don't generate @@ -31455,7 +32289,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, report_fatal_error("calling convention incompatible with retpoline, no " "available registers"); - const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); + const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) .addReg(CalleeVReg); @@ -31743,12 +32577,17 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. - Register ZReg = MRI.createVirtualRegister(PtrRC); - unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; - BuildMI(checkSspMBB, DL, TII->get(XorRROpc)) - .addDef(ZReg) - .addReg(ZReg, RegState::Undef) - .addReg(ZReg, RegState::Undef); + Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg); + + if (PVT == MVT::i64) { + Register TmpZReg = MRI.createVirtualRegister(PtrRC); + BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg) + .addImm(0) + .addReg(ZReg) + .addImm(X86::sub_32bit); + ZReg = TmpZReg; + } // Read the current SSP Register value to the zeroed register. Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); @@ -31877,7 +32716,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; + Register FP = (PVT == MVT::i64) ? 
X86::RBP : X86::EBP; Register SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; @@ -32224,6 +33063,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); + auto TMMImmToTMMReg = [](unsigned Imm) { + assert (Imm < 8 && "Illegal tmm index"); + return X86::TMM0 + Imm; + }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: @@ -32231,18 +33074,19 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); - case X86::RETPOLINE_CALL32: - case X86::RETPOLINE_CALL64: - case X86::RETPOLINE_TCRETURN32: - case X86::RETPOLINE_TCRETURN64: - return EmitLoweredRetpoline(MI, BB); + case X86::INDIRECT_THUNK_CALL32: + case X86::INDIRECT_THUNK_CALL64: + case X86::INDIRECT_THUNK_TCRETURN32: + case X86::INDIRECT_THUNK_TCRETURN64: + return EmitLoweredIndirectThunk(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); - case X86::CATCHPAD: - return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); + case X86::PROBED_ALLOCA_32: + case X86::PROBED_ALLOCA_64: + return EmitLoweredProbedAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -32256,11 +33100,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: + case X86::CMOV_VR64: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: + case X86::CMOV_VK1: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: @@ -32315,7 +33161,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + int OrigCWFrameIdx = + MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); @@ -32336,7 +33183,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. 
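The FP80_TO_INT64_IN_MEM expansion in this hunk saves the x87 control word (FNSTCW), loads a copy with the rounding-control bits forced to round-toward-zero (FLDCW), converts, and restores. A rough portable analogue using the standard floating-point environment (illustrative only; strictly conforming code also needs #pragma STDC FENV_ACCESS ON):

  #include <cfenv>
  #include <cmath>

  long trunc_convert(double x) {
    const int saved = std::fegetround();  // like FNSTCW: stash the mode
    std::fesetround(FE_TOWARDZERO);       // like FLDCW with RC = truncate
    long result = std::lrint(x);          // conversion honors current mode
    std::fesetround(saved);               // restore the original control word
    return result;
  }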
- int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + int NewCWFrameIdx = + MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), NewCWFrameIdx) .addReg(NewCW16, RegState::Kill); @@ -32471,6 +33319,97 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB->addLiveIn(BasePtr); return BB; } + case TargetOpcode::PREALLOCATED_SETUP: { + assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); + auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setHasPreallocatedCall(true); + int64_t PreallocatedId = MI.getOperand(0).getImm(); + size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId); + assert(StackAdjustment != 0 && "0 stack adjustment"); + LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment " + << StackAdjustment << "\n"); + BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP) + .addReg(X86::ESP) + .addImm(StackAdjustment); + MI.eraseFromParent(); + return BB; + } + case TargetOpcode::PREALLOCATED_ARG: { + assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); + int64_t PreallocatedId = MI.getOperand(1).getImm(); + int64_t ArgIdx = MI.getOperand(2).getImm(); + auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; + LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx + << ", arg offset " << ArgOffset << "\n"); + // stack pointer + offset + addRegOffset( + BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()), + X86::ESP, false, ArgOffset); + MI.eraseFromParent(); + return BB; + } + case X86::PTDPBSSD: + case X86::PTDPBSUD: + case X86::PTDPBUSD: + case X86::PTDPBUUD: + case X86::PTDPBF16PS: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; + case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; + case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; + case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; + case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } + case X86::PTILEZERO: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Imm = MI.getOperand(0).getImm(); + BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; + } + case X86::PTILELOADD: + case X86::PTILELOADDT1: + case X86::PTILESTORED: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::PTILELOADD: Opc = X86::TILELOADD; break; + case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break; + case X86::PTILESTORED: Opc = X86::TILESTORED; break; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + unsigned CurOp = 0; + if (Opc != X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Define); + + MIB.add(MI.getOperand(CurOp++)); // base + MIB.add(MI.getOperand(CurOp++)); // scale + MIB.add(MI.getOperand(CurOp++)); // index -- stride + MIB.add(MI.getOperand(CurOp++)); // displacement + MIB.add(MI.getOperand(CurOp++)); // segment + + if (Opc == X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } } } @@ -32480,20 +33419,53 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, bool X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, - const APInt &Demanded, + const APInt &DemandedBits, + const APInt &DemandedElts, TargetLoweringOpt &TLO) const { - // Only optimize Ands to prevent shrinking a constant that could be - // matched by movzx. - if (Op.getOpcode() != ISD::AND) - return false; - EVT VT = Op.getValueType(); + unsigned Opcode = Op.getOpcode(); + unsigned EltSize = VT.getScalarSizeInBits(); - // Ignore vectors. - if (VT.isVector()) + if (VT.isVector()) { + // If the constant is only all signbits in the active bits, then we should + // extend it to the entire constant to allow it act as a boolean constant + // vector. + auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) { + if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode())) + return false; + for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) { + if (!DemandedElts[i] || V.getOperand(i).isUndef()) + continue; + const APInt &Val = V.getConstantOperandAPInt(i); + if (Val.getBitWidth() > Val.getNumSignBits() && + Val.trunc(ActiveBits).getNumSignBits() == ActiveBits) + return true; + } + return false; + }; + // For vectors - if we have a constant, then try to sign extend. + // TODO: Handle AND/ANDN cases. + unsigned ActiveBits = DemandedBits.getActiveBits(); + if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) && + (Opcode == ISD::OR || Opcode == ISD::XOR) && + NeedsSignExtension(Op.getOperand(1), ActiveBits)) { + EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits); + EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT, + VT.getVectorNumElements()); + SDValue NewC = + TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT, + Op.getOperand(1), TLO.DAG.getValueType(ExtVT)); + SDValue NewOp = + TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC); + return TLO.CombineTo(Op, NewOp); + } return false; + } - unsigned Size = VT.getSizeInBits(); + // Only optimize Ands to prevent shrinking a constant that could be + // matched by movzx. + if (Opcode != ISD::AND) + return false; // Make sure the RHS really is a constant. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); @@ -32503,7 +33475,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, const APInt &Mask = C->getAPIntValue(); // Clear all non-demanded bits initially. - APInt ShrunkMask = Mask & Demanded; + APInt ShrunkMask = Mask & DemandedBits; // Find the width of the shrunk mask. 
unsigned Width = ShrunkMask.getActiveBits(); @@ -32515,10 +33487,10 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, // Find the next power of 2 width, rounding up to a byte. Width = PowerOf2Ceil(std::max(Width, 8U)); // Truncate the width to size to handle illegal types. - Width = std::min(Width, Size); + Width = std::min(Width, EltSize); // Calculate a possible zero extend mask for this constant. - APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width); + APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width); // If we aren't changing the mask, just return true to keep it and prevent // the caller from optimizing. @@ -32527,7 +33499,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, // Make sure the new mask can be represented by a combination of mask bits // and non-demanded bits. - if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded)) + if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits)) return false; // Replace the constant with the zero extend mask. @@ -32543,6 +33515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); + unsigned NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || @@ -32570,7 +33543,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); - Known = Known.zextOrTrunc(BitWidth, false); + Known = Known.anyextOrTrunc(BitWidth); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } @@ -32640,10 +33613,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - // Output known-0 bits are only known if clear in both the LHS & RHS. - Known.Zero &= Known2.Zero; - // Output known-1 are known to be set if set in either the LHS | RHS. - Known.One |= Known2.One; + Known |= Known2; break; } case X86ISD::PSADBW: { @@ -32667,6 +33637,76 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero &= Known2.Zero; break; } + case X86ISD::BEXTR: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) { + unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); + unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); + + // If the length is 0, the result is 0. + if (Length == 0) { + Known.setAllZero(); + break; + } + + if ((Shift + Length) <= BitWidth) { + Known = DAG.computeKnownBits(Op0, Depth + 1); + Known = Known.extractBits(Length, Shift); + Known = Known.zextOrTrunc(BitWidth); + } + } + break; + } + case X86ISD::CVTSI2P: + case X86ISD::CVTUI2P: + case X86ISD::CVTP2SI: + case X86ISD::CVTP2UI: + case X86ISD::MCVTP2SI: + case X86ISD::MCVTP2UI: + case X86ISD::CVTTP2SI: + case X86ISD::CVTTP2UI: + case X86ISD::MCVTTP2SI: + case X86ISD::MCVTTP2UI: + case X86ISD::MCVTSI2P: + case X86ISD::MCVTUI2P: + case X86ISD::VFPROUND: + case X86ISD::VMFPROUND: + case X86ISD::CVTPS2PH: + case X86ISD::MCVTPS2PH: { + // Conversions - upper elements are known zero. 
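To make the targetShrinkDemandedConstant arithmetic above concrete, here is the same mask computation over plain 64-bit integers (C++20; hypothetical function name, and the real code operates on APInt and reports its result through TLO.CombineTo):

  #include <algorithm>
  #include <bit>
  #include <cstdint>

  // Returns a candidate replacement AND mask, or `mask` unchanged.
  uint64_t shrinkAndMask(uint64_t mask, uint64_t demanded, unsigned eltBits) {
    uint64_t shrunk = mask & demanded;               // clear non-demanded bits
    unsigned width = 64 - std::countl_zero(shrunk);  // active bit width
    if (width == 0)
      return mask;                                   // all-zero handled separately
    // Round up to a power-of-2 width of at least one byte, capped at the type.
    width = std::min<unsigned>(std::bit_ceil(std::max(width, 8u)), eltBits);
    uint64_t zext = width >= 64 ? ~0ULL : ((1ULL << width) - 1);
    // Legal only if zext differs from mask solely in non-demanded bits,
    // i.e. ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits).
    return (zext & ~(mask | ~demanded)) == 0 ? zext : mask;
  }

For example, mask 0x3FF with only the low 6 bits demanded shrinks to 0xFF, a constant that MOVZX-style zero extension can match.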
+ EVT SrcVT = Op.getOperand(0).getValueType(); + if (SrcVT.isVector()) { + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + if (NumElts > NumSrcElts && + DemandedElts.countTrailingZeros() >= NumSrcElts) + Known.setAllZero(); + } + break; + } + case X86ISD::STRICT_CVTTP2SI: + case X86ISD::STRICT_CVTTP2UI: + case X86ISD::STRICT_CVTSI2P: + case X86ISD::STRICT_CVTUI2P: + case X86ISD::STRICT_VFPROUND: + case X86ISD::STRICT_CVTPS2PH: { + // Strict Conversions - upper elements are known zero. + EVT SrcVT = Op.getOperand(1).getValueType(); + if (SrcVT.isVector()) { + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + if (NumElts > NumSrcElts && + DemandedElts.countTrailingZeros() >= NumSrcElts) + Known.setAllZero(); + } + break; + } + case X86ISD::MOVQ2DQ: { + // Move from MMX to XMM. Upper half of XMM should be 0. + if (DemandedElts.countTrailingZeros() >= (NumElts / 2)) + Known.setAllZero(); + break; + } } // Handle target shuffles. @@ -32733,11 +33773,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( return VTBits; case X86ISD::VTRUNC: { - // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); - unsigned NumSrcBits = Src.getScalarValueSizeInBits(); + MVT SrcVT = Src.getSimpleValueType(); + unsigned NumSrcBits = SrcVT.getScalarSizeInBits(); assert(VTBits < NumSrcBits && "Illegal truncation input type"); - unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); + unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1); if (Tmp > (NumSrcBits - VTBits)) return Tmp - (NumSrcBits - VTBits); return 1; @@ -32865,6 +33906,21 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const { return N; } +// Helper to look for a normal load that can be narrowed into a vzload with the +// specified VT and memory VT. Returns SDValue() on failure. +static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, + SelectionDAG &DAG) { + // Can't if the load is volatile or atomic. + if (!LN->isSimple()) + return SDValue(); + + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT, + LN->getPointerInfo(), LN->getOriginalAlign(), + LN->getMemOperand()->getFlags()); +} + // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. // TODO: Investigate sharing more of this with shuffle lowering. @@ -33009,9 +34065,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); - - bool ContainsZeros = - llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + bool ContainsZeros = isAnyZero(Mask); // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. if (!ContainsZeros && MaskScalarSizeInBits == 64) { @@ -33059,7 +34113,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, // Narrow the repeated mask to create 32-bit element permutes. SmallVector<int, 4> WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) - scaleShuffleMask<int>(2, RepeatedMask, WordMask); + narrowShuffleMaskElts(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? 
MVT::i32 : MVT::f32); @@ -33102,17 +34156,32 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to match against byte/bit shifts. - // FIXME: Add 512-bit support. - if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + if (AllowIntDomain && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); - if (0 < ShiftAmt) { + if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() || + 32 <= ShuffleVT.getScalarSizeInBits())) { PermuteImm = (unsigned)ShiftAmt; return true; } } + // Attempt to match against bit rotates. + if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 && + ((MaskVT.is128BitVector() && Subtarget.hasXOP()) || + Subtarget.hasAVX512())) { + int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits, + Subtarget, Mask); + if (0 < RotateAmt) { + Shuffle = X86ISD::VROTLI; + PermuteImm = (unsigned)RotateAmt; + return true; + } + } + return false; } @@ -33193,9 +34262,29 @@ static bool matchBinaryPermuteShuffle( unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); + // Attempt to match against VALIGND/VALIGNQ rotate. + if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) && + ((MaskVT.is128BitVector() && Subtarget.hasVLX()) || + (MaskVT.is256BitVector() && Subtarget.hasVLX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { + if (!isAnyZero(Mask)) { + int Rotation = matchShuffleAsElementRotate(V1, V2, Mask); + if (0 < Rotation) { + Shuffle = X86ISD::VALIGN; + if (EltSizeInBits == 64) + ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64); + else + ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32); + PermuteImm = Rotation; + return true; + } + } + } + // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; @@ -33245,8 +34334,7 @@ static bool matchBinaryPermuteShuffle( // Attempt to combine to INSERTPS, but only if it has elements that need to // be set to zero. 
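Several hunks here switch from scaleShuffleMask to narrowShuffleMaskElts; the operation itself is simple, and a plain-vector sketch may help (hypothetical name; this is only a model of the semantics):

  #include <vector>

  // Each wide mask element expands into `scale` consecutive narrow elements;
  // negative sentinels (undef/zero) are replicated unchanged.
  std::vector<int> narrowMaskElts(int scale, const std::vector<int> &mask) {
    std::vector<int> narrowed;
    for (int m : mask)
      for (int i = 0; i < scale; ++i)
        narrowed.push_back(m < 0 ? m : m * scale + i);
    return narrowed;
  }

  // narrowMaskElts(2, {1, 0}) == {2, 3, 0, 1}: a v2i64 element swap expressed
  // as the equivalent v4i32 shuffle.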
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && - MaskVT.is128BitVector() && - llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) && + MaskVT.is128BitVector() && isAnyZero(Mask) && matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; @@ -33374,6 +34462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, V1); } + bool OptForSize = DAG.shouldOptForSize(); unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; @@ -33384,11 +34473,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks // from being reused. - // TODO - this currently prevents all lane shuffles from occurring. - // TODO - check for writemasks usage instead of always preventing combining. - // TODO - attempt to narrow Mask back to writemask size. - bool IsEVEXShuffle = - RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); + bool IsMaskedShuffle = false; + if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) { + if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT && + Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) { + IsMaskedShuffle = true; + } + } + + // If we are shuffling a broadcast (and not introducing zeros) then + // we can just use the broadcast directly. This works for smaller broadcast + // elements as well as they already repeat across each mask element + if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) && + (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) { + return DAG.getBitcast(RootVT, V1); + } // Attempt to match a subvector broadcast. // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0) @@ -33408,27 +34507,138 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. + // Handle 128/256-bit lane shuffles of 512-bit vectors. + if (RootVT.is512BitVector() && + (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) { + MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); + + // If the upper subvectors are zeroable, then an extract+insert is more + // optimal than using X86ISD::SHUF128. The insertion is free, even if it has + // to zero the upper subvectors. + if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) { + if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) + return SDValue(); // Nothing to do! + assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) && + "Unexpected lane shuffle"); + Res = DAG.getBitcast(ShuffleVT, V1); + unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts); + bool UseZero = isAnyZero(BaseMask); + Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits); + Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits); + return DAG.getBitcast(RootVT, Res); + } + + // Narrow shuffle mask to v4x128. + SmallVector<int, 4> Mask; + assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size"); + narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask); + + // Try to lower to vshuf64x2/vshuf32x4. 
+ auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + unsigned PermMask = 0; + // Insure elements came from the same Op. + SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)}; + for (int i = 0; i < 4; ++i) { + assert(Mask[i] >= -1 && "Illegal shuffle sentinel value"); + if (Mask[i] < 0) + continue; + + SDValue Op = Mask[i] >= 4 ? V2 : V1; + unsigned OpIndex = i / 2; + if (Ops[OpIndex].isUndef()) + Ops[OpIndex] = Op; + else if (Ops[OpIndex] != Op) + return SDValue(); + + // Convert the 128-bit shuffle mask selection values into 128-bit + // selection bits defined by a vshuf64x2 instruction's immediate control + // byte. + PermMask |= (Mask[i] % 4) << (i * 2); + } + + return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, + DAG.getBitcast(ShuffleVT, Ops[0]), + DAG.getBitcast(ShuffleVT, Ops[1]), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); + }; + + // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask + // doesn't work because our mask is for 128 bits and we don't have an MVT + // to match that. + bool PreferPERMQ = + UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) && + isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) && + isUndefOrInRange(Mask[3], 2, 4) && + (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) && + (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2)); + + if (!isAnyZero(Mask) && !PreferPERMQ) { + if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG)) + return DAG.getBitcast(RootVT, V); + } + } // Handle 128-bit lane shuffles of 256-bit vectors. - // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless - // we need to use the zeroing feature. - // TODO - this should support binary shuffles. - if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && - !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && - !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { + if (RootVT.is256BitVector() && NumBaseMaskElts == 2) { + MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); + + // If the upper half is zeroable, then an extract+insert is more optimal + // than using X86ISD::VPERM2X128. The insertion is free, even if it has to + // zero the upper half. + if (isUndefOrZero(BaseMask[1])) { + if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) + return SDValue(); // Nothing to do! + assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle"); + Res = DAG.getBitcast(ShuffleVT, V1); + Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL); + Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG, + DL, 256); + return DAG.getBitcast(RootVT, Res); + } + if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! - MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); - unsigned PermMask = 0; - PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); - PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); - - Res = DAG.getBitcast(ShuffleVT, V1); - Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, - DAG.getUNDEF(ShuffleVT), - DAG.getTargetConstant(PermMask, DL, MVT::i8)); - return DAG.getBitcast(RootVT, Res); + + // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless + // we need to use the zeroing feature. + // Prefer blends for sequential shuffles unless we are optimizing for size. 
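The MatchSHUF128 lambda above builds the vshuf64x2/vshuf32x4 control byte by packing one 2-bit lane selector per 128-bit destination lane. A worked standalone version (hypothetical helper; the operand-selection bookkeeping is omitted):

  #include <cassert>

  // mask[i] in [0,8): 0-3 pick a 128-bit lane of Op0, 4-7 a lane of Op1.
  unsigned encodeShuf128Imm(const int (&mask)[4]) {
    unsigned imm = 0;
    for (int i = 0; i < 4; ++i) {
      assert(mask[i] >= 0 && mask[i] < 8 && "expects resolved lane indices");
      imm |= (mask[i] % 4) << (i * 2);  // two selector bits per dest lane
    }
    return imm;
  }

  // encodeShuf128Imm({0, 1, 5, 7}) == 0xD4

Only the lane index within the chosen source goes into the immediate; which source feeds each half of the result is fixed by the instruction's two register operands, which is why the lambda tracks Ops[] separately.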
+ if (UnaryShuffle && + !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) && + (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) { + unsigned PermMask = 0; + PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); + PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); + + Res = DAG.getBitcast(ShuffleVT, V1); + Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, + DAG.getUNDEF(ShuffleVT), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + + if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) + return SDValue(); // Nothing to do! + + // TODO - handle AVX512VL cases with X86ISD::SHUF128. + if (!UnaryShuffle && !IsMaskedShuffle) { + assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) && + "Unexpected shuffle sentinel value"); + // Prefer blends to X86ISD::VPERM2X128. + if (!((BaseMask[0] == 0 && BaseMask[1] == 3) || + (BaseMask[0] == 2 && BaseMask[1] == 1))) { + unsigned PermMask = 0; + PermMask |= ((BaseMask[0] & 3) << 0); + PermMask |= ((BaseMask[1] & 3) << 4); + + Res = DAG.getNode( + X86ISD::VPERM2X128, DL, ShuffleVT, + DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2), + DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + } } // For masks that have been widened to 128-bit elements or more, @@ -33437,9 +34647,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; - scaleShuffleMask<int>(MaskScale, BaseMask, Mask); + narrowShuffleMaskElts(MaskScale, BaseMask, Mask); } else { - Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end()); + Mask.assign(BaseMask.begin(), BaseMask.end()); + } + + // For masked shuffles, we're trying to match the root width for better + // writemask folding, attempt to scale the mask. + // TODO - variable shuffles might need this to be widened again. + if (IsMaskedShuffle && NumRootElts > Mask.size()) { + assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size"); + int MaskScale = NumRootElts / Mask.size(); + SmallVector<int, 64> ScaledMask; + narrowShuffleMaskElts(MaskScale, Mask, ScaledMask); + Mask = std::move(ScaledMask); } unsigned NumMaskElts = Mask.size(); @@ -33472,26 +34693,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, APInt Zeroable = KnownUndef | KnownZero; if (UnaryShuffle) { - // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load - // directly if we don't shuffle the lower element and we shuffle the upper - // (zero) elements within themselves. - if (V1.getOpcode() == X86ISD::VZEXT_LOAD && - (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() % - MaskEltSizeInBits) == 0) { - unsigned Scale = - cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() / - MaskEltSizeInBits; - ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale); - if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && - isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { - return DAG.getBitcast(RootVT, V1); - } - } - // Attempt to match against broadcast-from-vector. // Limit AVX1 to cases where we're loading+broadcasting a scalar element. 
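For the 256-bit VPERM2X128 paths above, the immediate uses one nibble per destination lane: bits 0-1 select a source 128-bit lane, and bit 3 (0x8) zeroes the lane outright, which is how the negative (zeroable) mask entries are encoded. A simplified sketch of the binary-path encoding (hypothetical helper; source-operand selection omitted):

  // lane < 0 encodes "zero this 128-bit destination lane".
  unsigned encodeVperm2x128Imm(int lane0, int lane1) {
    unsigned imm = 0;
    imm |= (lane0 < 0 ? 0x8u : unsigned(lane0 & 3)) << 0;
    imm |= (lane1 < 0 ? 0x8u : unsigned(lane1 & 3)) << 4;
    return imm;
  }

  // encodeVperm2x128Imm(1, -1) == 0x81: low half from lane 1, high half zeroed.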
- if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) - && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) { + if ((Subtarget.hasAVX2() || + (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) && + (!IsMaskedShuffle || NumRootElts == NumMaskElts)) { SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { if (V1.getValueType() == MaskVT && @@ -33517,7 +34723,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || + (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, NewV1); @@ -33528,7 +34735,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || + (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); @@ -33538,12 +34746,31 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } + // Attempt to combine to INSERTPS, but only if the inserted element has come + // from a scalar. + // TODO: Handle other insertions here as well? + if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 && + MaskEltSizeInBits == 32 && Subtarget.hasSSE41() && + !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) { + SDValue SrcV1 = V1, SrcV2 = V2; + if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) && + SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { + if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) + return SDValue(); // Nothing to do! + Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, + DAG.getBitcast(MVT::v4f32, SrcV1), + DAG.getBitcast(MVT::v4f32, SrcV2), + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + } + SDValue NewV1 = V1; // Save operands in case early exit happens. SDValue NewV2 = V2; if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); @@ -33554,10 +34781,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, NewV1 = V1; // Save operands in case early exit happens. 
NewV2 = V2; - if (matchBinaryPermuteShuffle( - MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, - NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, + AllowIntDomain, NewV1, NewV2, DL, DAG, + Subtarget, Shuffle, ShuffleVT, PermuteImm) && + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); @@ -33597,6 +34824,44 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } + // Match shuffle against TRUNCATE patterns. + if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) { + // Match against a VTRUNC instruction, accounting for src/dst sizes. + if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable, + Subtarget)) { + bool IsTRUNCATE = ShuffleVT.getVectorNumElements() == + ShuffleSrcVT.getVectorNumElements(); + unsigned Opc = + IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC; + if (Depth == 0 && Root.getOpcode() == Opc) + return SDValue(); // Nothing to do! + V1 = DAG.getBitcast(ShuffleSrcVT, V1); + Res = DAG.getNode(Opc, DL, ShuffleVT, V1); + if (ShuffleVT.getSizeInBits() < RootSizeInBits) + Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits); + return DAG.getBitcast(RootVT, Res); + } + + // Do we need a more general binary truncation pattern? + if (RootSizeInBits < 512 && + ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) || + (RootVT.is128BitVector() && Subtarget.hasVLX())) && + (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && + isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { + if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE) + return SDValue(); // Nothing to do! + ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); + ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); + V1 = DAG.getBitcast(ShuffleSrcVT, V1); + V2 = DAG.getBitcast(ShuffleSrcVT, V2); + ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); + ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts); + Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2); + Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + } + // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 1) @@ -33606,8 +34871,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; - bool MaskContainsZeros = - any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + bool MaskContainsZeros = isAnyZero(Mask); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. @@ -33702,7 +34966,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); Res = DAG.getBitcast(MaskVT, V1); unsigned AndOpcode = - FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); + MaskVT.isFloatingPoint() ? 
unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); return DAG.getBitcast(RootVT, Res); } @@ -33779,7 +35043,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, continue; } if (M == SM_SentinelZero) { - PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); + PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; @@ -33810,7 +35074,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, continue; } if (M == SM_SentinelZero) { - VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); + VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; @@ -33885,8 +35149,7 @@ static SDValue combineX86ShuffleChainWithExtract( unsigned &Offset = Offsets[i]; Src = peekThroughBitcasts(Src); EVT BaseVT = Src.getValueType(); - while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR && - isa<ConstantSDNode>(Src.getOperand(1))) { + while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) { Offset += Src.getConstantOperandVal(1); Src = Src.getOperand(0); } @@ -33998,6 +35261,7 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops, return SDValue(); // Shuffle the constant bits according to the mask. + SDLoc DL(Root); APInt UndefElts(NumMaskElts, 0); APInt ZeroElts(NumMaskElts, 0); APInt ConstantElts(NumMaskElts, 0); @@ -34035,6 +35299,10 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops, } assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); + // Attempt to create a zero vector. + if ((UndefElts | ZeroElts).isAllOnesValue()) + return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL); + // Create the constant data. MVT MaskSVT; if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) @@ -34043,8 +35311,9 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops, MaskSVT = MVT::getIntegerVT(MaskSizeInBits); MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); + if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) + return SDValue(); - SDLoc DL(Root); SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); return DAG.getBitcast(VT, CstOp); } @@ -34103,7 +35372,8 @@ static SDValue combineX86ShufflesRecursively( assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); - assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && + unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); + assert(VT.getSizeInBits() == RootSizeInBits && "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. @@ -34117,6 +35387,18 @@ static SDValue combineX86ShufflesRecursively( OpZero, DAG, Depth, false)) return SDValue(); + // Shuffle inputs must be the same size as the result, bail on any larger + // inputs and widen any smaller inputs. 
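+ // (e.g. under a 256-bit root a 128-bit operand is widened with undef upper
+ // elements before recursing, while a 512-bit operand abandons the combine.)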
+ if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) { + return Op.getValueSizeInBits() > RootSizeInBits; + })) + return SDValue(); + + for (SDValue &Op : OpInputs) + if (Op.getValueSizeInBits() < RootSizeInBits) + Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG, + SDLoc(Op), RootSizeInBits); + SmallVector<int, 64> Mask; SmallVector<SDValue, 16> Ops; @@ -34517,6 +35799,59 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, return V; } +// Attempt to commute shufps LHS loads: +// permilps(shufps(load(),x)) --> permilps(shufps(x,load())) +static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, + SelectionDAG &DAG) { + // TODO: Add vXf64 support. + if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32) + return SDValue(); + + // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not. + auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) { + if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode())) + return SDValue(); + SDValue N0 = V.getOperand(0); + SDValue N1 = V.getOperand(1); + unsigned Imm = V.getConstantOperandVal(2); + if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) || + MayFoldLoad(peekThroughOneUseBitcasts(N1))) + return SDValue(); + Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4); + return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0, + DAG.getTargetConstant(Imm, DL, MVT::i8)); + }; + + switch (N.getOpcode()) { + case X86ISD::VPERMILPI: + if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) { + unsigned Imm = N.getConstantOperandVal(1); + return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP, + DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8)); + } + break; + case X86ISD::SHUFP: { + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + unsigned Imm = N.getConstantOperandVal(2); + if (N0 == N1) { + if (SDValue NewSHUFP = commuteSHUFP(N, N0)) + return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP, + DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8)); + } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) { + return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1, + DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8)); + } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) { + return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP, + DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8)); + } + break; + } + } + + return SDValue(); +} + /// Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -34526,35 +35861,105 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SmallVector<int, 4> Mask; unsigned Opcode = N.getOpcode(); + bool IsUnary; + SmallVector<int, 64> TargetMask; + SmallVector<SDValue, 2> TargetOps; + if (isTargetShuffle(Opcode)) + getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary); + // Combine binary shuffle of 2 similar 'Horizontal' instructions into a - // single instruction. 
- if (VT.getScalarSizeInBits() == 64 &&
- (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
- Opcode == X86ISD::UNPCKL)) {
- auto BC0 = peekThroughBitcasts(N.getOperand(0));
- auto BC1 = peekThroughBitcasts(N.getOperand(1));
- EVT VT0 = BC0.getValueType();
- EVT VT1 = BC1.getValueType();
- unsigned Opcode0 = BC0.getOpcode();
- unsigned Opcode1 = BC1.getOpcode();
- if (Opcode0 == Opcode1 && VT0 == VT1 &&
- (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
- Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
- SDValue Lo, Hi;
- if (Opcode == X86ISD::MOVSD) {
- Lo = BC1.getOperand(0);
- Hi = BC0.getOperand(1);
- } else {
- Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
- Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ SmallVector<int, 16> TargetMask128;
+ if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
+ isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
+ SmallVector<int, 16> WidenedMask128 = TargetMask128;
+ while (WidenedMask128.size() > 2) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
+ break;
+ WidenedMask128 = std::move(WidenedMask);
+ }
+ if (WidenedMask128.size() == 2) {
+ assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
+ SDValue BC0 = peekThroughBitcasts(TargetOps.front());
+ SDValue BC1 = peekThroughBitcasts(TargetOps.back());
+ EVT VT0 = BC0.getValueType();
+ EVT VT1 = BC1.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ unsigned Opcode1 = BC1.getOpcode();
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ if (Opcode0 == Opcode1 && VT0 == VT1 &&
+ (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
+ bool SingleOp = (TargetOps.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WidenedMask128[0] & 1);
+ Hi = Hi.getOperand(WidenedMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ return DAG.getBitcast(VT, Horiz);
+ }
}
- SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- return DAG.getBitcast(VT, Horiz);
} }
+ if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
+ return R;
+
+ // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
+ // help expose the 'NOT' pattern further up the DAG.
+ // TODO: This might be beneficial for any binop with a 'splattable' operand.
switch (Opcode) {
+ case X86ISD::MOVDDUP:
+ case X86ISD::PSHUFD: {
+ SDValue Src = N.getOperand(0);
+ if (Src.hasOneUse() && Src.getValueType() == VT) {
+ if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
+ Not = DAG.getBitcast(VT, Not);
+ Not = Opcode == X86ISD::MOVDDUP
+ ? DAG.getNode(Opcode, DL, VT, Not)
+ : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
+ EVT IntVT = Not.getValueType().changeTypeToInteger();
+ SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
+ Not = DAG.getBitcast(IntVT, Not);
+ Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
+ return DAG.getBitcast(VT, Not);
+ }
+ }
+ break;
+ }
+ }
+
+ // Handle specific target shuffles.
+ switch (Opcode) {
+ case X86ISD::MOVDDUP: {
+ SDValue Src = N.getOperand(0);
+ // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
+ if (VT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
+ SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
+ DCI.CombineTo(N.getNode(), Movddup);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
@@ -34580,7 +35985,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
- SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
@@ -34627,6 +36033,190 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return N; // Return N so it doesn't get rechecked!
}
+ // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
+ // i16. So shrink it ourselves if we can make a broadcast_load.
+ if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
+ assert(Subtarget.hasAVX2() && "Expected AVX2");
+ SDValue TruncIn = Src.getOperand(0);
+
+ // If this is a truncate of a non-extending load we can just narrow it to
+ // use a broadcast_load.
+ if (ISD::isNormalLoad(TruncIn.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
+ // Unless it's volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of an i16 extload, we can directly replace it.
+ if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
+ ISD::isEXTLoad(Src.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
+ if (LN->getMemoryVT().getSizeInBits() == 16) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of a load that has been shifted right, we can
+ // offset the pointer and use a narrower load.
+ if (TruncIn.getOpcode() == ISD::SRL &&
+ TruncIn.getOperand(0).hasOneUse() &&
+ isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
+ ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
+ unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
+ // Make sure the shift amount and the load size are divisible by 16.
+ // Don't do this if the load is volatile or atomic.
+ if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
+ LN->isSimple()) {
+ unsigned Offset = ShiftAmt / 8;
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
+ SDValue Ops[] = { LN->getChain(), Ptr };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo().getWithOffset(Offset),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // vbroadcast(vzload X) -> vbroadcast_load X
+ if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
+ MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
+ if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // vbroadcast(vector load X) -> vbroadcast_load
+ if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::VZEXT_MOVL: {
+ SDValue N0 = N.getOperand(0);
+
+ // If this is a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
+ auto *LN = cast<LoadSDNode>(N0);
+ if (SDValue VZLoad =
+ narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
+ // and can just use a VZEXT_LOAD.
+ // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
+ if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *LN = cast<MemSDNode>(N0);
+ if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
+ // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
+ // if the upper bits of the i64 are zero.
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N0.getOperand(0).hasOneUse() &&
+ N0.getOperand(0).getValueType() == MVT::i64) {
+ SDValue In = N0.getOperand(0);
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(In, Mask)) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
+ MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
+ return DAG.getBitcast(VT, Movl);
+ }
+ }
+
+ // Load a scalar integer constant directly to XMM instead of transferring an
+ // immediate value from GPR.
+ // vzext_movl (scalar_to_vector C) --> load [C,0...]
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+ // Create a vector constant - scalar constant followed by zeros.
+ EVT ScalarVT = N0.getOperand(0).getValueType();
+ Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
+ unsigned NumElts = VT.getVectorNumElements();
+ Constant *Zero = ConstantInt::getNullValue(ScalarTy);
+ SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
+ ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
+
+ // Load the vector constant from constant pool.
+ MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT); + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign(); + return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment, + MachineMemOperand::MOLoad); + } + } + return SDValue(); } case X86ISD::BLENDI: { @@ -34667,6 +36257,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } return SDValue(); } + case X86ISD::VPERM2X128: { + // If both 128-bit values were inserted into high halves of 256-bit values, + // the shuffle can be reduced to a concatenation of subvectors: + // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y + // Note: We are only looking for the exact high/high shuffle mask because we + // expect to fold other similar patterns before creating this opcode. + SDValue Ins0 = peekThroughBitcasts(N.getOperand(0)); + SDValue Ins1 = peekThroughBitcasts(N.getOperand(1)); + unsigned Imm = N.getConstantOperandVal(2); + if (!(Imm == 0x31 && + Ins0.getOpcode() == ISD::INSERT_SUBVECTOR && + Ins1.getOpcode() == ISD::INSERT_SUBVECTOR && + Ins0.getValueType() == Ins1.getValueType())) + return SDValue(); + + SDValue X = Ins0.getOperand(1); + SDValue Y = Ins1.getOperand(1); + unsigned C1 = Ins0.getConstantOperandVal(2); + unsigned C2 = Ins1.getConstantOperandVal(2); + MVT SrcVT = X.getSimpleValueType(); + unsigned SrcElts = SrcVT.getVectorNumElements(); + if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 || + C1 != SrcElts || C2 != SrcElts) + return SDValue(); + + return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, + Ins1.getValueType(), X, Y)); + } case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: @@ -34706,8 +36324,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); - SDValue Op2 = N.getOperand(2); - unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned InsertPSMask = N.getConstantOperandVal(2); unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; @@ -34847,9 +36464,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && - V.hasOneUse()) { + V.hasOneUse() && V.getOperand(0).hasOneUse()) { SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); - if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { + if (D.getOpcode() == X86ISD::PSHUFD) { SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; @@ -35248,7 +36865,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } // Attempt to combine into a vector load/broadcast. - if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) + if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG, + Subtarget, true)) return LD; // For AVX2, we sometimes want to combine @@ -35281,79 +36899,100 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(N, 0); } - // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros - // in the upper 64 bits. 
- // TODO: Can we generalize this using computeKnownBits. - if (N->getOpcode() == X86ISD::VZEXT_MOVL && - (VT == MVT::v2f64 || VT == MVT::v2i64) && - N->getOperand(0).getOpcode() == ISD::BITCAST && - (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 || - N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) { - SDValue In = N->getOperand(0).getOperand(0); - switch (In.getOpcode()) { - default: - break; - case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: - case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: - case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: - case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: - case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: - case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: - case X86ISD::VFPROUND: case X86ISD::VMFPROUND: - if (In.getOperand(0).getValueType() == MVT::v2f64 || - In.getOperand(0).getValueType() == MVT::v2i64) - return N->getOperand(0); // return the bitcast - break; - case X86ISD::STRICT_CVTTP2SI: - case X86ISD::STRICT_CVTTP2UI: - case X86ISD::STRICT_CVTSI2P: - case X86ISD::STRICT_CVTUI2P: - case X86ISD::STRICT_VFPROUND: - if (In.getOperand(1).getValueType() == MVT::v2f64 || - In.getOperand(1).getValueType() == MVT::v2i64) - return N->getOperand(0); - break; - } - } - // Pull subvector inserts into undef through VZEXT_MOVL by making it an // insert into a zero vector. This helps get VZEXT_MOVL closer to // scalar_to_vectors where 256/512 are canonicalized to an insert and a // 128-bit scalar_to_vector. This reduces the number of isel patterns. if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() && - N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR && - N->getOperand(0).hasOneUse() && - N->getOperand(0).getOperand(0).isUndef() && - isNullConstant(N->getOperand(0).getOperand(2))) { - SDValue In = N->getOperand(0).getOperand(1); - SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, - getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), - Movl, N->getOperand(0).getOperand(2)); - } - - // If this a vzmovl of a full vector load, replace it with a vzload, unless - // the load is volatile. - if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && - ISD::isNormalLoad(N->getOperand(0).getNode())) { - LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - if (LN->isSimple()) { - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, - VT.getVectorElementType(), - LN->getPointerInfo(), - LN->getAlignment(), - MachineMemOperand::MOLoad); - DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); - return VZLoad; + N->getOperand(0).hasOneUse()) { + SDValue V = peekThroughOneUseBitcasts(N->getOperand(0)); + + if (V.getOpcode() == ISD::INSERT_SUBVECTOR && + V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) { + SDValue In = V.getOperand(1); + MVT SubVT = + MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), + In.getValueSizeInBits() / VT.getScalarSizeInBits()); + In = DAG.getBitcast(SubVT, In); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, + getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), + Movl, V.getOperand(2)); } } return SDValue(); } +// Simplify variable target shuffle masks based on the demanded elements. +// TODO: Handle DemandedBits in mask indices as well? 
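+// e.g. if a PSHUFB mask is a constant pool load and some result elements are
+// never used, the undemanded mask lanes are rewritten to undef and reloaded,
+// which can let the narrowed constant fold further.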
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle( + SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, + TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const { + // If we're demanding all elements don't bother trying to simplify the mask. + unsigned NumElts = DemandedElts.getBitWidth(); + if (DemandedElts.isAllOnesValue()) + return false; + + SDValue Mask = Op.getOperand(MaskIndex); + if (!Mask.hasOneUse()) + return false; + + // Attempt to generically simplify the variable shuffle mask. + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + + // Attempt to extract+simplify a (constant pool load) shuffle mask. + // TODO: Support other types from getTargetShuffleMaskIndices? + SDValue BC = peekThroughOneUseBitcasts(Mask); + EVT BCVT = BC.getValueType(); + auto *Load = dyn_cast<LoadSDNode>(BC); + if (!Load) + return false; + + const Constant *C = getTargetConstantFromNode(Load); + if (!C) + return false; + + Type *CTy = C->getType(); + if (!CTy->isVectorTy() || + CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits()) + return false; + + // Handle scaling for i64 elements on 32-bit targets. + unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements(); + if (NumCstElts != NumElts && NumCstElts != (NumElts * 2)) + return false; + unsigned Scale = NumCstElts / NumElts; + + // Simplify mask if we have an undemanded element that is not undef. + bool Simplified = false; + SmallVector<Constant *, 32> ConstVecOps; + for (unsigned i = 0; i != NumCstElts; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) { + ConstVecOps.push_back(UndefValue::get(Elt->getType())); + Simplified = true; + continue; + } + ConstVecOps.push_back(Elt); + } + if (!Simplified) + return false; + + // Generate new constant pool entry + legalize immediately for the load. + SDLoc DL(Op); + SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT); + SDValue LegalCV = LowerConstantPool(CV, TLO.DAG); + SDValue NewMask = TLO.DAG.getLoad( + BCVT, DL, TLO.DAG.getEntryNode(), LegalCV, + MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()), + Load->getAlign()); + return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask)); +} + bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { @@ -35523,12 +37162,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Aggressively peek through ops to get at the demanded elts. // TODO - we should do this for all target/faux shuffles ops. if (!DemandedElts.isAllOnesValue()) { - APInt DemandedSrcBits = - APInt::getAllOnesValue(N0.getScalarValueSizeInBits()); - SDValue NewN0 = SimplifyMultipleUseDemandedBits( - N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1); - SDValue NewN1 = SimplifyMultipleUseDemandedBits( - N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1); + SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS, + TLO.DAG, Depth + 1); + SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS, + TLO.DAG, Depth + 1); if (NewN0 || NewN1) { NewN0 = NewN0 ? NewN0 : N0; NewN1 = NewN1 ? 
NewN1 : N1; @@ -35590,6 +37227,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( KnownUndef = LHSUndef & RHSUndef; break; } + case X86ISD::VZEXT_MOVL: { + // If upper demanded elements are already zero then we have nothing to do. + SDValue Src = Op.getOperand(0); + APInt DemandedUpperElts = DemandedElts; + DemandedUpperElts.clearLowBits(1); + if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero()) + return TLO.CombineTo(Op, Src); + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -35607,36 +37253,32 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; + // Aggressively peek through src to get at the demanded elt. + // TODO - we should do this for all target/faux shuffles ops. + if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( + Src, SrcElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); break; } - case X86ISD::VPERMV: { - SDValue Mask = Op.getOperand(0); - APInt MaskUndef, MaskZero; - if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, - Depth + 1)) + case X86ISD::VPERMV: + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO, + Depth)) return true; break; - } case X86ISD::PSHUFB: case X86ISD::VPERMV3: - case X86ISD::VPERMILPV: { - SDValue Mask = Op.getOperand(1); - APInt MaskUndef, MaskZero; - if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, - Depth + 1)) + case X86ISD::VPERMILPV: + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO, + Depth)) return true; break; - } case X86ISD::VPPERM: - case X86ISD::VPERMIL2: { - SDValue Mask = Op.getOperand(2); - APInt MaskUndef, MaskZero; - if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, - Depth + 1)) + case X86ISD::VPERMIL2: + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO, + Depth)) return true; break; } - } // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not // demand any of the high elements, then narrow the op to 128/256-bits: e.g. @@ -35651,18 +37293,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( ExtSizeInBits = SizeInBits / 4; switch (Opc) { - // Zero upper elements. - case X86ISD::VZEXT_MOVL: { - SDLoc DL(Op); - SDValue Ext0 = - extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); - SDValue ExtOp = - TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0); - SDValue UndefVec = TLO.DAG.getUNDEF(VT); - SDValue Insert = - insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); - return TLO.CombineTo(Op, Insert); - } // Subvector broadcast. case X86ISD::SUBV_BROADCAST: { SDLoc DL(Op); @@ -35715,10 +37345,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } break; } - // Target Shuffles. + // Zero upper elements. + case X86ISD::VZEXT_MOVL: + // Target unary shuffles by immediate: + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + case X86ISD::VPERMILPI: + // (Non-Lane Crossing) Target Shuffles. + case X86ISD::VPERMILPV: + case X86ISD::VPERMIL2: case X86ISD::PSHUFB: case X86ISD::UNPCKL: case X86ISD::UNPCKH: + case X86ISD::BLENDI: // Saturated Packs. 
case X86ISD::PACKSS: case X86ISD::PACKUS: @@ -35728,14 +37368,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::FHADD: case X86ISD::FHSUB: { SDLoc DL(Op); + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + SDValue SrcOp = Op.getOperand(i); + EVT SrcVT = SrcOp.getValueType(); + assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) && + "Unsupported vector size"); + Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL, + ExtSizeInBits) + : SrcOp); + } MVT ExtVT = VT.getSimpleVT(); ExtVT = MVT::getVectorVT(ExtVT.getScalarType(), ExtSizeInBits / ExtVT.getScalarSizeInBits()); - SDValue Ext0 = - extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); - SDValue Ext1 = - extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits); - SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1); + SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); @@ -35832,6 +37478,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); switch(Opc) { + case X86ISD::VTRUNC: { + KnownBits KnownOp; + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + // Simplify the input, using demanded bit information. + APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits()); + APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements()); + if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1)) + return true; + break; + } case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. @@ -35888,6 +37546,14 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } } + // If we are only demanding sign bits then we can use the shift source directly. + unsigned NumSignBits = + TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1); + unsigned UpperDemandedBits = + BitWidth - OriginalDemandedBits.countTrailingZeros(); + if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits) + return TLO.CombineTo(Op, Op0); + if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; @@ -36001,7 +37667,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1))); - Known = KnownVec.zext(BitWidth, true); + Known = KnownVec.zext(BitWidth); return false; } break; @@ -36054,6 +37720,17 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS, KnownRHS, TLO, Depth + 1)) return true; + + // Attempt to avoid multi-use ops if we don't need anything from them. + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0); + SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1); + return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1)); + } } // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support. 
break;
@@ -36086,16 +37763,51 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// MOVMSK only uses the MSB from each vector element.
KnownBits KnownSrc;
- if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
- KnownSrc, TLO, Depth + 1))
+ APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+ Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
+
+ // Attempt to avoid multi-use ops if we don't need anything from it.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
}
+ case X86ISD::BEXTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Only bottom 16-bits of the control bits are required.
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // NOTE: SimplifyDemandedBits won't do this for constants.
+ const APInt &Val1 = Cst1->getAPIntValue();
+ APInt MaskedVal1 = Val1 & 0xFFFF;
+ if (MaskedVal1 != Val1) {
+ SDLoc DL(Op);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
+ TLO.DAG.getConstant(MaskedVal1, DL, VT)));
+ }
+ }
+
+ KnownBits Known1;
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
+ if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ return true;
+
+ // If the length is 0, replace with 0.
+ KnownBits LengthBits = Known1.extractBits(8, 8);
+ if (LengthBits.isZero())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ break;
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -36119,8 +37831,26 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
!DemandedElts[CIdx->getZExtValue()])
return Vec;
- break;
+ break;
}
+ case X86ISD::VSHLI: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ unsigned BitWidth = DemandedBits.getBitWidth();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return Op0;
+ break;
+ }
+ case X86ISD::VSRAI:
+ // iff we only need the sign bit then we can use the source directly.
+ // TODO: generalize where we only demand extended signbits.
+ if (DemandedBits.isSignMask())
+ return Op.getOperand(0);
+ break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
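The VSHLI/VSRAI/PCMPGT cases above all lean on the same sign-bit arithmetic: pcmpgt(0, R) is exactly an arithmetic shift right by BitWidth-1, and a left shift can be elided when every demanded bit was a sign-copy of the source. A scalar sketch of both identities on int8_t (an editorial illustration, not code from this patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int v = -128; v <= 127; ++v) {
        int8_t r = (int8_t)v;
        // pcmpgt(0, R) == ashr(R, BitWidth-1): all-ones iff R is negative.
        int8_t cmp = (0 > r) ? -1 : 0;
        int8_t ashr = (int8_t)(r >> 7); // arithmetic shift on mainstream targets
        assert(cmp == ashr);
        // vshli: x has at least 5 sign bits; shift left by 4 and demand only
        // the sign bit, so (NumSignBits - ShAmt) >= UpperDemandedBits holds
        // and the shifted value agrees with the unshifted source on that bit.
        int8_t x = (int8_t)(r >> 4);
        int8_t shifted = (int8_t)((uint8_t)x << 4); // two's-complement wrap
        assert((shifted < 0) == (x < 0));
      }
      return 0;
    }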
@@ -36154,13 +37884,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( int M = ShuffleMask[i]; if (!DemandedElts[i] || ShuffleUndef[i]) continue; - int Op = M / NumElts; - int Index = M % NumElts; - if (M < 0 || Index != i) { + int OpIdx = M / NumElts; + int EltIdx = M % NumElts; + if (M < 0 || EltIdx != i) { IdentityOp.clearAllBits(); break; } - IdentityOp &= APInt::getOneBitSet(NumOps, Op); + IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx); if (IdentityOp == 0) break; } @@ -36191,6 +37921,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { return false; } +// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents. +static unsigned getAltBitOpcode(unsigned Opcode) { + switch(Opcode) { + case ISD::AND: return X86ISD::FAND; + case ISD::OR: return X86ISD::FOR; + case ISD::XOR: return X86ISD::FXOR; + case X86ISD::ANDNP: return X86ISD::FANDN; + } + llvm_unreachable("Unknown bitwise opcode"); +} + +// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets. +static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, + const SDLoc &DL) { + EVT SrcVT = Src.getValueType(); + if (SrcVT != MVT::v4i1) + return SDValue(); + + switch (Src.getOpcode()) { + case ISD::SETCC: + if (Src.getOperand(0).getValueType() == MVT::v4i32 && + ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && + cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) { + SDValue Op0 = Src.getOperand(0); + if (ISD::isNormalLoad(Op0.getNode())) + return DAG.getBitcast(MVT::v4f32, Op0); + if (Op0.getOpcode() == ISD::BITCAST && + Op0.getOperand(0).getValueType() == MVT::v4f32) + return Op0.getOperand(0); + } + break; + case ISD::AND: + case ISD::XOR: + case ISD::OR: { + SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL); + SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL); + if (Op0 && Op1) + return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0, + Op1); + break; + } + } + return SDValue(); +} + // Helper to push sign extension of vXi1 SETCC result through bitops. static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL) { @@ -36221,18 +37996,40 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); + // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type + // legalization destroys the v4i32 type. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) { + if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) { + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, + DAG.getBitcast(MVT::v4f32, V)); + return DAG.getZExtOrTrunc(V, DL, VT); + } + } + // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 // vpcmpeqb/vpcmpgtb. 
- bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && - (Src.getOperand(0).getValueType() == MVT::v16i8 || - Src.getOperand(0).getValueType() == MVT::v32i8 || - Src.getOperand(0).getValueType() == MVT::v64i8); + bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && + (Src.getOperand(0).getValueType() == MVT::v16i8 || + Src.getOperand(0).getValueType() == MVT::v32i8 || + Src.getOperand(0).getValueType() == MVT::v64i8); + + // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled + // directly with vpmovmskb/vmovmskps/vmovmskpd. + if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() && + cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT && + ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) { + EVT CmpVT = Src.getOperand(0).getValueType(); + EVT EltVT = CmpVT.getVectorElementType(); + if (CmpVT.getSizeInBits() <= 256 && + (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64)) + PreferMovMsk = true; + } // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. - if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated)) + if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk)) return SDValue(); // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and @@ -36288,7 +38085,14 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, case MVT::v64i1: // If we have AVX512F, but not AVX512BW and the input is truncated from // v64i8 checked earlier. Then split the input and make two pmovmskbs. - if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) { + if (Subtarget.hasAVX512()) { + if (Subtarget.hasBWI()) + return SDValue(); + SExtVT = MVT::v64i8; + break; + } + // Split if this is a <64 x i8> comparison result. + if (checkBitcastSrcVectorSize(Src, 512)) { SExtVT = MVT::v64i8; break; } @@ -36458,6 +38262,74 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, return Ops[0]; } +// Recursive function that attempts to find if a bool vector node was originally +// a vector/float/double that got truncated/extended/bitcast to/from a scalar +// integer. If so, replace the scalar ops with bool vector equivalents back down +// the chain. +static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned Opc = V.getOpcode(); + switch (Opc) { + case ISD::BITCAST: { + // Bitcast from a vector/float/double, we can cheaply bitcast to VT. + SDValue Src = V.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.isVector() || SrcVT.isFloatingPoint()) + return DAG.getBitcast(VT, Src); + break; + } + case ISD::TRUNCATE: { + // If we find a suitable source, a truncated scalar becomes a subvector. + SDValue Src = V.getOperand(0); + EVT NewSrcVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits()); + if (TLI.isTypeLegal(NewSrcVT)) + if (SDValue N0 = + combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget)) + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0, + DAG.getIntPtrConstant(0, DL)); + break; + } + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: { + // If we find a suitable source, an extended scalar becomes a subvector. 
+ SDValue Src = V.getOperand(0); + EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Src.getScalarValueSizeInBits()); + if (TLI.isTypeLegal(NewSrcVT)) + if (SDValue N0 = + combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget)) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT) + : DAG.getConstant(0, DL, VT), + N0, DAG.getIntPtrConstant(0, DL)); + break; + } + case ISD::OR: { + // If we find suitable sources, we can just move an OR to the vector domain. + SDValue Src0 = V.getOperand(0); + SDValue Src1 = V.getOperand(1); + if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) + if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget)) + return DAG.getNode(Opc, DL, VT, N0, N1); + break; + } + case ISD::SHL: { + // If we find a suitable source, a SHL becomes a KSHIFTL. + SDValue Src0 = V.getOperand(0); + if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1))) + if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) + return DAG.getNode( + X86ISD::KSHIFTL, DL, VT, N0, + DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8)); + break; + } + } + return SDValue(); +} + static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -36476,24 +38348,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; - // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type - // legalization destroys the v4i32 type. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 && - VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC && - N0.getOperand(0).getValueType() == MVT::v4i32 && - ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && - cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) { - SDValue N00 = N0.getOperand(0); - // Only do this if we can avoid scalarizing the input. - if (ISD::isNormalLoad(N00.getNode()) || - (N00.getOpcode() == ISD::BITCAST && - N00.getOperand(0).getValueType() == MVT::v4f32)) { - SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, - DAG.getBitcast(MVT::v4f32, N00)); - return DAG.getZExtOrTrunc(V, dl, VT); - } - } - // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && @@ -36535,6 +38389,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } + } else { + // If we're bitcasting from iX to vXi1, see if the integer originally + // began as a vXi1 and whether we can remove the bitcast entirely. + if (VT.isVector() && VT.getScalarType() == MVT::i1 && + SrcVT.isScalarInteger() && + DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + if (SDValue V = + combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget)) + return V; + } } // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and @@ -36549,19 +38413,30 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, DAG.getBitcast(MVT::i16, N0.getOperand(0))); - // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT - // determines // the number of bits loaded. Remaining bits are zero. 
+ // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
+ // and the vbroadcast_load are both integer or both fp. In some cases this
+ // will remove the bitcast entirely.
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
- VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+ VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
- VT.getVectorElementType(),
- BCast->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
- return ResNode;
+ unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
+ unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
+ // Don't swap i8/i16 since we don't have fp types that size.
+ if (MemSize >= 32) {
+ MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
+ : MVT::getIntegerVT(MemSize);
+ MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
+ : MVT::getIntegerVT(SrcVTSize);
+ LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
+
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ MemVT, BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return DAG.getBitcast(VT, ResNode);
+ }
}
// Since MMX types are special and don't usually play with other vector types,
@@ -36648,6 +38523,47 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, SDLoc(N0), VT);
}
+ // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
+ // Turn it into a sign bit compare that produces a k-register. This avoids
+ // a trip through a GPR.
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isPowerOf2_32(VT.getVectorNumElements())) {
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue Src = N0;
+
+ // Peek through truncate.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+ Src = N0.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
+ SDValue MovmskIn = Src.getOperand(0);
+ MVT MovmskVT = MovmskIn.getSimpleValueType();
+ unsigned MovMskElts = MovmskVT.getVectorNumElements();
+
+ // We allow extra bits of the movmsk to be used since they are known zero.
+ // We can't convert a VPMOVMSKB without avx512bw.
+ if (MovMskElts <= NumElts &&
+ (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
+ EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
+ MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
+ SDLoc dl(N);
+ MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
+ SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
+ DAG.getConstant(0, dl, IntVT), ISD::SETLT);
+ if (EVT(CmpVT) == VT)
+ return Cmp;
+
+ // Pad with zeroes up to original VT to replace the zeroes that were
+ // being used from the MOVMSK.
+ unsigned NumConcats = NumElts / MovMskElts;
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
+ Ops[0] = Cmp;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
+ }
+ }
+ }
+
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget)) @@ -36772,12 +38688,9 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { - unsigned NumElts = SrcVT.getVectorNumElements(); - unsigned NumSubElts = NumElts / 2; - SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); - unsigned SubSizeInBits = SrcVT.getSizeInBits(); - SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); - SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL); + SrcVT = Lo.getValueType(); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || @@ -36864,6 +38777,25 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { + // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have + // PCMPEQQ (SSE41+), use PCMPEQD instead. + if (BinOp == ISD::AND && !Subtarget.hasSSE41() && + Match.getOpcode() == ISD::SETCC && + ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) && + cast<CondCodeSDNode>(Match.getOperand(2))->get() == + ISD::CondCode::SETEQ) { + SDValue Vec = Match.getOperand(0); + if (Vec.getValueType().getScalarType() == MVT::i64 && + (2 * NumElts) <= MaxElts) { + NumElts *= 2; + EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); + Match = DAG.getSetCC( + DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)), + DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ); + } + } + // Use combineBitcastvxi1 to create the MOVMSK. while (NumElts > MaxElts) { SDValue Lo, Hi; @@ -36878,10 +38810,7 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, return SDValue(); Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { - // Bail with AVX512VL (which uses predicate registers). - if (Subtarget.hasVLX()) - return SDValue(); - + // FIXME: Better handling of k-registers or 512-bit vectors? unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && Subtarget.hasAVX()))) @@ -36958,21 +38887,14 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - // Verify the type we're extracting from is any integer type above i16. - EVT VT = Extract->getOperand(0).getValueType(); - if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) + EVT ExtractVT = Extract->getValueType(0); + // Verify the type we're extracting is either i32 or i64. + // FIXME: Could support other types, but this is what we have coverage for. + if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64) return SDValue(); - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. 
- if (RegSize / VT.getVectorNumElements() < 8) + EVT VT = Extract->getOperand(0).getValueType(); + if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Match shuffle + add pyramid. @@ -36988,8 +38910,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || - Root.getOpcode() == ISD::ZERO_EXTEND || - Root.getOpcode() == ISD::ANY_EXTEND)) + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an @@ -37009,7 +38931,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); - MVT SadVT = SAD.getSimpleValueType(); + EVT SadVT = SAD.getValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); @@ -37024,12 +38946,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, } } - MVT Type = Extract->getSimpleValueType(0); - unsigned TypeSizeInBits = Type.getSizeInBits(); - // Return the lowest TypeSizeInBits bits. - MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); + unsigned ExtractSizeInBits = ExtractVT.getSizeInBits(); + // Return the lowest ExtractSizeInBits bits. + EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT, + SadVT.getSizeInBits() / ExtractSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD, Extract->getOperand(1)); } @@ -37048,19 +38970,34 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getVectorElementType(); + unsigned SrcEltBits = SrcSVT.getSizeInBits(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); // Don't attempt this for boolean mask vectors or unknown extraction indices. if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) return SDValue(); + const APInt &IdxC = N->getConstantOperandAPInt(1); + if (IdxC.uge(NumSrcElts)) + return SDValue(); + SDValue SrcBC = peekThroughBitcasts(Src); - // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. + // Handle extract(bitcast(broadcast(scalar_value))). if (X86ISD::VBROADCAST == SrcBC.getOpcode()) { SDValue SrcOp = SrcBC.getOperand(0); - if (SrcOp.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, SrcOp); + EVT SrcOpVT = SrcOp.getValueType(); + if (SrcOpVT.isScalarInteger() && VT.isInteger() && + (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) { + unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits; + unsigned Offset = IdxC.urem(Scale) * SrcEltBits; + // TODO support non-zero offsets. 
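+ // (e.g. extracting element 1 of a v8i16 bitcast of an i32 broadcast reads
+ // bits [16,32) of the scalar, i.e. Offset == 16, which bails for now.)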
+ if (Offset == 0) { + SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType()); + SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT); + return SrcOp; + } + } } // If we're extracting a single element from a broadcast load and there are @@ -37069,22 +39006,43 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC); unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && - VT.getSizeInBits() == SrcBCWidth) { + VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) { SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(), MemIntr->getPointerInfo(), - MemIntr->getAlignment(), + MemIntr->getOriginalAlign(), MemIntr->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); return Load; } } + // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers. + // TODO: Move to DAGCombine? + if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() && + SrcBC.getValueType().isInteger() && + (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 && + SrcBC.getScalarValueSizeInBits() == + SrcBC.getOperand(0).getValueSizeInBits()) { + unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits; + if (IdxC.ult(Scale)) { + unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits(); + SDValue Scl = SrcBC.getOperand(0); + EVT SclVT = Scl.getValueType(); + if (Offset) { + Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl, + DAG.getShiftAmountConstant(Offset, SclVT, dl)); + } + Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType()); + Scl = DAG.getZExtOrTrunc(Scl, dl, VT); + return Scl; + } + } + // Handle extract(truncate(x)) for 0'th index. // TODO: Treat this as a faux shuffle? // TODO: When can we use this for general indices? - if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && - isNullConstant(Idx)) { + if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) { Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); Src = DAG.getBitcast(SrcVT, Src); return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx); @@ -37096,12 +39054,18 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); + // Shuffle inputs must be the same size as the result. + if (llvm::any_of(Ops, [SrcVT](SDValue Op) { + return SrcVT.getSizeInBits() != Op.getValueSizeInBits(); + })) + return SDValue(); + // Attempt to narrow/widen the shuffle mask to the correct size. if (Mask.size() != NumSrcElts) { if ((NumSrcElts % Mask.size()) == 0) { SmallVector<int, 16> ScaledMask; int Scale = NumSrcElts / Mask.size(); - scaleShuffleMask<int>(Scale, Mask, ScaledMask); + narrowShuffleMaskElts(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { // Simplify Mask based on demanded element. @@ -37126,7 +39090,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (Mask.size() != NumSrcElts) return SDValue(); - int SrcIdx = Mask[N->getConstantOperandVal(1)]; + int SrcIdx = Mask[IdxC.getZExtValue()]; // If the shuffle source element is undef/zero then we can just accept it. 
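The extract(bitcast(scalar_to_vector(x))) fold above turns a vector element extract back into a shift-and-truncate on the original scalar. A minimal scalar model, assuming x86's little-endian lane order (illustration only):

    #include <cassert>
    #include <cstdint>

    // Extracting 16-bit element Idx from the v4i16 view of an i64 is a
    // right shift by Idx*16 followed by truncation.
    static uint16_t extractElt(uint64_t scl, unsigned idx) {
      return static_cast<uint16_t>(scl >> (idx * 16));
    }
    int main() {
      uint64_t x = 0x0123456789ABCDEFull;
      assert(extractElt(x, 0) == 0xCDEF); // lowest lane
      assert(extractElt(x, 3) == 0x0123); // highest lane
    }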
if (SrcIdx == SM_SentinelUndef) @@ -37153,8 +39117,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { - assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && - "Unexpected extraction type"); + assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SrcOp = DAG.getBitcast(SrcVT, SrcOp); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, @@ -37324,12 +39287,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, // vXi8 reduction - sum lo/hi halves then use PSADBW. if (VT == MVT::i8) { while (Rdx.getValueSizeInBits() > 128) { - unsigned HalfSize = VecVT.getSizeInBits() / 2; - unsigned HalfElts = VecVT.getVectorNumElements() / 2; - SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize); - SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize); - Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi); - VecVT = Rdx.getValueType(); + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); + VecVT = Lo.getValueType(); + Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi); } assert(VecVT == MVT::v16i8 && "v16i8 reduction expected"); @@ -37344,8 +39305,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, } // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); - if (!Subtarget.hasFastHorizontalOps() && !OptForSize) + if (!shouldUseHorizontalOp(true, DAG, Subtarget)) return SDValue(); unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; @@ -37477,11 +39437,21 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, // Attempt to extract a i1 element by using MOVMSK to extract the signbits // and then testing the relevant element. + // + // Note that we only combine extracts on the *same* result number, i.e. + // t0 = merge_values a0, a1, a2, a3 + // i1 = extract_vector_elt t0, Constant:i64<2> + // i1 = extract_vector_elt t0, Constant:i64<3> + // but not + // i1 = extract_vector_elt t0:1, Constant:i64<2> + // since the latter would need its own MOVMSK. if (CIdx && SrcVT.getScalarType() == MVT::i1) { SmallVector<SDNode *, 16> BoolExtracts; - auto IsBoolExtract = [&BoolExtracts](SDNode *Use) { + unsigned ResNo = InputVector.getResNo(); + auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) { if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa<ConstantSDNode>(Use->getOperand(1)) && + Use->getOperand(0).getResNo() == ResNo && Use->getValueType(0) == MVT::i1) { BoolExtracts.push_back(Use); return true; @@ -37530,8 +39500,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, assert(CondVT.isVector() && "Vector select expects a vector selector!"); - // Check if the first operand is all zeros and Cond type is vXi1. - // This situation only applies to avx512. // TODO: Use isNullOrNullSplat() to distinguish constants with undefs? // TODO: Can we assert that both operands are not zeros (because that should // get simplified at node creation time)? 
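The splitVector loops introduced above repeatedly fold the high half of a wide vector into the low half until the value fits in 128 bits. The same shape in plain C++ (a sketch of the reduction strategy only, not the DAG code):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Halving reduction: add the high half into the low half until one
    // element remains, mirroring the splitVector/ADD loop.
    static uint32_t halvingReduce(std::vector<uint32_t> v) {
      while (v.size() > 1) {
        size_t half = v.size() / 2;
        for (size_t i = 0; i < half; ++i)
          v[i] += v[i + half];
        v.resize(half);
      }
      return v[0];
    }
    int main() {
      assert(halvingReduce({1, 2, 3, 4, 5, 6, 7, 8}) == 36);
    }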
@@ -37546,14 +39514,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
 return DAG.getConstant(0, DL, VT);
 }
 
- if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
- Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
- // Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
- // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
- }
-
 // To use the condition operand as a bitwise mask, it must have elements that
 // are the same size as the select elements. Ie, the condition operand must
 // have already been promoted from the IR select condition type <N x i1>.
@@ -37778,12 +39738,13 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
 return true;
 };
 
+ APInt DemandedBits(APInt::getSignMask(BitWidth));
+
 if (OnlyUsedAsSelectCond(Cond)) {
- APInt DemandedMask(APInt::getSignMask(BitWidth));
 KnownBits Known;
 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
 !DCI.isBeforeLegalizeOps());
- if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
 return SDValue();
 
 // If we changed the computation somewhere in the DAG, this change will
@@ -37805,15 +39766,9 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
 }
 
 // Otherwise we can still at least try to simplify multiple use bits.
- APInt DemandedMask(APInt::getSignMask(BitWidth));
- APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
- DemandedElts, DAG, 0))
- return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- V, N->getOperand(1), N->getOperand(2));
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
+ N->getOperand(1), N->getOperand(2));
 
 return SDValue();
 }
@@ -38297,6 +40252,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
 }
 }
 
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // If this is an avx512 target we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
+ if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
+ Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorAllZeros(LHS.getNode()) &&
+ !ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
+ }
+
 // Early exit check
 if (!TLI.isTypeLegal(VT))
 return SDValue();
@@ -38316,12 +40284,86 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(CondVT, CondNot),
 RHS, LHS);
 
- // Custom action for SELECT MMX
- if (VT == MVT::x86mmx) {
- LHS = DAG.getBitcast(MVT::i64, LHS);
- RHS = DAG.getBitcast(MVT::i64, RHS);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
- return DAG.getBitcast(VT, newSelect);
+ // Try to optimize vXi1 selects if both operands are either all constants or
+ // bitcasts from a scalar integer type.
In that case we can convert the operands + // to integer and use an integer select which will be converted to a CMOV. + // We need to take a little bit of care to avoid creating an i64 type after + // type legalization. + if (N->getOpcode() == ISD::SELECT && VT.isVector() && + VT.getVectorElementType() == MVT::i1 && + (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) { + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); + bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()); + bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()); + + if ((LHSIsConst || + (LHS.getOpcode() == ISD::BITCAST && + LHS.getOperand(0).getValueType() == IntVT)) && + (RHSIsConst || + (RHS.getOpcode() == ISD::BITCAST && + RHS.getOperand(0).getValueType() == IntVT))) { + if (LHSIsConst) + LHS = combinevXi1ConstantToInteger(LHS, DAG); + else + LHS = LHS.getOperand(0); + + if (RHSIsConst) + RHS = combinevXi1ConstantToInteger(RHS, DAG); + else + RHS = RHS.getOperand(0); + + SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS); + return DAG.getBitcast(VT, Select); + } + } + + // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of + // single bits, then invert the predicate and swap the select operands. + // This can lower using a vector shift bit-hack rather than mask and compare. + if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() && + N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 && + Cond.getOperand(0).getOpcode() == ISD::AND && + isNullOrNullSplat(Cond.getOperand(1)) && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + Cond.getOperand(0).getValueType() == VT) { + // The 'and' mask must be composed of power-of-2 constants. + SDValue And = Cond.getOperand(0); + auto *C = isConstOrConstSplat(And.getOperand(1)); + if (C && C->getAPIntValue().isPowerOf2()) { + // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS + SDValue NotCond = + DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE); + return DAG.getSelect(DL, VT, NotCond, RHS, LHS); + } + + // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld + // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply. + // 16-bit lacks a proper blendv. + unsigned EltBitWidth = VT.getScalarSizeInBits(); + bool CanShiftBlend = + TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) || + (Subtarget.hasAVX2() && EltBitWidth == 64) || + (Subtarget.hasXOP())); + if (CanShiftBlend && + ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) { + return C->getAPIntValue().isPowerOf2(); + })) { + // Create a left-shift constant to get the mask bits over to the sign-bit. 
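A scalar model of the shift bit-hack being built here, before the code resumes below: shifting the tested power-of-2 bit into the sign position turns the mask-and-compare into a sign test with the select arms swapped. Sketch only (__builtin_ctz is a GCC/Clang builtin standing in for the APInt exactLogBase2 used by the patch):

    #include <cassert>
    #include <cstdint>

    static uint32_t orig(uint32_t x, uint32_t c, uint32_t l, uint32_t r) {
      return (x & c) == 0 ? l : r;
    }
    static uint32_t shifted(uint32_t x, uint32_t c, uint32_t l, uint32_t r) {
      unsigned shl = 31 - __builtin_ctz(c); // move the mask bit to bit 31
      return static_cast<int32_t>(x << shl) < 0 ? r : l; // arms swapped
    }
    int main() {
      for (uint32_t c : {1u, 8u, 1u << 31})
        for (uint32_t x : {0u, 5u, 0xDEADBEEFu})
          assert(orig(x, c, 10, 20) == shifted(x, c, 10, 20));
    }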
+ SDValue Mask = And.getOperand(1);
+ SmallVector<int, 32> ShlVals;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
+ ShlVals.push_back(EltBitWidth - 1 -
+ MaskVal->getAPIntValue().exactLogBase2());
+ }
+ // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
+ SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
+ SDValue NewCond =
+ DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
+ return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
+ }
 }
 
 return SDValue();
@@ -38647,6 +40689,282 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
 return SDValue();
 }
 
+/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
+/// to avoid the inversion.
+static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
+ if (EFLAGS.getOpcode() != X86ISD::PTEST &&
+ EFLAGS.getOpcode() != X86ISD::TESTP)
+ return SDValue();
+
+ // PTEST/TESTP sets EFLAGS as:
+ // TESTZ: ZF = (Op0 & Op1) == 0
+ // TESTC: CF = (~Op0 & Op1) == 0
+ // TESTNZC: ZF == 0 && CF == 0
+ EVT VT = EFLAGS.getValueType();
+ SDValue Op0 = EFLAGS.getOperand(0);
+ SDValue Op1 = EFLAGS.getOperand(1);
+ EVT OpVT = Op0.getValueType();
+
+ // TEST*(~X,Y) == TEST*(X,Y)
+ if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
+ X86::CondCode InvCC;
+ switch (CC) {
+ case X86::COND_B:
+ // testc -> testz.
+ InvCC = X86::COND_E;
+ break;
+ case X86::COND_AE:
+ // !testc -> !testz.
+ InvCC = X86::COND_NE;
+ break;
+ case X86::COND_E:
+ // testz -> testc.
+ InvCC = X86::COND_B;
+ break;
+ case X86::COND_NE:
+ // !testz -> !testc.
+ InvCC = X86::COND_AE;
+ break;
+ case X86::COND_A:
+ case X86::COND_BE:
+ // testnzc -> testnzc (no change).
+ InvCC = CC;
+ break;
+ default:
+ InvCC = X86::COND_INVALID;
+ break;
+ }
+
+ if (InvCC != X86::COND_INVALID) {
+ CC = InvCC;
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp0), Op1);
+ }
+ }
+
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ // TESTZ(X,~Y) == TESTC(Y,X)
+ if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp1), Op0);
+ }
+
+ if (Op0 == Op1) {
+ SDValue BC = peekThroughBitcasts(Op0);
+ EVT BCVT = BC.getValueType();
+ assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
+ "Unexpected vector type");
+
+ // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
+ if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
+ if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // If every element is an all-sign value, see if we can use MOVMSK to
+ // more efficiently extract the sign bits and compare that.
+ // TODO: Handle TESTC with comparison inversion.
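The flag identities exploited by combinePTESTCC above can be checked directly with the SSE4.1 intrinsics, since _mm_testz_si128 returns ZF and _mm_testc_si128 returns CF. A small demonstration (compile with -msse4.1; illustration only):

    #include <cassert>
    #include <immintrin.h>

    // PTEST: ZF = ((a & b) == 0), CF = ((~a & b) == 0), so
    // testz(~x, y) == testc(x, y) and testc(~x, y) == testz(x, y).
    int main() {
      __m128i x = _mm_set_epi32(0, -1, 0x0F0F0F0F, 123);
      __m128i y = _mm_set_epi32(-1, 0, static_cast<int>(0xF0F0F0F0u), 0);
      __m128i nx = _mm_xor_si128(x, _mm_set1_epi32(-1)); // ~x
      assert(_mm_testz_si128(nx, y) == _mm_testc_si128(x, y));
      assert(_mm_testc_si128(nx, y) == _mm_testz_si128(x, y));
    }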
+ // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
+ // MOVMSK combines to make sure it's never worse than PTEST?
+ unsigned EltBits = BCVT.getScalarSizeInBits();
+ if (DAG.ComputeNumSignBits(BC) == EltBits) {
+ assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
+ APInt SignMask = APInt::getSignMask(EltBits);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (SDValue Res =
+ TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
+ // For vXi16 cases we need to use pmovmskb and extract every other
+ // sign bit.
+ SDLoc DL(EFLAGS);
+ if (EltBits == 16) {
+ MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
+ Res = DAG.getBitcast(MovmskVT, Res);
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ } else {
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+ }
+ }
+
+ // TESTZ(-1,X) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
+
+ // TESTZ(X,-1) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+ }
+
+ return SDValue();
+}
+
+// Attempt to simplify the MOVMSK input based on the comparison type.
+static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Handle eq/ne against zero (any_of).
+ // Handle eq/ne against -1 (all_of).
+ if (!(CC == X86::COND_E || CC == X86::COND_NE))
+ return SDValue();
+ if (EFLAGS.getValueType() != MVT::i32)
+ return SDValue();
+ unsigned CmpOpcode = EFLAGS.getOpcode();
+ if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
+ return SDValue();
+ auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
+ if (!CmpConstant)
+ return SDValue();
+ const APInt &CmpVal = CmpConstant->getAPIntValue();
+
+ SDValue CmpOp = EFLAGS.getOperand(0);
+ unsigned CmpBits = CmpOp.getValueSizeInBits();
+ assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
+
+ // Peek through any truncate.
+ if (CmpOp.getOpcode() == ISD::TRUNCATE)
+ CmpOp = CmpOp.getOperand(0);
+
+ // Bail if we don't find a MOVMSK.
+ if (CmpOp.getOpcode() != X86ISD::MOVMSK)
+ return SDValue();
+
+ SDValue Vec = CmpOp.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
+ "Unexpected MOVMSK operand");
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned NumEltBits = VecVT.getScalarSizeInBits();
+
+ bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
+ bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
+ CmpVal.isMask(NumElts);
+ if (!IsAnyOf && !IsAllOf)
+ return SDValue();
+
+ // See if we can peek through to a vector with a wider element type, if the
+ // signbits extend down to all the sub-elements as well.
+ // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
+ // potential SimplifyDemandedBits/Elts cases.
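The any_of/all_of idioms that combineSetCCMOVMSK matches above boil down to comparing a MOVMSK result against 0 or against the full lane mask. A small SSE2 demonstration (illustration only):

    #include <cassert>
    #include <emmintrin.h>

    // PMOVMSKB of a byte compare: 0 means no lane matched, 0xFFFF means
    // every lane matched.
    int main() {
      __m128i zero = _mm_setzero_si128();
      __m128i allZero = _mm_setzero_si128();
      __m128i oneLane = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                     0, 0, 0, 0, 0, 0, 0, 1);
      int mAll = _mm_movemask_epi8(_mm_cmpeq_epi8(allZero, zero));
      int mOne = _mm_movemask_epi8(_mm_cmpeq_epi8(oneLane, zero));
      assert(mAll == 0xFFFF);              // all_of(x == 0) holds
      assert(mOne != 0xFFFF && mOne != 0); // some, but not all, lanes zero
    }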
+ if (Vec.getOpcode() == ISD::BITCAST) { + SDValue BC = peekThroughBitcasts(Vec); + MVT BCVT = BC.getSimpleValueType(); + unsigned BCNumElts = BCVT.getVectorNumElements(); + unsigned BCNumEltBits = BCVT.getScalarSizeInBits(); + if ((BCNumEltBits == 32 || BCNumEltBits == 64) && + BCNumEltBits > NumEltBits && + DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) { + SDLoc DL(EFLAGS); + unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, + DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC), + DAG.getConstant(CmpMask, DL, MVT::i32)); + } + } + + // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X). + // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X). + if (IsAllOf && Subtarget.hasSSE41()) { + SDValue BC = peekThroughBitcasts(Vec); + if (BC.getOpcode() == X86ISD::PCMPEQ && + ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) { + MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0)); + return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); + } + } + + // See if we can avoid a PACKSS by calling MOVMSK on the sources. + // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out + // sign bits prior to the comparison with zero unless we know that + // the vXi16 splats the sign bit down to the lower i8 half. + // TODO: Handle all_of patterns. + if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) { + SDValue VecOp0 = Vec.getOperand(0); + SDValue VecOp1 = Vec.getOperand(1); + bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8; + bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8; + // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA. + if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) { + SDLoc DL(EFLAGS); + SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16); + if (!SignExt0) { + Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result, + DAG.getConstant(0xAAAA, DL, MVT::i16)); + } + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i16)); + } + // PMOVMSKB(PACKSSBW(LO(X), HI(X))) + // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA. + if (CmpBits == 16 && Subtarget.hasInt256() && + VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + VecOp0.getOperand(0) == VecOp1.getOperand(0) && + VecOp0.getConstantOperandAPInt(1) == 0 && + VecOp1.getConstantOperandAPInt(1) == 8 && + (IsAnyOf || (SignExt0 && SignExt1))) { + SDLoc DL(EFLAGS); + SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0)); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF; + if (!SignExt0 || !SignExt1) { + assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns"); + Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, + DAG.getConstant(0xAAAAAAAA, DL, MVT::i32)); + } + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + DAG.getConstant(CmpMask, DL, MVT::i32)); + } + } + + // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced. 
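Why the "& 0xAAAA" masking in the PACKSS case above is needed: viewing v8i16 as v16i8, each i16 sign bit is the sign bit of its high byte, i.e. an odd bit of the PMOVMSKB result, while the even (low-byte) bits are noise unless the lanes are known sign-splats. A scalar sketch (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      int16_t lanes[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
      unsigned msk8 = 0; // simulated PMOVMSKB of the little-endian byte view
      for (int i = 0; i < 8; ++i) {
        uint16_t u = static_cast<uint16_t>(lanes[i]);
        if (u & 0x0080) msk8 |= 1u << (2 * i);     // low-byte sign (noise)
        if (u & 0x8000) msk8 |= 1u << (2 * i + 1); // high-byte sign = lane sign
      }
      unsigned msk16 = 0; // what a per-i16 MOVMSK would produce
      for (int i = 0; i < 8; ++i)
        if (lanes[i] < 0) msk16 |= 1u << i;
      unsigned recon = 0; // odd bits reproduce the i16 sign mask
      for (int i = 0; i < 8; ++i)
        if ((msk8 & 0xAAAA) & (1u << (2 * i + 1))) recon |= 1u << i;
      assert(recon == msk16);
    }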
+ SmallVector<int, 32> ShuffleMask; + SmallVector<SDValue, 2> ShuffleInputs; + if (NumElts == CmpBits && + getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs, + ShuffleMask, DAG) && + ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) && + ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) { + unsigned NumShuffleElts = ShuffleMask.size(); + APInt DemandedElts = APInt::getNullValue(NumShuffleElts); + for (int M : ShuffleMask) { + assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index"); + DemandedElts.setBit(M); + } + if (DemandedElts.isAllOnesValue()) { + SDLoc DL(EFLAGS); + SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + Result = + DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType()); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + EFLAGS.getOperand(1)); + } + } + + return SDValue(); +} + /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. @@ -38659,6 +40977,13 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; + + if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget)) + return R; + + if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget)) + return R; + return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } @@ -38680,7 +41005,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { - if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { + if (!(FalseOp.getValueType() == MVT::f80 || + (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) || + (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) || + !Subtarget.hasCMov() || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); @@ -38989,7 +41317,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, : ISD::SIGN_EXTEND, DL, VT, MulLo); - MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); + EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = @@ -39120,10 +41448,14 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); - // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. - // Also allow v2i32 if it will be widened. + // Make sure the type is legal or will be widened to a legal type. + if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) + + // Without BWI, we would need to split v32i16. 
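The MOVMSK(SHUFFLE(X,u)) fold above relies on a unary shuffle only permuting (or duplicating) sign bits: when every source element is referenced, the shuffled mask is 0, or all-ones, exactly when the original mask is. A scalar sketch (illustration only):

    #include <cassert>

    // Permute the bits of a 4-bit mask through a shuffle index list.
    static unsigned shuffleMask(unsigned m, const int (&idx)[4]) {
      unsigned r = 0;
      for (int i = 0; i < 4; ++i)
        r |= ((m >> idx[i]) & 1u) << i;
      return r;
    }
    int main() {
      const int perm[4] = {3, 1, 2, 0}; // references all 4 elements
      for (unsigned m = 0; m < 16; ++m) {
        unsigned s = shuffleMask(m, perm);
        assert((s == 0) == (m == 0));
        assert((s == 15) == (m == 15));
      }
    }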
+ if (WVT == MVT::v32i16 && !Subtarget.hasBWI()) return SDValue(); SDValue N0 = N->getOperand(0); @@ -39340,6 +41672,64 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, return NewMul; } +// Try to form a MULHU or MULHS node by looking for +// (srl (mul ext, ext), 16) +// TODO: This is X86 specific because we want to be able to handle wide types +// before type legalization. But we can only do it if the vector will be +// legalized via widening/splitting. Type legalization can't handle promotion +// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG +// combiner. +static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + "SRL or SRA node is required here!"); + SDLoc DL(N); + + // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand + // the multiply. + if (!Subtarget.hasSSE41()) + return SDValue(); + + // The operation feeding into the shift must be a multiply. + SDValue ShiftOperand = N->getOperand(0); + if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse()) + return SDValue(); + + // Input type should be at least vXi32. + EVT VT = N->getValueType(0); + if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32) + return SDValue(); + + // Need a shift by 16. + APInt ShiftAmt; + if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) || + ShiftAmt != 16) + return SDValue(); + + SDValue LHS = ShiftOperand.getOperand(0); + SDValue RHS = ShiftOperand.getOperand(1); + + unsigned ExtOpc = LHS.getOpcode(); + if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || + RHS.getOpcode() != ExtOpc) + return SDValue(); + + // Peek through the extends. + LHS = LHS.getOperand(0); + RHS = RHS.getOperand(0); + + // Ensure the input types match. + EVT MulVT = LHS.getValueType(); + if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT) + return SDValue(); + + unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; + SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS); + + ExtOpc = N->getOpcode() == ISD::SRA ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(ExtOpc, DL, VT, Mulh); +} + static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -39399,12 +41789,16 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); + if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) + return V; + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) @@ -39453,11 +41847,15 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { } static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) + return V; + // Only do this on the last DAG combine as it can interfere with other // combines. if (!DCI.isAfterLegalizeDAG()) @@ -39501,16 +41899,92 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && + "Unexpected pack opcode"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + unsigned NumDstElts = VT.getVectorNumElements(); + + // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) + // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for + // truncation trees that help us avoid lane crossing shuffles. + // TODO: There's a lot more we can do for PACK/HADD style shuffle combines. + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getConstantOperandAPInt(1) == 0 && + N1.getConstantOperandAPInt(1) == (NumDstElts / 2) && + N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() && + N0.getOperand(0).getValueType().is256BitVector()) { + // TODO - support target/faux shuffles. + SDValue Vec = peekThroughBitcasts(N0.getOperand(0)); + if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) { + // To keep the PACK LHS/RHS coherency, we must be able to scale the unary + // shuffle to a vXi64 width - we can probably relax this in the future. + SmallVector<int, 4> ShuffleMask; + if (SVN->getOperand(1).isUndef() && + scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) { + SDLoc DL(N); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL); + Lo = DAG.getBitcast(N0.getValueType(), Lo); + Hi = DAG.getBitcast(N1.getValueType(), Hi); + SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); + Res = DAG.getBitcast(MVT::v4i32, Res); + Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); + } + } + } + + // Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)). + // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles. 
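The combineShiftToPMULH transform introduced above is the classic high-multiply idiom: extending i16 lanes, multiplying, and shifting right by 16 is exactly MULHU/MULHS. Scalar equivalents (illustration only):

    #include <cassert>
    #include <cstdint>

    // (srl (mul (zext x), (zext y)), 16) == the high half of the
    // 16x16->32 product; the signed variant matches MULHS.
    static uint16_t mulhu16(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>((static_cast<uint32_t>(a) * b) >> 16);
    }
    static int16_t mulhs16(int16_t a, int16_t b) {
      return static_cast<int16_t>((static_cast<int32_t>(a) * b) >> 16);
    }
    int main() {
      assert(mulhu16(0xFFFF, 0xFFFF) == 0xFFFE);
      assert(mulhs16(-32768, 2) == -1); // 0xFFFF0000 >> 16
    }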
+ if (VT.is256BitVector()) { + if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) { + if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) { + SmallVector<int, 2> ShuffleMask0, ShuffleMask1; + if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) && + scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) { + SDValue Op00 = SVN0->getOperand(0); + SDValue Op01 = SVN0->getOperand(1); + SDValue Op10 = SVN1->getOperand(0); + SDValue Op11 = SVN1->getOperand(1); + if ((Op00 == Op11) && (Op01 == Op10)) { + std::swap(Op10, Op11); + ShuffleVectorSDNode::commuteMask(ShuffleMask1); + } + if ((Op00 == Op10) && (Op01 == Op11)) { + SmallVector<int, 4> ShuffleMask; + ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end()); + ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end()); + SDLoc DL(N); + SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01); + Res = DAG.getBitcast(MVT::v4i64, Res); + Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); + } + } + } + } + } + + return SDValue(); +} + static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && - "Unexpected shift opcode"); + "Unexpected pack opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + unsigned NumDstElts = VT.getVectorNumElements(); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && @@ -39527,7 +42001,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumDstElts = VT.getVectorNumElements(); unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; @@ -39574,6 +42047,10 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } + // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()). + if (SDValue V = combineVectorPackWithShuffle(N, DAG)) + return V; + // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular // truncate to create a larger truncate. if (Subtarget.hasAVX512() && @@ -39656,26 +42133,37 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); - else - ShiftVal = NumBitsPerElt - 1; + ShiftVal = NumBitsPerElt - 1; } - // Shift N0 by zero -> N0. + // (shift X, 0) -> X if (!ShiftVal) return N0; - // Shift zero -> zero. + // (shift 0, C) -> 0 if (ISD::isBuildVectorAllZeros(N0.getNode())) + // N0 is all zeros or undef. We guarantee that the bits shifted into the + // result are all zeros, not undef. return DAG.getConstant(0, SDLoc(N), VT); - // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2) - // clamped to (NumBitsPerElt - 1). - if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) { + // (VSRAI -1, C) -> -1 + if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode())) + // N0 is all ones or undef. We guarantee that the bits shifted into the + // result are all ones, not undef. 
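The shift-immediate folds being rewritten here hinge on two facts: logical shifts at or past the element width produce zero, while the x86 arithmetic shifts clamp the amount to width-1 and splat the sign bit. A scalar sketch (relies on arithmetic >> of negative values, which x86 compilers provide and C++20 guarantees):

    #include <cassert>
    #include <cstdint>

    static int32_t sra(int32_t x, unsigned s) {
      return x >> (s > 31u ? 31u : s); // clamp like VSRAI
    }
    int main() {
      int32_t x = INT32_MIN | 1;
      assert(sra(sra(x, 20), 20) == sra(x, 40)); // 20+20 clamps to 31
      assert(sra(x, 40) == -1);                  // splatted sign bit
      uint32_t u = 0x80000001u;
      assert(((u >> 20) >> 12) == 0);            // logical out-of-range -> 0
    }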
+ return DAG.getConstant(-1, SDLoc(N), VT); + + // (shift (shift X, C2), C1) -> (shift X, (C1 + C2)) + if (Opcode == N0.getOpcode()) { unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue(); unsigned NewShiftVal = ShiftVal + ShiftVal2; - if (NewShiftVal >= NumBitsPerElt) + if (NewShiftVal >= NumBitsPerElt) { + // Out of range logical bit shifts are guaranteed to be zero. + // Out of range arithmetic bit shifts splat the sign bit. + if (LogicalShift) + return DAG.getConstant(0, SDLoc(N), VT); NewShiftVal = NumBitsPerElt - 1; - return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), + } + return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0), DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8)); } @@ -39693,14 +42181,22 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { assert(EltBits.size() == VT.getVectorNumElements() && "Unexpected shift value type"); - for (APInt &Elt : EltBits) { - if (X86ISD::VSHLI == Opcode) + // Undef elements need to fold to 0. It's possible SimplifyDemandedBits + // created an undef input due to no input bits being demanded, but user + // still expects 0 in other bits. + for (unsigned i = 0, e = EltBits.size(); i != e; ++i) { + APInt &Elt = EltBits[i]; + if (UndefElts[i]) + Elt = 0; + else if (X86ISD::VSHLI == Opcode) Elt <<= ShiftVal; else if (X86ISD::VSRAI == Opcode) Elt.ashrInPlace(ShiftVal); else Elt.lshrInPlace(ShiftVal); } + // Reset undef elements since they were zeroed above. + UndefElts = 0; return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); } @@ -39717,19 +42213,24 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || - (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) && + (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) || + N->getOpcode() == ISD::INSERT_VECTOR_ELT) && "Unexpected vector insertion"); - unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(SDValue(N, 0), - APInt::getAllOnesValue(NumBitsPerElt), DCI)) - return SDValue(N, 0); + if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) { + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), + APInt::getAllOnesValue(NumBitsPerElt), DCI)) + return SDValue(N, 0); + } - // Attempt to combine PINSRB/PINSRW patterns to a shuffle. - SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) - return Res; + // Attempt to combine insertion patterns to a shuffle. + if (VT.isSimple() && DCI.isAfterLegalizeDAG()) { + SDValue Op(N, 0); + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } return SDValue(); } @@ -39752,7 +42253,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); // The SETCCs should both refer to the same CMP. 
- if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1) return SDValue(); SDValue CMP00 = CMP0->getOperand(0); @@ -39851,10 +42352,27 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (SDValue Not = IsNOT(N0, DAG)) { + auto GetNot = [&VT, &DAG](SDValue V) { + // Basic X = NOT(Y) detection. + if (SDValue Not = IsNOT(V, DAG)) + return Not; + // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y). + if (V.getOpcode() == X86ISD::VBROADCAST) { + SDValue Src = V.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isVector()) + return SDValue(); + if (SDValue Not = IsNOT(Src, DAG)) + return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT, + DAG.getBitcast(SrcVT, Not)); + } + return SDValue(); + }; + + if (SDValue Not = GetNot(N0)) { X = Not; Y = N1; - } else if (SDValue Not = IsNOT(N1, DAG)) { + } else if (SDValue Not = GetNot(N1)) { X = Not; Y = N0; } else @@ -39865,6 +42383,65 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y); } +// Try to widen AND, OR and XOR nodes to VT in order to remove casts around +// logical operations, like in the example below. +// or (and (truncate x, truncate y)), +// (xor (truncate z, build_vector (constants))) +// Given a target type \p VT, we generate +// or (and x, y), (xor z, zext(build_vector (constants))) +// given x, y and z are of type \p VT. We can do so, if operands are either +// truncates from VT types, the second operand is a vector of constants or can +// be recursively promoted. +static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG, + unsigned Depth) { + // Limit recursion to avoid excessive compile times. + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); + + if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND && + N->getOpcode() != ISD::OR) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT)) + return SDValue(); + + if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1)) + N0 = NN0; + else { + // The Left side has to be a trunc. + if (N0.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + // The type of the truncated inputs. + if (N0.getOperand(0).getValueType() != VT) + return SDValue(); + + N0 = N0.getOperand(0); + } + + if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1)) + N1 = NN1; + else { + // The right side has to be a 'trunc' or a constant vector. + bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && + N1.getOperand(0).getValueType() == VT; + if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) + return SDValue(); + + if (RHSTrunc) + N1 = N1.getOperand(0); + else + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); + } + + return DAG.getNode(N->getOpcode(), DL, VT, N0, N1); +} + // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. 
This method optimizes @@ -39876,6 +42453,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); assert(VT.isVector() && "Expected vector type"); + SDLoc DL(N); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); @@ -39883,57 +42461,33 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow.getValueType(); - if (Narrow->getOpcode() != ISD::XOR && - Narrow->getOpcode() != ISD::AND && - Narrow->getOpcode() != ISD::OR) - return SDValue(); - - SDValue N0 = Narrow->getOperand(0); - SDValue N1 = Narrow->getOperand(1); - SDLoc DL(Narrow); - - // The Left side has to be a trunc. - if (N0.getOpcode() != ISD::TRUNCATE) - return SDValue(); - - // The type of the truncated inputs. - if (N0.getOperand(0).getValueType() != VT) - return SDValue(); - - // The right side has to be a 'trunc' or a constant vector. - bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getValueType() == VT; - if (!RHSTrunc && - !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) - return SDValue(); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - - if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT)) - return SDValue(); - - // Set N0 and N1 to hold the inputs to the new wide operation. - N0 = N0.getOperand(0); - if (RHSTrunc) - N1 = N1.getOperand(0); - else - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); - // Generate the wide operation. - SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); - unsigned Opcode = N->getOpcode(); - switch (Opcode) { + SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0); + if (!Op) + return SDValue(); + switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: - return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType()); + return DAG.getZeroExtendInReg(Op, DL, NarrowVT); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); } } +static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) { + unsigned FPOpcode; + switch (Opcode) { + default: llvm_unreachable("Unexpected input node for FP logic conversion"); + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; + } + return FPOpcode; +} + /// If both input operands of a logic op are being cast from floating point /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. @@ -39958,18 +42512,45 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, (Subtarget.hasSSE2() && N00Type == MVT::f64))) return SDValue(); - unsigned FPOpcode; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected input node for FP logic conversion"); - case ISD::AND: FPOpcode = X86ISD::FAND; break; - case ISD::OR: FPOpcode = X86ISD::FOR; break; - case ISD::XOR: FPOpcode = X86ISD::FXOR; break; - } - + unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); return DAG.getBitcast(VT, FPLogic); } +// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y)) +// to reduce XMM->GPR traffic. 
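The BITOP(MOVMSK) fold described in the comment above is sound because MOVMSK only reads per-lane sign bits and bitwise ops act bitwise, so the scalar AND/OR/XOR of two masks equals the mask of the vector AND/OR/XOR. An SSE2 demonstration (illustration only):

    #include <cassert>
    #include <emmintrin.h>

    // AND(MOVMSK(x), MOVMSK(y)) == MOVMSK(AND(x, y)): the fold saves one
    // XMM->GPR transfer by doing the logic op in the vector domain.
    int main() {
      __m128d x = _mm_set_pd(-1.0, 2.0);
      __m128d y = _mm_set_pd(-3.0, -4.0);
      int lhs = _mm_movemask_pd(x) & _mm_movemask_pd(y);
      int rhs = _mm_movemask_pd(_mm_and_pd(x, y));
      assert(lhs == rhs);
    }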
+static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
+ "Unexpected bit opcode");
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Both operands must be single use MOVMSK.
+ if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
+ N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue Vec0 = N0.getOperand(0);
+ SDValue Vec1 = N1.getOperand(0);
+ EVT VecVT0 = Vec0.getValueType();
+ EVT VecVT1 = Vec1.getValueType();
+
+ // Both MOVMSK operands must be from vectors of the same size and same element
+ // size, but it's OK for a fp/int diff.
+ if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
+ VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned VecOpc =
+ VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
+ SDValue Result =
+ DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+}
+
 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
 /// with a shift-right to eliminate loading the vector constant mask value.
@@ -40292,7 +42873,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
 // TODO: Support multiple SrcOps.
 if (VT == MVT::i1) {
 SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
 SrcOps.size() == 1) {
 SDLoc dl(N);
 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40302,9 +42884,11 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
 if (Mask) {
- APInt AllBits = APInt::getAllOnesValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
 }
 }
 }
@@ -40312,6 +42896,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
 return V;
 
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
 if (DCI.isBeforeLegalizeOps())
 return SDValue();
 
@@ -40420,6 +43007,16 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
 }
 
 SDLoc DL(N);
+
+ if (UseVPTERNLOG) {
+ // Emit a VPTERNLOG node directly.
+ SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
+ SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
+ SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
+ SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
+ return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
+ }
+
 SDValue X = N->getOperand(0);
 SDValue Y =
 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
@@ -40503,6 +43100,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
 if (!Subtarget.hasSSE41())
 return SDValue();
 
+ // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
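On the VPTERNLOG preference noted above: its 8-bit immediate is a 3-input truth table, and 0xCA, emitted by canonicalizeBitSelect earlier in this hunk, encodes the bit-select a ? b : c, i.e. (b & a) | (c & ~a). A truth-table check in plain C++ (illustration only):

    #include <cassert>
    #include <cstdint>

    // The immediate bit at index (a<<2)|(b<<1)|c is the result for that
    // input combination.
    static uint8_t ternlog(uint8_t imm, bool a, bool b, bool c) {
      return (imm >> ((a << 2) | (b << 1) | c)) & 1;
    }
    int main() {
      for (int a = 0; a < 2; ++a)
        for (int b = 0; b < 2; ++b)
          for (int c = 0; c < 2; ++c)
            assert(ternlog(0xCA, a, b, c) == (a ? b : c));
    }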
+ if (Subtarget.hasVLX()) + return SDValue(); + MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); @@ -40619,139 +43220,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return Ret; } -static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node"); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - EVT VT = N->getValueType(0); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - - if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) || - !TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) - return SDValue(); - - // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.shouldOptForSize(); - unsigned Bits = VT.getScalarSizeInBits(); - - // SHLD/SHRD instructions have lower register pressure, but on some - // platforms they have higher latency than the equivalent - // series of shifts/or that would otherwise be generated. - // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions - // have higher latencies and we are not optimizing for size. - if (!OptForSize && Subtarget.isSHLDSlow()) - return SDValue(); - - if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) - std::swap(N0, N1); - if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) - return SDValue(); - if (!N0.hasOneUse() || !N1.hasOneUse()) - return SDValue(); - - EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - - SDValue ShAmt0 = N0.getOperand(1); - if (ShAmt0.getValueType() != ShiftVT) - return SDValue(); - SDValue ShAmt1 = N1.getOperand(1); - if (ShAmt1.getValueType() != ShiftVT) - return SDValue(); - - // Peek through any modulo shift masks. 
- SDValue ShMsk0; - if (ShAmt0.getOpcode() == ISD::AND && - isa<ConstantSDNode>(ShAmt0.getOperand(1)) && - ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) { - ShMsk0 = ShAmt0; - ShAmt0 = ShAmt0.getOperand(0); - } - SDValue ShMsk1; - if (ShAmt1.getOpcode() == ISD::AND && - isa<ConstantSDNode>(ShAmt1.getOperand(1)) && - ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) { - ShMsk1 = ShAmt1; - ShAmt1 = ShAmt1.getOperand(0); - } - - if (ShAmt0.getOpcode() == ISD::TRUNCATE) - ShAmt0 = ShAmt0.getOperand(0); - if (ShAmt1.getOpcode() == ISD::TRUNCATE) - ShAmt1 = ShAmt1.getOperand(0); - - SDLoc DL(N); - unsigned Opc = ISD::FSHL; - SDValue Op0 = N0.getOperand(0); - SDValue Op1 = N1.getOperand(0); - if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { - Opc = ISD::FSHR; - std::swap(Op0, Op1); - std::swap(ShAmt0, ShAmt1); - std::swap(ShMsk0, ShMsk1); - } - - auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1, - SDValue Amt) { - if (Opc == ISD::FSHR) - std::swap(Op0, Op1); - return DAG.getNode(Opc, DL, VT, Op0, Op1, - DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt)); - }; - - // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) - // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C ) - // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C ) - // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C ) - // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C ) - // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C ) - if (ShAmt1.getOpcode() == ISD::SUB) { - SDValue Sum = ShAmt1.getOperand(0); - if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) { - SDValue ShAmt1Op1 = ShAmt1.getOperand(1); - if (ShAmt1Op1.getOpcode() == ISD::AND && - isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) && - ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) { - ShMsk1 = ShAmt1Op1; - ShAmt1Op1 = ShAmt1Op1.getOperand(0); - } - if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) - ShAmt1Op1 = ShAmt1Op1.getOperand(0); - if ((SumC->getAPIntValue() == Bits || - (SumC->getAPIntValue() == 0 && ShMsk1)) && - ShAmt1Op1 == ShAmt0) - return GetFunnelShift(Op0, Op1, ShAmt0); - } - } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { - auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); - if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) - return GetFunnelShift(Op0, Op1, ShAmt0); - } else if (ShAmt1.getOpcode() == ISD::XOR) { - SDValue Mask = ShAmt1.getOperand(1); - if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) { - unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL); - SDValue ShAmt1Op0 = ShAmt1.getOperand(0); - if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) - ShAmt1Op0 = ShAmt1Op0.getOperand(0); - if (MaskC->getSExtValue() == (Bits - 1) && - (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { - if (Op1.getOpcode() == InnerShift && - isa<ConstantSDNode>(Op1.getOperand(1)) && - Op1.getConstantOperandAPInt(1).isOneValue()) { - return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); - } - // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). - if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && - Op1.getOperand(0) == Op1.getOperand(1)) { - return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); - } - } - } - } - - return SDValue(); -} - static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -40771,7 +43239,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, // TODO: Support multiple SrcOps. 
 if (VT == MVT::i1) {
 SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
 SrcOps.size() == 1) {
 SDLoc dl(N);
 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40781,13 +43250,19 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
 if (Mask) {
- APInt AllBits = APInt::getNullValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
 }
 }
 }
 
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
 if (DCI.isBeforeLegalizeOps())
 return SDValue();
 
@@ -40803,8 +43278,33 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
 return R;
 
- if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
- return R;
+ // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
+ // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
+ // iff the upper elements of the non-shifted arg are zero.
+ // KUNPCK requires 16+ bool vector elements.
+ if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfElts = NumElts / 2;
+ APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
+ if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
+ N1.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N0, 0, DAG, dl, HalfElts),
+ extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
+ N0.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N1, 0, DAG, dl, HalfElts),
+ extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ }
 
 // Attempt to recursively combine an OR of shuffles.
 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -41153,18 +43653,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
 // A lambda checking the given SDValue is a constant vector and each element
 // is in the range [Min, Max].
 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
- BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
- if (!BV || !BV->isConstant())
- return false;
- for (SDValue Op : V->ops()) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
- if (!C)
- return false;
- const APInt &Val = C->getAPIntValue();
- if (Val.ult(Min) || Val.ugt(Max))
- return false;
- }
- return true;
+ return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
+ return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
+ });
 };
 
 // Check if each element of the vector is right-shifted by one.
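The OR(KSHIFTL) combine above is a concatenation in disguise: when the upper half of the non-shifted mask is known zero, OR-ing in the other mask shifted up by half the width simply glues the two halves together. A scalar sketch on a 16-bit mask (illustration only):

    #include <cassert>
    #include <cstdint>

    // Concatenate two 8-bit masks into one 16-bit mask.
    static uint16_t kunpck(uint8_t lo, uint8_t hi) {
      return static_cast<uint16_t>(lo) | (static_cast<uint16_t>(hi) << 8);
    }
    int main() {
      uint16_t x = 0x00B3; // upper 8 bits zero
      uint8_t y = 0x5A;
      assert((x | static_cast<uint16_t>(y << 8)) == kunpck(0xB3, 0x5A));
    }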
@@ -41265,10 +43756,10 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; - unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && - ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || + ((Ld->isNonTemporal() && !Subtarget.hasInt256() && + Ld->getAlignment() >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, *Ld->getMemOperand(), &Fast) && !Fast))) { @@ -41276,17 +43767,18 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, if (NumElems < 2) return SDValue(); - unsigned HalfAlign = 16; + unsigned HalfOffset = 16; SDValue Ptr1 = Ld->getBasePtr(); - SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl); + SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems / 2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), - Alignment, Ld->getMemOperand()->getFlags()); + Ld->getOriginalAlign(), + Ld->getMemOperand()->getFlags()); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, - Ld->getPointerInfo().getWithOffset(HalfAlign), - MinAlign(Alignment, HalfAlign), + Ld->getPointerInfo().getWithOffset(HalfOffset), + Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); @@ -41303,13 +43795,28 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); if (TLI.isTypeLegal(IntVT)) { SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Alignment, + Ld->getPointerInfo(), + Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); } } + // Cast ptr32 and ptr64 pointers to the default address space before a load. + unsigned AddrSpace = Ld->getAddressSpace(); + if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || + AddrSpace == X86AS::PTR32_UPTR) { + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + if (PtrVT != Ld->getBasePtr().getSimpleValueType()) { + SDValue Cast = + DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0); + return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(), + Ld->getOriginalAlign(), + Ld->getMemOperand()->getFlags()); + } + } + return SDValue(); } @@ -41456,7 +43963,7 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + auto *Mld = cast<MaskedLoadSDNode>(N); // TODO: Expanding load with constant mask may be optimized as well. if (Mld->isExpandingLoad()) @@ -41465,12 +43972,33 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) return ScalarLoad; + // TODO: Do some AVX512 subsets benefit from this transform? 
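A byte-level sketch of the 256-bit load split in this hunk (names illustrative): one 32-byte access becomes two 16-byte halves at offsets 0 and HalfOffset = 16, and per the change above both halves now carry the load's original alignment instead of a MinAlign of it.

  #include <cstring>
  static void splitLoad32(unsigned char Dst[32], const unsigned char *Src) {
    std::memcpy(Dst, Src, 16);           // Load1 at offset 0
    std::memcpy(Dst + 16, Src + 16, 16); // Load2 at offset HalfOffset
  }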
if (!Subtarget.hasAVX512()) if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) return Blend; } + // If the mask value has been legalized to a non-boolean vector, try to + // simplify ops leading up to it. We only demand the MSB of each lane. + SDValue Mask = Mld->getMask(); + if (Mask.getScalarValueSizeInBits() != 1) { + EVT VT = Mld->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + if (SDValue NewMask = + TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG)) + return DAG.getMaskedLoad( + VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(), + NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(), + Mld->getAddressingMode(), Mld->getExtensionType()); + } + return SDValue(); } @@ -41522,9 +44050,18 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { - APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); return SDValue(N, 0); + } + if (SDValue NewMask = + TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG)) + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(), + Mst->getBasePtr(), Mst->getOffset(), NewMask, + Mst->getMemoryVT(), Mst->getMemOperand(), + Mst->getAddressingMode()); } SDValue Value = Mst->getValue(); @@ -41546,7 +44083,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoreSDNode *St = cast<StoreSDNode>(N); EVT StVT = St->getMemoryVT(); SDLoc dl(St); - unsigned Alignment = St->getAlignment(); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -41559,7 +44095,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal = DAG.getBitcast(NewVT, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -41570,7 +44106,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal.getOperand(0).getValueType() == MVT::i8) { return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0), St->getBasePtr(), St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); + St->getOriginalAlign(), + St->getMemOperand()->getFlags()); } // Widen v2i1/v4i1 stores to v8i1. 
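The sign-mask demanded-bits calls in this hunk rest on a hardware fact: AVX-style masked loads and stores consume only the sign bit of each mask element, so everything below the MSB of a non-boolean mask lane is dead. A scalar sketch (name illustrative):

  #include <cstdint>
  static bool laneEnabled(int32_t MaskElt) {
    return MaskElt < 0; // only the MSB of the lane participates
  }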
@@ -41581,7 +44118,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, Ops[0] = StoredVal; StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -41590,7 +44127,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) && ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) { // If its a v64i1 store without 64-bit support, we need two stores. - if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { + if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32)); Lo = combinevXi1ConstantToInteger(Lo, DAG); @@ -41603,18 +44140,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue Ch0 = DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(), - Alignment, St->getMemOperand()->getFlags()); + St->getOriginalAlign(), + St->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4), - MinAlign(Alignment, 4U), + St->getOriginalAlign(), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -41633,7 +44171,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } // Split under-aligned vector non-temporal stores. - if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) { + if (St->isNonTemporal() && StVT == VT && + St->getAlignment() < VT.getStoreSize()) { // ZMM/YMM nt-stores - either it can be stored as a series of shorter // vectors or the legalizer can scalarize it to use MOVNTI. if (VT.is256BitVector() || VT.is512BitVector()) { @@ -41687,7 +44226,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl)) return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); if (TLI.isTruncStoreLegal(VT, StVT)) { @@ -41705,6 +44244,20 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } + // Cast ptr32 and ptr64 pointers to the default address space before a store. + unsigned AddrSpace = St->getAddressSpace(); + if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || + AddrSpace == X86AS::PTR32_UPTR) { + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + if (PtrVT != St->getBasePtr().getSimpleValueType()) { + SDValue Cast = + DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0); + return DAG.getStore(St->getChain(), dl, StoredVal, Cast, + St->getPointerInfo(), St->getOriginalAlign(), + St->getMemOperand()->getFlags(), St->getAAInfo()); + } + } + // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. 
// A preferable solution to the general problem is to figure out the right @@ -41759,13 +44312,38 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } return SDValue(); } +static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + auto *St = cast<MemIntrinsicSDNode>(N); + + SDValue StoredVal = N->getOperand(1); + MVT VT = StoredVal.getSimpleValueType(); + EVT MemVT = St->getMemoryVT(); + + // Figure out which elements we demand. + unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits(); + APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef, + KnownZero, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + + return SDValue(); +} + /// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements @@ -42002,17 +44580,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // of one truncation. // i.e. if one of the inputs will constant fold or the input is repeated. switch (SrcOpcode) { - case ISD::AND: - case ISD::XOR: - case ISD::OR: { - SDValue Op0 = Src.getOperand(0); - SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) && - (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) - return TruncateArithmetic(Op0, Op1); - break; - } - case ISD::MUL: // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its // better to truncate if we have the chance. @@ -42021,21 +44588,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, !TLI.isOperationLegal(SrcOpcode, SrcVT)) return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; - case ISD::ADD: { - SDValue Op0 = Src.getOperand(0); - SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegal(SrcOpcode, VT) && - (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) - return TruncateArithmetic(Op0, Op1); - break; - } + case ISD::AND: + case ISD::XOR: + case ISD::OR: + case ISD::ADD: case ISD::SUB: { - // TODO: ISD::SUB We are conservative and require both sides to be freely - // truncatable to avoid interfering with combineSubToSubus. SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(SrcOpcode, VT) && - (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1)))) + (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; } @@ -42146,13 +44707,17 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, MVT InSVT = InVT.getScalarType(); // Check we have a truncation suited for PACKSS/PACKUS. 
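The merged AND/XOR/OR/ADD/SUB case above works because truncation distributes over wrap-around arithmetic, so one free-to-truncate operand is enough; the old extra conservatism for ISD::SUB is dropped. A self-checking scalar sketch for i32 -> i16 (name illustrative):

  #include <cstdint>
  static bool truncDistributesOverAdd(uint32_t A, uint32_t B) {
    // Low 16 bits of a 32-bit add equal the 16-bit add of the low halves.
    return uint16_t(A + B) == uint16_t(uint16_t(A) + uint16_t(B));
  }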
- if (!VT.is128BitVector() && !VT.is256BitVector()) + if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) return SDValue(); if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); + // Truncation to sub-128bit vXi32 can be better handled with shuffles. + if (SVT == MVT::i32 && VT.getSizeInBits() < 128) + return SDValue(); + // AVX512 has fast truncate, but if the input is already going to be split, // there's no harm in trying pack. if (Subtarget.hasAVX512() && @@ -42173,6 +44738,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, // Use PACKSS if the input has sign-bits that extend all the way to the // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. unsigned NumSignBits = DAG.ComputeNumSignBits(In); + + // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with + // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later + // on and combines/simplifications can't then use it. + if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits()) + return SDValue(); + if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); @@ -42201,9 +44773,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) return SDValue(); - // Input type should be vXi32. + // Input type should be at least vXi32. EVT InVT = Src.getValueType(); - if (InVT.getVectorElementType() != MVT::i32) + if (InVT.getVectorElementType().getSizeInBits() < 32) return SDValue(); // Need a shift by 16. @@ -42412,7 +44984,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return combineVectorTruncation(N, DAG, Subtarget); } -static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { +static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); SDLoc DL(N); @@ -42422,6 +44995,11 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) + return SDValue(N, 0); + return SDValue(); } @@ -42514,37 +45092,46 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, if (NegMul) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FNMADD; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMADD: Opcode = ISD::FMA; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; + case ISD::FMA: Opcode = X86ISD::FNMADD; break; + case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; + case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMADD: Opcode = 
ISD::FMA; break; + case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; + case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; } } if (NegAcc) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FMSUB; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; - case X86ISD::FMSUB: Opcode = ISD::FMA; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; - case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; - case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; - case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; + case ISD::FMA: Opcode = X86ISD::FMSUB; break; + case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FMSUB: Opcode = ISD::FMA; break; + case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; + case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; + case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; + case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; + case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; } } if (NegRes) { switch (Opcode) { + // For accuracy reason, we never combine fneg and fma under strict FP. default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMSUB; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; @@ -42562,18 +45149,20 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT OrigVT = N->getValueType(0); SDValue Arg = isFNEG(DAG, N); if (!Arg) return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = Arg.getValueType(); EVT SVT = VT.getScalarType(); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + if (!TLI.isTypeLegal(VT)) return SDValue(); // If we're negating a FMUL node on a target with FMA, then we can avoid the @@ -42587,80 +45176,25 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(OrigVT, NewNode); } - // If we're negating an FMA node, then we can adjust the - // instruction to include the extra negation. 
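The opcode tables above are ordinary FMA sign algebra; for instance, negating the whole result of FMADD(a, b, c) = a*b + c gives -(a*b) - c, which is what X86ISD::FNMSUB computes (the NegRes mapping ISD::FMA -> X86ISD::FNMSUB). Scalar sketch, name illustrative:

  static double fnmsub(double A, double B, double C) {
    return -(A * B) - C;
  }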
- if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { - switch (Arg.getOpcode()) { - case ISD::FMA: - case X86ISD::FMSUB: - case X86ISD::FNMADD: - case X86ISD::FNMSUB: - case X86ISD::FMADD_RND: - case X86ISD::FMSUB_RND: - case X86ISD::FNMADD_RND: - case X86ISD::FNMSUB_RND: { - // We can't handle scalar intrinsic node here because it would only - // invert one element and not the whole vector. But we could try to handle - // a negation of the lower element only. - unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true); - return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops())); - } - } - } + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); + if (SDValue NegArg = + TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize)) + return DAG.getBitcast(OrigVT, NegArg); return SDValue(); } -char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, - bool LegalOperations, - bool ForCodeSize, - unsigned Depth) const { - // fneg patterns are removable even if they have multiple uses. - if (isFNEG(DAG, Op.getNode(), Depth)) - return 2; - - // Don't recurse exponentially. - if (Depth > SelectionDAG::MaxRecursionDepth) - return 0; - - EVT VT = Op.getValueType(); - EVT SVT = VT.getScalarType(); - switch (Op.getOpcode()) { - case ISD::FMA: - case X86ISD::FMSUB: - case X86ISD::FNMADD: - case X86ISD::FNMSUB: - case X86ISD::FMADD_RND: - case X86ISD::FMSUB_RND: - case X86ISD::FNMADD_RND: - case X86ISD::FNMSUB_RND: { - if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || - !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) - break; - - // This is always negatible for free but we might be able to remove some - // extra operand negations as well. - for (int i = 0; i != 3; ++i) { - char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V == 2) - return V; - } - return 1; - } - } - - return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations, - ForCodeSize, Depth); -} - SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, + NegatibleCost &Cost, unsigned Depth) const { // fneg patterns are removable even if they have multiple uses. - if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) + if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) { + Cost = NegatibleCost::Cheaper; return DAG.getBitcast(Op.getValueType(), Arg); + } EVT VT = Op.getValueType(); EVT SVT = VT.getScalarType(); @@ -42675,35 +45209,41 @@ SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, case X86ISD::FNMADD_RND: case X86ISD::FNMSUB_RND: { if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || - !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + !(SVT == MVT::f32 || SVT == MVT::f64) || + !isOperationLegal(ISD::FMA, VT)) break; // This is always negatible for free but we might be able to remove some // extra operand negations as well. 
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue()); - for (int i = 0; i != 3; ++i) { - char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V == 2) - NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations, - ForCodeSize, Depth + 1); - } + for (int i = 0; i != 3; ++i) + NewOps[i] = getCheaperNegatedExpression( + Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1); bool NegA = !!NewOps[0]; bool NegB = !!NewOps[1]; bool NegC = !!NewOps[2]; unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); + Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper + : NegatibleCost::Neutral; + // Fill in the non-negated ops with the original values. for (int i = 0, e = Op.getNumOperands(); i != e; ++i) if (!NewOps[i]) NewOps[i] = Op.getOperand(i); return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); } + case X86ISD::FRCP: + if (SDValue NegOp0 = + getNegatedExpression(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Cost, Depth + 1)) + return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0); + break; } return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, - ForCodeSize, Depth); + ForCodeSize, Cost, Depth); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, @@ -42764,6 +45304,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; + if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) + return R; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -42776,33 +45319,21 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; - return combineFneg(N, DAG, Subtarget); + return combineFneg(N, DAG, DCI, Subtarget); } static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); unsigned NumBits = VT.getSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // TODO - Constant Folding. - if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) { - // Reduce Cst1 to the bottom 16-bits. - // NOTE: SimplifyDemandedBits won't do this for constants. - const APInt &Val1 = Cst1->getAPIntValue(); - APInt MaskedVal1 = Val1 & 0xFFFF; - if (MaskedVal1 != Val1) - return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0, - DAG.getConstant(MaskedVal1, SDLoc(N), VT)); - } - - // Only bottom 16-bits of the control bits are required. - APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16)); - if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI)) + + // Simplify the inputs. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getAllOnesValue(NumBits)); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); @@ -42893,6 +45424,7 @@ static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. 
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); @@ -42904,7 +45436,7 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, if (isNullFPScalarOrVectorConst(N->getOperand(1))) return N->getOperand(0); - if (SDValue NewVal = combineFneg(N, DAG, Subtarget)) + if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget)) return NewVal; return lowerX86FPLogicOp(N, DAG, Subtarget); @@ -43015,23 +45547,16 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - // Unless the load is volatile or atomic. - if (LN->isSimple()) { + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getIntegerVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) { SDLoc dl(N); - unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); - MVT MemVT = MVT::getIntegerVT(NumBits); - MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); - SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); DCI.CombineTo(N, Convert); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); return SDValue(N, 0); } } @@ -43041,33 +45566,33 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { - // FIXME: Handle strict fp nodes. + bool IsStrict = N->isTargetStrictFPOpcode(); EVT VT = N->getValueType(0); // Convert a full vector load into vzload when not all bits are needed. - SDValue In = N->getOperand(0); + SDValue In = N->getOperand(IsStrict ? 1 : 0); MVT InVT = In.getSimpleValueType(); if (VT.getVectorNumElements() < InVT.getVectorNumElements() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(In); - // Unless the load is volatile or atomic. 
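narrowLoadToVZLoad, used in these hunks in place of the hand-rolled VZEXT_LOAD construction, keeps only the low element's memory access and zeroes the upper lanes. A byte-level sketch of the 64-bit case (name illustrative):

  #include <cstring>
  static void vzload64(unsigned char Dst[16], const unsigned char *Src) {
    std::memset(Dst, 0, 16);  // upper 64 bits of the register become zero
    std::memcpy(Dst, Src, 8); // only the low 64 bits touch memory
  }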
- if (LN->isSimple()) { + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getFloatingPointVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) { SDLoc dl(N); - unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); - MVT MemVT = MVT::getFloatingPointVT(NumBits); - MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); - SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); - SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, - DAG.getBitcast(InVT, VZLoad)); - DCI.CombineTo(N, Convert); + if (IsStrict) { + SDValue Convert = + DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other}, + {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)}); + DCI.CombineTo(N, Convert, Convert.getValue(1)); + } else { + SDValue Convert = + DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); + DCI.CombineTo(N, Convert); + } DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); return SDValue(N, 0); } } @@ -43106,14 +45631,58 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { - SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // BT ignores high bits in the bit index operand. unsigned BitWidth = N1.getValueSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); - if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask)) - return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1); + if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + + return SDValue(); +} + +static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS; + SDValue Src = N->getOperand(IsStrict ? 1 : 0); + + if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) { + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getLowBitsSet(8, 4); + if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, + DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + + // Convert a full vector load into vzload when not all bits are needed. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 
1 : 0)); + if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) { + SDLoc dl(N); + if (IsStrict) { + SDValue Convert = DAG.getNode( + N->getOpcode(), dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)}); + DCI.CombineTo(N, Convert, Convert.getValue(1)); + } else { + SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, + DAG.getBitcast(MVT::v8i16, VZLoad)); + DCI.CombineTo(N, Convert); + } + + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + return SDValue(N, 0); + } + } + } return SDValue(); } @@ -43199,7 +45768,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || - N0.getOpcode() == ISD::SIGN_EXTEND)) { + N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); // EXTLOAD has a better solution on AVX2, @@ -43208,9 +45777,14 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); + // Attempt to promote any comparison mask ops before moving the + // SIGN_EXTEND_INREG in the way. + if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget)) + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1); + if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { - SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, - N00, N1); + SDValue Tmp = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } } @@ -43395,6 +45969,21 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0; i != Scale; ++i) ShuffleMask.append(EltSizeInBits, i); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); + } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits && + (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) { + // If we have register broadcast instructions, use the scalar size as the + // element type for the shuffle. Then cast to the wider element type. The + // widened bits won't be used, and this might allow the use of a broadcast + // load. + assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"); + unsigned Scale = EltSizeInBits / NumElts; + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); + ShuffleMask.append(NumElts * Scale, 0); + Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask); + Vec = DAG.getBitcast(VT, Vec); } else { // For smaller scalar integers, we can simply any-extend it to the vector // element size (we don't care about the upper bits) and broadcast it to all @@ -43402,8 +45991,8 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); ShuffleMask.append(NumElts, 0); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); } - Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); // Now, mask the relevant bit in each element. SmallVector<SDValue, 32> Bits; @@ -43448,7 +46037,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, // We can only do this if the vector size in 256 bits or less. 
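Both shuffle strategies in combineToExtendBoolVectorInReg above come down to the same per-lane computation: broadcast the scalar mask, isolate bit i in lane i, and produce all-ones for the sign-extend flavor (0/1 for zero-extend). A scalar sketch of one sign-extended lane (name illustrative):

  #include <cstdint>
  static int32_t boolVectorLane(uint32_t Bits, unsigned I) {
    return ((Bits >> I) & 1) ? -1 : 0; // all-ones when bit I of the mask is set
  }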
unsigned Size = VT.getSizeInBits(); - if (Size > 256) + if (Size > 256 && Subtarget.useAVX512Regs()) return SDValue(); // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since @@ -43466,7 +46055,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC); if (N->getOpcode() == ISD::ZERO_EXTEND) - Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType()); + Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType()); return Res; } @@ -43479,6 +46068,23 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, EVT InVT = N0.getValueType(); SDLoc DL(N); + // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry)) + if (!DCI.isBeforeLegalizeOps() && + N0.getOpcode() == X86ISD::SETCC_CARRY) { + SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0), + N0->getOperand(1)); + bool ReplaceOtherUses = !N0.hasOneUse(); + DCI.CombineTo(N, Setcc); + // Replace other uses with a truncate of the widened setcc_carry. + if (ReplaceOtherUses) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), + N0.getValueType(), Setcc); + DCI.CombineTo(N0.getNode(), Trunc); + } + + return SDValue(N, 0); + } + if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; @@ -43516,6 +46122,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); + bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode(); // Let legalize expand this if it isn't a legal type yet. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -43526,15 +46133,16 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); - SDValue A = N->getOperand(0); - SDValue B = N->getOperand(1); - SDValue C = N->getOperand(2); + SDValue A = N->getOperand(IsStrict ? 1 : 0); + SDValue B = N->getOperand(IsStrict ? 2 : 1); + SDValue C = N->getOperand(IsStrict ? 3 : 2); auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); bool LegalOperations = !DCI.isBeforeLegalizeOps(); - if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) { - V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize); + if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations, + CodeSize)) { + V = NegV; return true; } // Look through extract_vector_elts. 
If it comes from an FNEG, create a @@ -43542,11 +46150,10 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { SDValue Vec = V.getOperand(0); - if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) { - SDValue NegVal = - TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize); + if (SDValue NegV = TLI.getCheaperNegatedExpression( + Vec, DAG, LegalOperations, CodeSize)) { V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), - NegVal, V.getOperand(1)); + NegV, V.getOperand(1)); return true; } } @@ -43566,9 +46173,15 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false); - if (N->getNumOperands() == 4) - return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); - return DAG.getNode(NewOpcode, dl, VT, A, B, C); + if (IsStrict) { + assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4"); + return DAG.getNode(NewOpcode, dl, {VT, MVT::Other}, + {N->getOperand(0), A, B, C}); + } else { + if (N->getNumOperands() == 4) + return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); + return DAG.getNode(NewOpcode, dl, VT, A, B, C); + } } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) @@ -43582,10 +46195,11 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, bool LegalOperations = !DCI.isBeforeLegalizeOps(); SDValue N2 = N->getOperand(2); - if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2) - return SDValue(); - SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize); + SDValue NegN2 = + TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize); + if (!NegN2) + return SDValue(); unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false); if (N->getNumOperands() == 4) @@ -43598,38 +46212,26 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> - // (and (i32 x86isd::setcc_carry), 1) - // This eliminates the zext. This transformation is necessary because - // ISD::SETCC is always legalized to i8. SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - if (N0.getOpcode() == ISD::AND && - N0.hasOneUse() && - N0.getOperand(0).hasOneUse()) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - if (!isOneConstant(N0.getOperand(1))) - return SDValue(); - return DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, - N00.getOperand(0), N00.getOperand(1)), - DAG.getConstant(1, dl, VT)); + // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry)) + // FIXME: Is this needed? We don't seem to have any tests for it. + if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND && + N0.getOpcode() == X86ISD::SETCC_CARRY) { + SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0), + N0->getOperand(1)); + bool ReplaceOtherUses = !N0.hasOneUse(); + DCI.CombineTo(N, Setcc); + // Replace other uses with a truncate of the widened setcc_carry. 
+ if (ReplaceOtherUses) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), + N0.getValueType(), Setcc); + DCI.CombineTo(N0.getNode(), Trunc); } - } - if (N0.getOpcode() == ISD::TRUNCATE && - N0.hasOneUse() && - N0.getOperand(0).hasOneUse()) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - return DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, - N00.getOperand(0), N00.getOperand(1)), - DAG.getConstant(1, dl, VT)); - } + return SDValue(N, 0); } if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) @@ -43742,13 +46344,12 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); - bool HasAVX = Subtarget.hasAVX(); // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && HasAVX) || + (OpSize == 256 && Subtarget.hasAVX()) || (OpSize == 512 && Subtarget.useAVX512Regs())) { bool HasPT = Subtarget.hasSSE41(); @@ -43802,11 +46403,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, X = DAG.getBitcast(TmpCastVT, X); if (!NeedZExt && !TmpZext) return X; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, DAG.getConstant(0, DL, VecVT), X, - DAG.getConstant(0, DL, VecIdxVT)); + DAG.getVectorIdxConstant(0, DL)); }; SDValue Cmp; @@ -43839,17 +46438,16 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, Cmp); SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp); X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; - SDValue SetCC = getSETCC(X86CC, PT, DL, DAG); - return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0)); + SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0)); } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne - // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq - // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne + assert(Cmp.getValueType() == MVT::v16i8 && + "Non 128-bit vector on pre-SSE41 target"); SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); - SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 
0xFFFF : 0xFFFFFFFF, DL, - MVT::i32); + SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32); return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); } @@ -43866,23 +46464,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if (CC == ISD::SETNE || CC == ISD::SETEQ) { - // 0-x == y --> x+y == 0 - // 0-x != y --> x+y != 0 - if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && - LHS.hasOneUse()) { - SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); - return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); - } - // x == 0-y --> x+y == 0 - // x != 0-y --> x+y != 0 - if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && - RHS.hasOneUse()) { - SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); - return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); - } - if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) return V; + + if (VT == MVT::i1 && isNullConstant(RHS)) { + SDValue X86CC; + if (SDValue V = + MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC)) + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V)); + } } if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && @@ -43905,7 +46496,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (IsSEXT0 && IsVZero1) { assert(VT == Op0.getOperand(0).getValueType() && - "Uexpected operand type"); + "Unexpected operand type"); if (TmpCC == ISD::SETGT) return DAG.getConstant(0, DL, VT); if (TmpCC == ISD::SETLE) @@ -43995,20 +46586,43 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); return SDValue(N, 0); + } } return SDValue(); } +static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, + SDValue Index, SDValue Base, SDValue Scale, + SelectionDAG &DAG) { + SDLoc DL(GorS); + + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(), + Gather->getMask(), Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), + Scatter->getMask(), Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); +} + static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto *GorS = cast<MaskedGatherScatterSDNode>(N); - SDValue Chain = GorS->getChain(); SDValue Index = GorS->getIndex(); - SDValue Mask = GorS->getMask(); SDValue Base = GorS->getBasePtr(); SDValue Scale = GorS->getScale(); @@ -44028,21 +46642,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, unsigned NumElts = Index.getValueType().getVectorNumElements(); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Chain, 
Gather->getPassThru(), - Mask, Base, Index, Scale } ; - return DAG.getMaskedGather(Gather->getVTList(), - Gather->getMemoryVT(), DL, Ops, - Gather->getMemOperand(), - Gather->getIndexType()); - } - auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Chain, Scatter->getValue(), - Mask, Base, Index, Scale }; - return DAG.getMaskedScatter(Scatter->getVTList(), - Scatter->getMemoryVT(), DL, - Ops, Scatter->getMemOperand(), - Scatter->getIndexType()); + return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); } } @@ -44057,21 +46657,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, unsigned NumElts = Index.getValueType().getVectorNumElements(); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Chain, Gather->getPassThru(), - Mask, Base, Index, Scale } ; - return DAG.getMaskedGather(Gather->getVTList(), - Gather->getMemoryVT(), DL, Ops, - Gather->getMemOperand(), - Gather->getIndexType()); - } - auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Chain, Scatter->getValue(), - Mask, Base, Index, Scale }; - return DAG.getMaskedScatter(Scatter->getVTList(), - Scatter->getMemoryVT(), DL, - Ops, Scatter->getMemOperand(), - Scatter->getIndexType()); + return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); } } @@ -44084,30 +46670,20 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); - if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Chain, Gather->getPassThru(), - Mask, Base, Index, Scale } ; - return DAG.getMaskedGather(Gather->getVTList(), - Gather->getMemoryVT(), DL, Ops, - Gather->getMemOperand(), - Gather->getIndexType()); - } - auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Chain, Scatter->getValue(), - Mask, Base, Index, Scale }; - return DAG.getMaskedScatter(Scatter->getVTList(), - Scatter->getMemoryVT(), DL, - Ops, Scatter->getMemOperand(), - Scatter->getIndexType()); + return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); } } // With vector masks we only demand the upper bit of the mask. + SDValue Mask = GorS->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); return SDValue(N, 0); + } } return SDValue(); @@ -44146,10 +46722,11 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// TODO: Could we move this to DAGCombine? static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG) { - // Take advantage of vector comparisons producing 0 or -1 in each lane to - // optimize away operation when it's from a constant. + // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane + // to optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> @@ -44161,9 +46738,10 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, // aren't the same. 
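Per the comment above: every lane of AND(VECTOR_CMP(x, y), constant) is either 0 or the constant, so the unary op has only two possible per-lane results and can be selected by the compare instead of recomputed. Scalar sketch for a sint_to_fp lane (name illustrative):

  static double foldedConvertLane(bool CmpLane, int C) {
    return CmpLane ? static_cast<double>(C) : 0.0; // sint_to_fp of C or of 0
  }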
EVT VT = N->getValueType(0); bool IsStrict = N->isStrictFPOpcode(); + unsigned NumEltBits = VT.getScalarSizeInBits(); SDValue Op0 = N->getOperand(IsStrict ? 1 : 0); - if (!VT.isVector() || Op0->getOpcode() != ISD::AND || - Op0->getOperand(0)->getOpcode() != ISD::SETCC || + if (!VT.isVector() || Op0.getOpcode() != ISD::AND || + DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits || VT.getSizeInBits() != Op0.getValueSizeInBits()) return SDValue(); @@ -44336,7 +46914,6 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (!Subtarget.useSoftFloat() && Subtarget.hasX87() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); - EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 or f128. if (VT == MVT::f16 || VT == MVT::f128) @@ -44347,11 +46924,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasDQI() && VT != MVT::f80) return SDValue(); - if (Ld->isSimple() && !VT.isVector() && - ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !Subtarget.is64Bit() && LdVT == MVT::i64) { - std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD( - SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); + if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) && + Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) { + std::pair<SDValue, SDValue> Tmp = + Subtarget.getTargetLowering()->BuildFILD( + VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second); return Tmp.first; } @@ -44685,7 +47263,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { } if (CC == X86::COND_A) { - SDValue EFLAGS = Y->getOperand(1); + SDValue EFLAGS = Y.getOperand(1); // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // @@ -44698,13 +47276,44 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), NewEFLAGS); } } + if (CC == X86::COND_AE) { + // X + SETAE --> sbb X, -1 + // X - SETAE --> adc X, -1 + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), Y.getOperand(1)); + } + + if (CC == X86::COND_BE) { + // X + SETBE --> sbb X, -1 + // X - SETBE --> adc X, -1 + SDValue EFLAGS = Y.getOperand(1); + // Try to convert COND_BE into COND_AE in an attempt to facilitate + // materializing "setae reg". + // + // Do not flip "e <= c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? 
X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), NewEFLAGS); + } + } + if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); @@ -44741,15 +47350,18 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { SDValue One = DAG.getConstant(1, DL, ZVT); - SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1); + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + Cmp1.getValue(1)); } } // (cmp Z, 1) sets the carry flag if Z is 0. SDValue One = DAG.getConstant(1, DL, ZVT); - SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); // Add the flags type for ADC/SBB nodes. SDVTList VTs = DAG.getVTList(VT, MVT::i32); @@ -44758,151 +47370,12 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, - DAG.getConstant(-1ULL, DL, VT), Cmp1); + DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, - DAG.getConstant(0, DL, VT), Cmp1); -} - -static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - EVT VT = N->getValueType(0); - - // If the vector size is less than 128, or greater than the supported RegSize, - // do not use PMADD. - if (!VT.isVector() || VT.getVectorNumElements() < 8) - return SDValue(); - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - auto UsePMADDWD = [&](SDValue Op) { - ShrinkMode Mode; - return Op.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op.getNode(), DAG, Mode) && - Mode != ShrinkMode::MULU16 && - (!Subtarget.hasSSE41() || - (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - Op->isOnlyUserOf(Op.getOperand(1).getNode()))); - }; - - SDValue MulOp, OtherOp; - if (UsePMADDWD(Op0)) { - MulOp = Op0; - OtherOp = Op1; - } else if (UsePMADDWD(Op1)) { - MulOp = Op1; - OtherOp = Op0; - } else - return SDValue(); - - SDLoc DL(N); - EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - VT.getVectorNumElements()); - EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - VT.getVectorNumElements() / 2); - - // Shrink the operands of mul. 
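The COND_AE/COND_BE folds above are plain carry-flag arithmetic: with borrow flag CF, sbb X, -1 computes X - (-1) - CF, which wraps to X + (CF == 0), i.e. X + SETAE. A self-contained sketch (name illustrative):

  #include <cstdint>
  static uint32_t addSetAE(uint32_t X, bool CF) {
    return X - uint32_t(-1) - (CF ? 1u : 0u); // == X + (CF ? 0 : 1), modulo 2^32
  }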
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); - - // Madd vector size is half of the original vector size - auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef<SDValue> Ops) { - MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); - }; - SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, - PMADDWDBuilder); - // Fill the rest of the output with 0 - SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType()); - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags); -} - -static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. - if (!VT.isVector() || !VT.isSimple() || - !(VT.getVectorElementType() == MVT::i32)) - return SDValue(); - - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. - if (VT.getSizeInBits() / 4 > RegSize) - return SDValue(); - - // We know N is a reduction add. To match SAD, we need one of the operands to - // be an ABS. - SDValue AbsOp = N->getOperand(0); - SDValue OtherOp = N->getOperand(1); - if (AbsOp.getOpcode() != ISD::ABS) - std::swap(AbsOp, OtherOp); - if (AbsOp.getOpcode() != ISD::ABS) - return SDValue(); - - // Check whether we have an abs-diff pattern feeding into the select. - SDValue SadOp0, SadOp1; - if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1)) - return SDValue(); - - // SAD pattern detected. Now build a SAD instruction and an addition for - // reduction. Note that the number of elements of the result of SAD is less - // than the number of elements of its input. Therefore, we could only update - // part of elements in the reduction vector. - SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget); - - // The output of PSADBW is a vector of i64. - // We need to turn the vector of i64 into a vector of i32. - // If the reduction vector is at least as wide as the psadbw result, just - // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of - // the PSADBW will be zero. If we promote/ narrow vectors, truncate the v2i64 - // result to v2i32 which will be removed by type legalization. If we/ widen - // narrow vectors then we bitcast to v4i32 and extract v2i32. - MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - - if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Fill the upper elements with zero to match the add width. 
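// [Editor's note: illustrative sketch, not part of the vendor diff.] The
// combineLoopSADPattern being deleted in this hunk matched zext-abs-diff
// reductions onto X86ISD::PSADBW. For reference, psadbw sums absolute byte
// differences into the low bits of each 64-bit lane; a scalar model of one
// 8-byte group (hypothetical helper name):
#include <cstdint>
#include <cstdlib>
static uint64_t psadbw_group(const uint8_t a[8], const uint8_t b[8]) {
  uint64_t sum = 0;
  for (int i = 0; i != 8; ++i)
    sum += std::abs(int(a[i]) - int(b[i])); // |a[i] - b[i]|, zero-extended
  return sum; // matches one i64 lane of the PSADBW result
}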
- assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs"); - unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits(); - SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT)); - Ops[0] = Sad; - Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); - } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) { - Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, - DAG.getIntPtrConstant(0, DL)); - } - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags); + DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, @@ -44994,30 +47467,25 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, Mode == ShrinkMode::MULU16) return SDValue(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + VT.getVectorNumElements() * 2); + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1)); + auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - // Shrink by adding truncate nodes and let DAGCombine fold with the - // sources. EVT InVT = Ops[0].getValueType(); - assert(InVT.getScalarType() == MVT::i32 && - "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); - EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - InVT.getVectorNumElements()); - return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]), - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1])); + return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Mul.getOperand(0), Mul.getOperand(1) }, - PMADDBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder); } // Attempt to turn this pattern into PMADDWD. -// (mul (add (sext (build_vector)), (sext (build_vector))), -// (add (sext (build_vector)), (sext (build_vector))) +// (add (mul (sext (build_vector)), (sext (build_vector))), +// (mul (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -45139,13 +47607,6 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - const SDNodeFlags Flags = N->getFlags(); - if (Flags.hasVectorReduction()) { - if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) - return Sad; - if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) - return MAdd; - } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -45236,6 +47697,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SubusRHS = MinLHS; else return SDValue(); + } else if (Op1.getOpcode() == ISD::TRUNCATE && + Op1.getOperand(0).getOpcode() == ISD::UMIN && + (EltVT == MVT::i8 || EltVT == MVT::i16)) { + // Special case where the UMIN has been truncated. Try to push the truncate + // further up. This is similar to the i32/i64 special processing. 
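// [Editor's note: illustrative sketch, not part of the vendor diff.] A scalar
// model of the truncated-UMIN rewrite that follows: for a narrow unsigned x
// and a wider b, x - min(zext(x), b) is the saturating subtract usubsat(x, b),
// and clamping b to the narrow maximum first makes the truncation lossless
// (hypothetical helper name, i8 element case):
#include <algorithm>
#include <cassert>
#include <cstdint>
static void checkTruncatedUminSubus(uint8_t x, uint32_t b) {
  uint32_t wide = uint32_t(x) - std::min(uint32_t(x), b); // original pattern
  uint8_t clamped = uint8_t(std::min(b, 255u));           // umin + truncate
  uint8_t subus = x > clamped ? x - clamped : 0;          // psubusb semantics
  assert(wide == subus);
}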
+ SubusLHS = Op0; + SDValue MinLHS = Op1.getOperand(0).getOperand(0); + SDValue MinRHS = Op1.getOperand(0).getOperand(1); + EVT TruncVT = Op1.getOperand(0).getValueType(); + if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 || + TruncVT == MVT::v8i64)) && + !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32))) + return SDValue(); + SDValue OpToSaturate; + if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && + MinLHS.getOperand(0) == Op0) + OpToSaturate = MinRHS; + else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && + MinRHS.getOperand(0) == Op0) + OpToSaturate = MinLHS; + else + return SDValue(); + + // Saturate the non-extended input and then truncate it. + SDLoc DL(N); + SDValue SaturationConst = + DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(), + VT.getScalarSizeInBits()), + DL, TruncVT); + SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate, + SaturationConst); + SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin); } else return SDValue(); @@ -45350,6 +47843,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors"); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); })) return DAG.getUNDEF(VT); @@ -45360,6 +47854,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, return getZeroVector(VT, Subtarget, DAG, DL); SDValue Op0 = Ops[0]; + bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); // Fold subvector loads into one. // If needed, look through bitcasts to get to the load. @@ -45376,13 +47871,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } // Repeated subvectors. - if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) { + if (IsSplat) { // If this broadcast/subv_broadcast is inserted into both halves, use a // larger broadcast/subv_broadcast. if (Op0.getOpcode() == X86ISD::VBROADCAST || Op0.getOpcode() == X86ISD::SUBV_BROADCAST) return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); + // If this broadcast_load is inserted into both halves, use a larger + // broadcast_load. Update other uses to use an extracted subvector. 
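// [Editor's note: illustrative sketch, not part of the vendor diff.] The
// broadcast folds in this concat_vectors combine rest on splat(x) ++ splat(x)
// being one wider splat of x. At the intrinsic level (AVX assumed,
// hypothetical helper name):
#include <immintrin.h>
static __m256 concat_of_broadcasts(const float *p) {
  __m128 half = _mm_broadcast_ss(p); // v4f32 broadcast load
  return _mm256_set_m128(half, half); // same value as _mm256_broadcast_ss(p)
}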
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op0); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()}; + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith( + Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits())); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0)))) @@ -45394,12 +47904,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x) if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR && (Subtarget.hasAVX2() || - (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) && + (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) && Op0.getOperand(0).getValueType() == VT.getScalarType()) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0)); - } - bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); + // concat_vectors(extract_subvector(broadcast(x)), + // extract_subvector(broadcast(x))) -> broadcast(x) + if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Op0.getOperand(0).getValueType() == VT) { + if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST || + Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD) + return Op0.getOperand(0); + } + } // Repeated opcode. // TODO - combineX86ShufflesRecursively should handle shuffle concatenation @@ -45409,6 +47926,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, })) { unsigned NumOps = Ops.size(); switch (Op0.getOpcode()) { + case X86ISD::SHUFP: { + // Add SHUFPD support if/when necessary. 
+ if (!IsSplat && VT.getScalarType() == MVT::f32 && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op.getOperand(2) == Op0.getOperand(2); + })) { + SmallVector<SDValue, 2> LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS), + Op0.getOperand(2)); + } + break; + } case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::PSHUFD: @@ -45435,8 +47970,42 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, return DAG.getBitcast(VT, Res); } break; + case X86ISD::VSHLI: + case X86ISD::VSRAI: + case X86ISD::VSRLI: + if (((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useAVX512Regs() && + (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(1) == Op.getOperand(1); + })) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(Ops[i].getOperand(0)); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src), + Op0.getOperand(1)); + } + break; + case X86ISD::VPERMI: + case X86ISD::VROTLI: + case X86ISD::VROTRI: + if (VT.is512BitVector() && Subtarget.useAVX512Regs() && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(1) == Op.getOperand(1); + })) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(Ops[i].getOperand(0)); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src), + Op0.getOperand(1)); + } + break; + case X86ISD::PACKSS: case X86ISD::PACKUS: - if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) { + if (!IsSplat && NumOps == 2 && VT.is256BitVector() && + Subtarget.hasInt256()) { SmallVector<SDValue, 2> LHS, RHS; for (unsigned i = 0; i != NumOps; ++i) { LHS.push_back(Ops[i].getOperand(0)); @@ -45450,6 +48019,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); } break; + case X86ISD::PALIGNR: + if (!IsSplat && + ((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useBWIRegs())) && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(2) == Op.getOperand(2); + })) { + SmallVector<SDValue, 2> LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS), + Op0.getOperand(2)); + } + break; } } @@ -45539,7 +48126,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // if the insert or extract can be represented with a subregister operation. 
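// [Editor's note: illustrative sketch, not part of the vendor diff.] The
// VSHLI/VSRAI/VSRLI concat case above works because a per-lane shift by a
// shared immediate commutes with concatenation. With AVX2 intrinsics
// (hypothetical helper names, immediate 5 chosen arbitrarily):
#include <immintrin.h>
static __m256i shift_then_concat(__m128i lo, __m128i hi) {
  return _mm256_set_m128i(_mm_slli_epi32(hi, 5), _mm_slli_epi32(lo, 5));
}
static __m256i concat_then_shift(__m128i lo, __m128i hi) {
  return _mm256_slli_epi32(_mm256_set_m128i(hi, lo), 5); // same result
}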
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
- (IdxVal != 0 || !Vec.isUndef())) {
+ (IdxVal != 0 ||
+ !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
@@ -45628,7 +48216,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
- unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
+ unsigned ExtIdx = Ext->getConstantOperandVal(1);
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
@@ -45673,6 +48261,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
MVT VT = N->getSimpleValueType(0);
SDValue InVec = N->getOperand(0);
+ unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -45690,7 +48279,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (isConcatenatedNot(InVecBC.getOperand(0)) ||
isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
- SDValue Concat = split256IntArith(InVecBC, DAG);
+ SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
@@ -45702,8 +48291,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
- unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
@@ -45753,6 +48340,43 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
}
}
+ // If we're extracting an upper subvector from a broadcast, we should just
+ // extract the lowest subvector instead, which should allow
+ // SimplifyDemandedVectorElts to do more simplifications.
+ if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
+ InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
+
+ // If we're extracting a broadcasted subvector, just use the source.
+ if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
+ InVec.getOperand(0).getValueType() == VT)
+ return InVec.getOperand(0);
+
+ // Attempt to extract from the source of a shuffle vector.
+ if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
+ (IdxVal % VT.getVectorNumElements()) == 0) {
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<int, 32> ScaledMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
+ // Decode the shuffle mask and scale it so it's shuffling subvectors.
+ if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && + scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { + unsigned SubVecIdx = IdxVal / VT.getVectorNumElements(); + if (ScaledMask[SubVecIdx] == SM_SentinelUndef) + return DAG.getUNDEF(VT); + if (ScaledMask[SubVecIdx] == SM_SentinelZero) + return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); + SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; + if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) { + unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; + unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements(); + return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, + SDLoc(N), VT.getSizeInBits()); + } + } + } + // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { @@ -45825,13 +48449,30 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { Src.getOperand(1)); // Reduce v2i64 to v4i32 if we don't need the upper bits. - // TODO: Move to DAGCombine? - if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND && - Src.getValueType() == MVT::i64 && Src.hasOneUse() && - Src.getOperand(0).getScalarValueSizeInBits() <= 32) - return DAG.getBitcast( - VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, - DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32))); + // TODO: Move to DAGCombine/SimplifyDemandedBits? + if (VT == MVT::v2i64 || VT == MVT::v2f64) { + auto IsAnyExt64 = [](SDValue Op) { + if (Op.getValueType() != MVT::i64 || !Op.hasOneUse()) + return SDValue(); + if (Op.getOpcode() == ISD::ANY_EXTEND && + Op.getOperand(0).getScalarValueSizeInBits() <= 32) + return Op.getOperand(0); + if (auto *Ld = dyn_cast<LoadSDNode>(Op)) + if (Ld->getExtensionType() == ISD::EXTLOAD && + Ld->getMemoryVT().getScalarSizeInBits() <= 32) + return Op; + return SDValue(); + }; + if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src))) + return DAG.getBitcast( + VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, + DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32))); + } + + // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ. + if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST && + Src.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0)); return SDValue(); } @@ -45902,13 +48543,16 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, auto *Ld = cast<LoadSDNode>(In); if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); - ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, - VT.getVectorNumElements()); + ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG + ? ISD::SEXTLOAD + : ISD::ZEXTLOAD; + EVT MemVT = + EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements()); if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { SDValue Load = DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getPointerInfo(), MemVT, + Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; @@ -45945,6 +48589,196 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS. 
+// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
+// extra instructions between the conversions due to going to scalar and back.
+static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::f32 ||
+ N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
+ N->getOperand(0).getOperand(0));
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
+ return SDValue();
+
+ if (VT.getVectorElementType() != MVT::f32 &&
+ VT.getVectorElementType() != MVT::f64)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Convert the input to vXi16.
+ EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
+ Src = DAG.getBitcast(IntVT, Src);
+
+ // Widen to at least 8 input elements.
+ if (NumElts < 8) {
+ unsigned NumConcats = 8 / NumElts;
+ SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
+ : DAG.getConstant(0, dl, IntVT);
+ SmallVector<SDValue, 4> Ops(NumConcats, Fill);
+ Ops[0] = Src;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
+ }
+
+ // Destination is vXf32 with at least 4 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
+ std::max(4U, NumElts));
+ SDValue Cvt, Chain;
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
+ }
+
+ if (NumElts < 4) {
+ assert(NumElts == 2 && "Unexpected size");
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (IsStrict) {
+ // Extend to the original VT if necessary.
+ if (Cvt.getValueType() != VT) {
+ Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
+ {Chain, Cvt});
+ Chain = Cvt.getValue(1);
+ }
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+ }
+
+ // Extend to the original VT if necessary.
+ return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
+}
+
+// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
+// cases where the loads have the same input chain and the output chains are
+// unused. This avoids any memory ordering issues.
+static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // Only do this if the chain result is unused.
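// [Editor's note: illustrative sketch, not part of the vendor diff.] The
// combineFP16_TO_FP above emits a CVTPS2PH/CVTPH2PS pair; the immediate 4 it
// builds is _MM_FROUND_CUR_DIRECTION (round using MXCSR). The intrinsic-level
// shape of the f32 -> f16 -> f32 round trip (F16C assumed):
#include <immintrin.h>
static float roundtrip_f16(float x) {
  __m128 ps = _mm_set_ss(x);
  __m128i ph = _mm_cvtps_ph(ps, _MM_FROUND_CUR_DIRECTION); // CVTPS2PH
  return _mm_cvtss_f32(_mm_cvtph_ps(ph));                  // CVTPH2PS
}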
+ if (N->hasAnyUseOfValue(1)) + return SDValue(); + + auto *MemIntrin = cast<MemIntrinsicSDNode>(N); + + SDValue Ptr = MemIntrin->getBasePtr(); + SDValue Chain = MemIntrin->getChain(); + EVT VT = N->getSimpleValueType(0); + EVT MemVT = MemIntrin->getMemoryVT(); + + // Look at other users of our base pointer and try to find a wider broadcast. + // The input chain and the size of the memory VT must match. + for (SDNode *User : Ptr->uses()) + if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD && + cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr && + cast<MemIntrinsicSDNode>(User)->getChain() == Chain && + cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == + MemVT.getSizeInBits() && + !User->hasAnyUseOfValue(1) && + User->getValueSizeInBits(0) > VT.getSizeInBits()) { + SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), + VT.getSizeInBits()); + Extract = DAG.getBitcast(VT, Extract); + return DCI.CombineTo(N, Extract, SDValue(User, 1)); + } + + return SDValue(); +} + +static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 || + SrcVT.getVectorElementType() != MVT::f32) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 1 || !isPowerOf2_32(NumElts)) + return SDValue(); + + SDLoc dl(N); + + // Widen to at least 4 input elements. + if (NumElts < 4) + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getConstantFP(0.0, dl, SrcVT)); + + // Destination is v8i16 with at least 8 elements. + EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + std::max(8U, NumElts)); + SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, + DAG.getTargetConstant(4, dl, MVT::i32)); + + // Extract down to real number of elements. + if (NumElts < 8) { + EVT IntVT = VT.changeVectorElementTypeToInteger(); + Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt, + DAG.getIntPtrConstant(0, dl)); + } + + return DAG.getBitcast(VT, Cvt); +} + +static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { + SDValue Src = N->getOperand(0); + + // Turn MOVDQ2Q+simple_load into an mmx load. 
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + LoadSDNode *LN = cast<LoadSDNode>(Src.getNode()); + + if (LN->isSimple()) { + SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), + LN->getBasePtr(), + LN->getPointerInfo(), + LN->getOriginalAlign(), + LN->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1)); + return NewLd; + } + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -45976,8 +48810,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: return combineShiftLeft(N, DAG); - case ISD::SRA: return combineShiftRightArithmetic(N, DAG); - case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI); + case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget); + case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); @@ -45986,6 +48820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); + case X86ISD::VEXTRACT_STORE: + return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); @@ -45994,14 +48830,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); - case ISD::FNEG: return combineFneg(N, DAG, Subtarget); + case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); - case X86ISD::VTRUNC: return combineVTRUNC(N, DAG); + case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: - case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); + case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: @@ -46010,8 +48846,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: + case X86ISD::STRICT_CVTTP2SI: case X86ISD::CVTTP2SI: - case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); + case X86ISD::STRICT_CVTTP2UI: + case X86ISD::CVTTP2UI: + return combineCVTP2I_CVTTP2I(N, DAG, DCI); + case X86ISD::STRICT_CVTPH2PS: + case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); @@ -46034,12 +48875,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); + case ISD::INSERT_VECTOR_ELT: case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case 
X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: + case X86ISD::VALIGN: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -46071,12 +48914,16 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: + case X86ISD::STRICT_FMSUB: case X86ISD::FMSUB_RND: case X86ISD::FNMADD: + case X86ISD::STRICT_FNMADD: case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: + case X86ISD::STRICT_FNMSUB: case X86ISD::FNMSUB_RND: - case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget); + case ISD::FMA: + case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: @@ -46092,6 +48939,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); case X86ISD::KSHIFTL: case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); + case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); + case ISD::STRICT_FP_EXTEND: + case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget); + case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget); + case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI); + case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); } return SDValue(); @@ -46240,27 +49093,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { return true; } -bool X86TargetLowering:: - isDesirableToCombineBuildVectorToShuffleTruncate( - ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const { - - assert(SrcVT.getVectorNumElements() == ShuffleMask.size() && - "Element count mismatch"); - assert( - Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && - "Shuffle Mask expected to be legal"); - - // For 32-bit elements VPERMD is better than shuffle+truncate. - // TODO: After we improve lowerBuildVector, add execption for VPERMW. - if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2()) - return false; - - if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask)) - return false; - - return true; -} - //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -46301,7 +49133,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { - InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); const std::string &AsmStr = IA->getAsmString(); @@ -46424,7 +49256,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { case 'y': case 'x': case 'v': - case 'Y': case 'l': case 'k': // AVX512 masking registers. return C_RegisterClass; @@ -46461,7 +49292,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { default: break; case 'z': - case '0': return C_Register; case 'i': case 'm': @@ -46517,19 +49347,17 @@ TargetLowering::ConstraintWeight if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; - case 'Y': { - unsigned Size = StringRef(constraint).size(); - // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y' - char NextChar = Size == 2 ? 
constraint[1] : 'i'; - if (Size > 2) + case 'Y': + if (StringRef(constraint).size() != 2) break; - switch (NextChar) { + switch (constraint[1]) { default: return CW_Invalid; // XMM0 case 'z': - case '0': - if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) || + ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())) return CW_SpecificReg; return CW_Invalid; // Conditional OpMask regs (AVX512) @@ -46542,7 +49370,7 @@ TargetLowering::ConstraintWeight if (type->isX86_MMXTy() && Subtarget.hasMMX()) return weight; return CW_Invalid; - // Any SSE reg when ISA >= SSE2, same as 'Y' + // Any SSE reg when ISA >= SSE2, same as 'x' case 'i': case 't': case '2': @@ -46550,9 +49378,7 @@ TargetLowering::ConstraintWeight return CW_Invalid; break; } - // Fall through (handle "Y" constraint). - LLVM_FALLTHROUGH; - } + break; case 'v': if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) weight = CW_Register; @@ -46634,8 +49460,6 @@ LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { - if (Subtarget.hasSSE2()) - return "Y"; if (Subtarget.hasSSE1()) return "x"; } @@ -46884,26 +49708,26 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget.is64Bit()) { - if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, &X86::GR32RegClass); - if (VT == MVT::i16) - return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); - if (VT == MVT::i64 || VT == MVT::f64) + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT != MVT::f80) return std::make_pair(0U, &X86::GR64RegClass); break; } LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS - if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, &X86::GR32_ABCDRegClass); - if (VT == MVT::i16) - return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); - if (VT == MVT::i64) + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT != MVT::f80) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS @@ -46914,15 +49738,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); - return std::make_pair(0U, &X86::GR64RegClass); + if (VT != MVT::f80) + return std::make_pair(0U, &X86::GR64RegClass); + break; case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); - if (VT == MVT::i32 || !Subtarget.is64Bit()) + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); - return std::make_pair(0U, &X86::GR64_NOREXRegClass); + if (VT != 
MVT::f80) + return std::make_pair(0U, &X86::GR64_NOREXRegClass); + break; case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. @@ -46930,13 +49758,12 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); - return std::make_pair(0U, &X86::RFP80RegClass); + if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) + return std::make_pair(0U, &X86::RFP80RegClass); + break; case 'y': // MMX_REGS if MMX allowed. if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); - case 'Y': // SSE_REGS if SSE2 allowed - if (!Subtarget.hasSSE2()) break; - LLVM_FALLTHROUGH; case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; @@ -46955,7 +49782,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); - // TODO: Handle i128 in FR128RegClass after it is tested well. + case MVT::i128: + if (Subtarget.is64Bit()) { + if (VConstraint && Subtarget.hasVLX()) + return std::make_pair(0U, &X86::VR128XRegClass); + return std::make_pair(0U, &X86::VR128RegClass); + } + break; // Vector types and fp128. case MVT::f128: case MVT::v16i8: @@ -46979,6 +49812,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Subtarget.hasAVX()) return std::make_pair(0U, &X86::VR256RegClass); break; + case MVT::v64i8: + case MVT::v32i16: case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: @@ -46997,14 +49832,50 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'i': case 't': case '2': - return getRegForInlineAsmConstraint(TRI, "Y", VT); + return getRegForInlineAsmConstraint(TRI, "x", VT); case 'm': if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': - case '0': if (!Subtarget.hasSSE1()) break; - return std::make_pair(X86::XMM0, &X86::VR128RegClass); + switch (VT.SimpleTy) { + default: break; + // Scalar SSE types. + case MVT::f32: + case MVT::i32: + return std::make_pair(X86::XMM0, &X86::FR32RegClass); + case MVT::f64: + case MVT::i64: + return std::make_pair(X86::XMM0, &X86::FR64RegClass); + case MVT::f128: + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v4f32: + case MVT::v2f64: + return std::make_pair(X86::XMM0, &X86::VR128RegClass); + // AVX types. + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: + case MVT::v8f32: + case MVT::v4f64: + if (Subtarget.hasAVX()) + return std::make_pair(X86::YMM0, &X86::VR256RegClass); + break; + case MVT::v64i8: + case MVT::v32i16: + case MVT::v8f64: + case MVT::v16f32: + case MVT::v16i32: + case MVT::v8i64: + if (Subtarget.hasAVX512()) + return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass); + break; + } + break; case 'k': // This register class doesn't allocate k0 for masked vector operation. if (Subtarget.hasAVX512()) { @@ -47030,7 +49901,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. 
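// [Editor's note: illustrative sketch, not part of the vendor diff.] The 'z'
// handling above backs the inline-asm "Yz" constraint, which pins an operand
// to XMM0/YMM0/ZMM0; handy for instructions with an implicit xmm0 operand.
// A hedged example assuming SSE4.1 and GCC/Clang extended asm:
#include <immintrin.h>
static __m128 blend_with_mask(__m128 a, __m128 b, __m128 mask) {
  // Non-VEX blendvps reads its mask from xmm0; "Yz" forces mask there.
  // Lanes of a are replaced by b where the mask sign bit is set.
  __asm__("blendvps %%xmm0, %2, %0" : "+x"(a) : "Yz"(mask), "x"(b));
  return a;
}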
- std::pair<unsigned, const TargetRegisterClass*> Res;
+ std::pair<Register, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
@@ -47101,7 +49972,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
- unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+ Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
@@ -47217,8 +50088,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize =
- Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -47275,10 +50145,35 @@ bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+/// Returns true if stack probing through a function call is requested.
+bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
+ return !getStackProbeSymbolName(MF).empty();
+}
+
+/// Returns true if stack probing through inline assembly is requested.
+bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+
+ // No inline stack probe for Windows; they have their own mechanism.
+ if (Subtarget.isOSWindows() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
+ return false;
+
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+
+ return false;
+}
+
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+ // Inline stack probes disable the stack probe call.
+ if (hasInlineStackProbe(MF))
+ return "";
+
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
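// [Editor's note: illustrative sketch, not part of the vendor diff.] In IR
// terms, hasInlineStackProbe() above keys off a string function attribute; a
// front end or pass could request the inline probing scheme with something
// like the following (hypothetical helper, LLVM C++ API):
#include "llvm/IR/Function.h"
static void requestInlineStackProbes(llvm::Function &F) {
  // With this set, getStackProbeSymbolName() returns "" and the X86 frame
  // lowering emits the probe loop inline instead of calling a probe function.
  F.addFnAttr("probe-stack", "inline-asm");
}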