1 files changed, 856 insertions, 872 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3a17099da38f..7f3dc90a2d73 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
 #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
 
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"
 
 namespace llvm {
@@ -24,680 +22,809 @@ namespace llvm {
 
   namespace X86ISD {
     // X86 Specific DAG Nodes
-    enum NodeType : unsigned {
-      // Start the numbering where the builtin ops leave off.
-      FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
-      /// Bit scan forward.
-      BSF,
-      /// Bit scan reverse.
-      BSR,
-
-      /// Double shift instructions. These correspond to
-      /// X86::SHLDxx and X86::SHRDxx instructions.
-      SHLD,
-      SHRD,
-
-      /// Bitwise logical AND of floating point values. This corresponds
-      /// to X86::ANDPS or X86::ANDPD.
-      FAND,
-
-      /// Bitwise logical OR of floating point values. This corresponds
-      /// to X86::ORPS or X86::ORPD.
-      FOR,
-
-      /// Bitwise logical XOR of floating point values. This corresponds
-      /// to X86::XORPS or X86::XORPD.
-      FXOR,
-
-      ///  Bitwise logical ANDNOT of floating point values. This
-      /// corresponds to X86::ANDNPS or X86::ANDNPD.
-      FANDN,
-
-      /// These operations represent an abstract X86 call
-      /// instruction, which includes a bunch of information.  In particular the
-      /// operands of these node are:
-      ///
-      ///     #0 - The incoming token chain
-      ///     #1 - The callee
-      ///     #2 - The number of arg bytes the caller pushes on the stack.
-      ///     #3 - The number of arg bytes the callee pops off the stack.
-      ///     #4 - The value to pass in AL/AX/EAX (optional)
-      ///     #5 - The value to pass in DL/DX/EDX (optional)
-      ///
-      /// The result values of these nodes are:
-      ///
-      ///     #0 - The outgoing token chain
-      ///     #1 - The first register result value (optional)
-      ///     #2 - The second register result value (optional)
-      ///
-      CALL,
-
-      /// Same as call except it adds the NoTrack prefix.
-      NT_CALL,
-
-      /// X86 compare and logical compare instructions.
-      CMP, COMI, UCOMI,
-
-      /// X86 bit-test instructions.
-      BT,
-
-      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
-      /// operand, usually produced by a CMP instruction.
-      SETCC,
-
-      /// X86 Select
-      SELECTS,
-
-      // Same as SETCC except it's materialized with a sbb and the value is all
-      // one's or all zero's.
-      SETCC_CARRY,  // R = carry_bit ? ~0 : 0
-
-      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
-      /// Operands are two FP values to compare; result is a mask of
-      /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
-      FSETCC,
-
-      /// X86 FP SETCC, similar to above, but with output as an i1 mask and
-      /// and a version with SAE.
-      FSETCCM, FSETCCM_SAE,
-
-      /// X86 conditional moves. Operand 0 and operand 1 are the two values
-      /// to select from. Operand 2 is the condition code, and operand 3 is the
-      /// flag operand produced by a CMP or TEST instruction.
-      CMOV,
-
-      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
-      /// is the block to branch if condition is true, operand 2 is the
-      /// condition code, and operand 3 is the flag operand produced by a CMP
-      /// or TEST instruction.
-      BRCOND,
-
-      /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
-      /// operand 1 is the target address.
-      NT_BRIND,
-
-      /// Return with a flag operand. Operand 0 is the chain operand, operand
-      /// 1 is the number of bytes of stack to pop.
-      RET_FLAG,
-
-      /// Return from interrupt. Operand 0 is the number of bytes to pop.
-      IRET,
-
-      /// Repeat fill, corresponds to X86::REP_STOSx.
-      REP_STOS,
-
-      /// Repeat move, corresponds to X86::REP_MOVSx.
-      REP_MOVS,
-
-      /// On Darwin, this node represents the result of the popl
-      /// at function entry, used for PIC code.
-      GlobalBaseReg,
-
-      /// A wrapper node for TargetConstantPool, TargetJumpTable,
-      /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
-      /// MCSymbol and TargetBlockAddress.
-      Wrapper,
-
-      /// Special wrapper used under X86-64 PIC mode for RIP
-      /// relative displacements.
-      WrapperRIP,
-
-      /// Copies a 64-bit value from an MMX vector to the low word
-      /// of an XMM vector, with the high word zero filled.
-      MOVQ2DQ,
-
-      /// Copies a 64-bit value from the low word of an XMM vector
-      /// to an MMX vector.
-      MOVDQ2Q,
-
-      /// Copies a 32-bit value from the low word of a MMX
-      /// vector to a GPR.
-      MMX_MOVD2W,
-
-      /// Copies a GPR into the low 32-bit word of a MMX vector
-      /// and zero out the high word.
-      MMX_MOVW2D,
-
-      /// Extract an 8-bit value from a vector and zero extend it to
-      /// i32, corresponds to X86::PEXTRB.
-      PEXTRB,
-
-      /// Extract a 16-bit value from a vector and zero extend it to
-      /// i32, corresponds to X86::PEXTRW.
-      PEXTRW,
-
-      /// Insert any element of a 4 x float vector into any element
-      /// of a destination 4 x floatvector.
-      INSERTPS,
-
-      /// Insert the lower 8-bits of a 32-bit value to a vector,
-      /// corresponds to X86::PINSRB.
-      PINSRB,
-
-      /// Insert the lower 16-bits of a 32-bit value to a vector,
-      /// corresponds to X86::PINSRW.
-      PINSRW,
-
-      /// Shuffle 16 8-bit values within a vector.
-      PSHUFB,
-
-      /// Compute Sum of Absolute Differences.
-      PSADBW,
-      /// Compute Double Block Packed Sum-Absolute-Differences
-      DBPSADBW,
-
-      /// Bitwise Logical AND NOT of Packed FP values.
-      ANDNP,
-
-      /// Blend where the selector is an immediate.
-      BLENDI,
-
-      /// Dynamic (non-constant condition) vector blend where only the sign bits
-      /// of the condition elements are used. This is used to enforce that the
-      /// condition mask is not valid for generic VSELECT optimizations. This
-      /// is also used to implement the intrinsics.
-      /// Operands are in VSELECT order: MASK, TRUE, FALSE
-      BLENDV,
-
-      /// Combined add and sub on an FP vector.
-      ADDSUB,
-
-      //  FP vector ops with rounding mode.
-      FADD_RND, FADDS, FADDS_RND,
-      FSUB_RND, FSUBS, FSUBS_RND,
-      FMUL_RND, FMULS, FMULS_RND,
-      FDIV_RND, FDIVS, FDIVS_RND,
-      FMAX_SAE, FMAXS_SAE,
-      FMIN_SAE, FMINS_SAE,
-      FSQRT_RND, FSQRTS, FSQRTS_RND,
-
-      // FP vector get exponent.
-      FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
-      // Extract Normalized Mantissas.
-      VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
-      // FP Scale.
-      SCALEF, SCALEF_RND,
-      SCALEFS, SCALEFS_RND,
-
-      // Unsigned Integer average.
-      AVG,
-
-      /// Integer horizontal add/sub.
-      HADD,
-      HSUB,
-
-      /// Floating point horizontal add/sub.
-      FHADD,
-      FHSUB,
-
-      // Detect Conflicts Within a Vector
-      CONFLICT,
-
-      /// Floating point max and min.
-      FMAX, FMIN,
-
-      /// Commutative FMIN and FMAX.
-      FMAXC, FMINC,
-
-      /// Scalar intrinsic floating point max and min.
-      FMAXS, FMINS,
-
-      /// Floating point reciprocal-sqrt and reciprocal approximation.
-      /// Note that these typically require refinement
-      /// in order to obtain suitable precision.
-      FRSQRT, FRCP,
-
-      // AVX-512 reciprocal approximations with a little more precision.
-      RSQRT14, RSQRT14S, RCP14, RCP14S,
-
-      // Thread Local Storage.
-      TLSADDR,
-
-      // Thread Local Storage. A call to get the start address
-      // of the TLS block for the current module.
-      TLSBASEADDR,
-
-      // Thread Local Storage.  When calling to an OS provided
-      // thunk at the address from an earlier relocation.
-      TLSCALL,
+  enum NodeType : unsigned {
+    // Start the numbering where the builtin ops leave off.
+    FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+    /// Bit scan forward.
+    BSF,
+    /// Bit scan reverse.
+    BSR,
+
+    /// X86 funnel/double shift i16 instructions. These correspond to
+    /// X86::SHLDW and X86::SHRDW instructions which have different amt
+    /// modulo rules to generic funnel shifts.
+    /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
+    FSHL,
+    FSHR,
+
+    /// Bitwise logical AND of floating point values. This corresponds
+    /// to X86::ANDPS or X86::ANDPD.
+    FAND,
+
+    /// Bitwise logical OR of floating point values. This corresponds
+    /// to X86::ORPS or X86::ORPD.
+    FOR,
+
+    /// Bitwise logical XOR of floating point values. This corresponds
+    /// to X86::XORPS or X86::XORPD.
+    FXOR,
+
+    ///  Bitwise logical ANDNOT of floating point values. This
+    /// corresponds to X86::ANDNPS or X86::ANDNPD.
+    FANDN,
+
+    /// These operations represent an abstract X86 call
+    /// instruction, which includes a bunch of information.  In particular the
+    /// operands of these node are:
+    ///
+    ///     #0 - The incoming token chain
+    ///     #1 - The callee
+    ///     #2 - The number of arg bytes the caller pushes on the stack.
+    ///     #3 - The number of arg bytes the callee pops off the stack.
+    ///     #4 - The value to pass in AL/AX/EAX (optional)
+    ///     #5 - The value to pass in DL/DX/EDX (optional)
+    ///
+    /// The result values of these nodes are:
+    ///
+    ///     #0 - The outgoing token chain
+    ///     #1 - The first register result value (optional)
+    ///     #2 - The second register result value (optional)
+    ///
+    CALL,
 
-      // Exception Handling helpers.
-      EH_RETURN,
+    /// Same as call except it adds the NoTrack prefix.
+    NT_CALL,
 
-      // SjLj exception handling setjmp.
-      EH_SJLJ_SETJMP,
+    /// X86 compare and logical compare instructions.
+    CMP,
+    FCMP,
+    COMI,
+    UCOMI,
 
-      // SjLj exception handling longjmp.
-      EH_SJLJ_LONGJMP,
+    /// X86 bit-test instructions.
+    BT,
 
-      // SjLj exception handling dispatch.
-      EH_SJLJ_SETUP_DISPATCH,
+    /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+    /// operand, usually produced by a CMP instruction.
+    SETCC,
 
-      /// Tail call return. See X86TargetLowering::LowerCall for
-      /// the list of operands.
-      TC_RETURN,
+    /// X86 Select
+    SELECTS,
 
-      // Vector move to low scalar and zero higher vector elements.
-      VZEXT_MOVL,
+    // Same as SETCC except it's materialized with a sbb and the value is all
+    // one's or all zero's.
+    SETCC_CARRY, // R = carry_bit ? ~0 : 0
 
-      // Vector integer truncate.
-      VTRUNC,
-      // Vector integer truncate with unsigned/signed saturation.
-      VTRUNCUS, VTRUNCS,
+    /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+    /// Operands are two FP values to compare; result is a mask of
+    /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
+    FSETCC,
 
-      // Masked version of the above. Used when less than a 128-bit result is
-      // produced since the mask only applies to the lower elements and can't
-      // be represented by a select.
-      // SRC, PASSTHRU, MASK
-      VMTRUNC, VMTRUNCUS, VMTRUNCS,
-
-      // Vector FP extend.
-      VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
-
-      // Vector FP round.
-      VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
-
-      // Masked version of above. Used for v2f64->v4f32.
-      // SRC, PASSTHRU, MASK
-      VMFPROUND,
-
-      // 128-bit vector logical left / right shift
-      VSHLDQ, VSRLDQ,
-
-      // Vector shift elements
-      VSHL, VSRL, VSRA,
-
-      // Vector variable shift
-      VSHLV, VSRLV, VSRAV,
-
-      // Vector shift elements by immediate
-      VSHLI, VSRLI, VSRAI,
-
-      // Shifts of mask registers.
-      KSHIFTL, KSHIFTR,
-
-      // Bit rotate by immediate
-      VROTLI, VROTRI,
-
-      // Vector packed double/float comparison.
-      CMPP,
-
-      // Vector integer comparisons.
-      PCMPEQ, PCMPGT,
-
-      // v8i16 Horizontal minimum and position.
-      PHMINPOS,
-
-      MULTISHIFT,
-
-      /// Vector comparison generating mask bits for fp and
-      /// integer signed and unsigned data types.
-      CMPM,
-      // Vector comparison with SAE for FP values
-      CMPM_SAE,
-
-      // Arithmetic operations with FLAGS results.
-      ADD, SUB, ADC, SBB, SMUL, UMUL,
-      OR, XOR, AND,
-
-      // Bit field extract.
-      BEXTR,
-
-      // Zero High Bits Starting with Specified Bit Position.
-      BZHI,
-
-      // X86-specific multiply by immediate.
-      MUL_IMM,
-
-      // Vector sign bit extraction.
-      MOVMSK,
-
-      // Vector bitwise comparisons.
-      PTEST,
-
-      // Vector packed fp sign bitwise comparisons.
-      TESTP,
-
-      // OR/AND test for masks.
-      KORTEST,
-      KTEST,
-
-      // ADD for masks.
-      KADD,
-
-      // Several flavors of instructions with vector shuffle behaviors.
-      // Saturated signed/unnsigned packing.
-      PACKSS,
-      PACKUS,
-      // Intra-lane alignr.
-      PALIGNR,
-      // AVX512 inter-lane alignr.
-      VALIGN,
-      PSHUFD,
-      PSHUFHW,
-      PSHUFLW,
-      SHUFP,
-      // VBMI2 Concat & Shift.
-      VSHLD,
-      VSHRD,
-      VSHLDV,
-      VSHRDV,
-      //Shuffle Packed Values at 128-bit granularity.
-      SHUF128,
-      MOVDDUP,
-      MOVSHDUP,
-      MOVSLDUP,
-      MOVLHPS,
-      MOVHLPS,
-      MOVSD,
-      MOVSS,
-      UNPCKL,
-      UNPCKH,
-      VPERMILPV,
-      VPERMILPI,
-      VPERMI,
-      VPERM2X128,
-
-      // Variable Permute (VPERM).
-      // Res = VPERMV MaskV, V0
-      VPERMV,
-
-      // 3-op Variable Permute (VPERMT2).
-      // Res = VPERMV3 V0, MaskV, V1
-      VPERMV3,
-
-      // Bitwise ternary logic.
-      VPTERNLOG,
-      // Fix Up Special Packed Float32/64 values.
-      VFIXUPIMM, VFIXUPIMM_SAE,
-      VFIXUPIMMS, VFIXUPIMMS_SAE,
-      // Range Restriction Calculation For Packed Pairs of Float32/64 values.
-      VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
-      // Reduce - Perform Reduction Transformation on scalar\packed FP.
-      VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
-      // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
-      // Also used by the legacy (V)ROUND intrinsics where we mask out the
-      // scaling part of the immediate.
-      VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
-      // Tests Types Of a FP Values for packed types.
-      VFPCLASS,
-      // Tests Types Of a FP Values for scalar types.
-      VFPCLASSS,
-
-      // Broadcast (splat) scalar or element 0 of a vector. If the operand is
-      // a vector, this node may change the vector length as part of the splat.
-      VBROADCAST,
-      // Broadcast mask to vector.
-      VBROADCASTM,
-      // Broadcast subvector to vector.
-      SUBV_BROADCAST,
-
-      /// SSE4A Extraction and Insertion.
-      EXTRQI, INSERTQI,
-
-      // XOP arithmetic/logical shifts.
-      VPSHA, VPSHL,
-      // XOP signed/unsigned integer comparisons.
-      VPCOM, VPCOMU,
-      // XOP packed permute bytes.
-      VPPERM,
-      // XOP two source permutation.
-      VPERMIL2,
-
-      // Vector multiply packed unsigned doubleword integers.
-      PMULUDQ,
-      // Vector multiply packed signed doubleword integers.
-      PMULDQ,
-      // Vector Multiply Packed UnsignedIntegers with Round and Scale.
-      MULHRS,
-
-      // Multiply and Add Packed Integers.
-      VPMADDUBSW, VPMADDWD,
-
-      // AVX512IFMA multiply and add.
-      // NOTE: These are different than the instruction and perform
-      // op0 x op1 + op2.
-      VPMADD52L, VPMADD52H,
-
-      // VNNI
-      VPDPBUSD,
-      VPDPBUSDS,
-      VPDPWSSD,
-      VPDPWSSDS,
-
-      // FMA nodes.
-      // We use the target independent ISD::FMA for the non-inverted case.
-      FNMADD,
-      FMSUB,
-      FNMSUB,
-      FMADDSUB,
-      FMSUBADD,
-
-      // FMA with rounding mode.
-      FMADD_RND,
-      FNMADD_RND,
-      FMSUB_RND,
-      FNMSUB_RND,
-      FMADDSUB_RND,
-      FMSUBADD_RND,
-
-      // Compress and expand.
-      COMPRESS,
-      EXPAND,
-
-      // Bits shuffle
-      VPSHUFBITQMB,
-
-      // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
-      SINT_TO_FP_RND, UINT_TO_FP_RND,
-      SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
-      SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
-
-      // Vector float/double to signed/unsigned integer.
-      CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
-      // Scalar float/double to signed/unsigned integer.
-      CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
-
-      // Vector float/double to signed/unsigned integer with truncation.
-      CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
-      // Scalar float/double to signed/unsigned integer with truncation.
-      CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
-
-      // Vector signed/unsigned integer to float/double.
-      CVTSI2P, CVTUI2P,
-
-      // Masked versions of above. Used for v2f64->v4f32.
-      // SRC, PASSTHRU, MASK
-      MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
-      MCVTSI2P, MCVTUI2P,
-
-      // Vector float to bfloat16.
-      // Convert TWO packed single data to one packed BF16 data
-      CVTNE2PS2BF16, 
-      // Convert packed single data to packed BF16 data
-      CVTNEPS2BF16,
-      // Masked version of above.
-      // SRC, PASSTHRU, MASK
-      MCVTNEPS2BF16,
-
-      // Dot product of BF16 pairs to accumulated into
-      // packed single precision.
-      DPBF16PS,
-
-      // Save xmm argument registers to the stack, according to %al. An operator
-      // is needed so that this can be expanded with control flow.
-      VASTART_SAVE_XMM_REGS,
-
-      // Windows's _chkstk call to do stack probing.
-      WIN_ALLOCA,
-
-      // For allocating variable amounts of stack space when using
-      // segmented stacks. Check if the current stacklet has enough space, and
-      // falls back to heap allocation if not.
-      SEG_ALLOCA,
-
-      // Memory barriers.
-      MEMBARRIER,
-      MFENCE,
-
-      // Store FP status word into i16 register.
-      FNSTSW16r,
-
-      // Store contents of %ah into %eflags.
-      SAHF,
-
-      // Get a random integer and indicate whether it is valid in CF.
-      RDRAND,
-
-      // Get a NIST SP800-90B & C compliant random integer and
-      // indicate whether it is valid in CF.
-      RDSEED,
-
-      // Protection keys
-      // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
-      // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
-      // value for ECX.
-      RDPKRU, WRPKRU,
-
-      // SSE42 string comparisons.
-      // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
-      // will emit one or two instructions based on which results are used. If
-      // flags and index/mask this allows us to use a single instruction since
-      // we won't have to pick and opcode for flags. Instead we can rely on the
-      // DAG to CSE everything and decide at isel.
-      PCMPISTR,
-      PCMPESTR,
-
-      // Test if in transactional execution.
-      XTEST,
-
-      // ERI instructions.
-      RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
-      RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
-
-      // Conversions between float and half-float.
-      CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
-
-      // Masked version of above.
-      // SRC, RND, PASSTHRU, MASK
-      MCVTPS2PH,
-
-      // Galois Field Arithmetic Instructions
-      GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
-
-      // LWP insert record.
-      LWPINS,
-
-      // User level wait
-      UMWAIT, TPAUSE,
-
-      // Enqueue Stores Instructions
-      ENQCMD, ENQCMDS,
-
-      // For avx512-vp2intersect
-      VP2INTERSECT,
-
-      /// X86 strict FP compare instructions.
-      STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
-      STRICT_FCMPS,
-
-      // Vector packed double/float comparison.
-      STRICT_CMPP,
-
-      /// Vector comparison generating mask bits for fp and
-      /// integer signed and unsigned data types.
-      STRICT_CMPM,
-
-      // Vector float/double to signed/unsigned integer with truncation.
-      STRICT_CVTTP2SI, STRICT_CVTTP2UI,
-
-      // Vector FP extend.
-      STRICT_VFPEXT,
-
-      // Vector FP round.
-      STRICT_VFPROUND,
-
-      // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
-      // Also used by the legacy (V)ROUND intrinsics where we mask out the
-      // scaling part of the immediate.
-      STRICT_VRNDSCALE,
-
-      // Vector signed/unsigned integer to float/double.
-      STRICT_CVTSI2P, STRICT_CVTUI2P,
-
-      // Compare and swap.
-      LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
-      LCMPXCHG8_DAG,
-      LCMPXCHG16_DAG,
-      LCMPXCHG8_SAVE_EBX_DAG,
-      LCMPXCHG16_SAVE_RBX_DAG,
-
-      /// LOCK-prefixed arithmetic read-modify-write instructions.
-      /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
-      LADD, LSUB, LOR, LXOR, LAND,
-
-      // Load, scalar_to_vector, and zero extend.
-      VZEXT_LOAD,
-
-      // extract_vector_elt, store.
-      VEXTRACT_STORE,
-
-      // scalar broadcast from memory
-      VBROADCAST_LOAD,
-
-      // Store FP control world into i16 memory.
-      FNSTCW16m,
-
-      /// This instruction implements FP_TO_SINT with the
-      /// integer destination in memory and a FP reg source.  This corresponds
-      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
-      /// has two inputs (token chain and address) and two outputs (int value
-      /// and token chain). Memory VT specifies the type to store to.
-      FP_TO_INT_IN_MEM,
-
-      /// This instruction implements SINT_TO_FP with the
-      /// integer source in memory and FP reg result.  This corresponds to the
-      /// X86::FILD*m instructions. It has two inputs (token chain and address)
-      /// and two outputs (FP value and token chain). FILD_FLAG also produces a
-      /// flag). The integer source type is specified by the memory VT.
-      FILD,
-      FILD_FLAG,
-
-      /// This instruction implements a fp->int store from FP stack
-      /// slots. This corresponds to the fist instruction. It takes a
-      /// chain operand, value to store, address, and glue. The memory VT
-      /// specifies the type to store as.
-      FIST,
-
-      /// This instruction implements an extending load to FP stack slots.
-      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
-      /// operand, and ptr to load from. The memory VT specifies the type to
-      /// load from.
-      FLD,
+    /// X86 FP SETCC, similar to above, but with output as an i1 mask and
+    /// and a version with SAE.
+    FSETCCM,
+    FSETCCM_SAE,
 
-      /// This instruction implements a truncating store from FP stack
-      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
-      /// chain operand, value to store, address, and glue. The memory VT
-      /// specifies the type to store as.
-      FST,
-
-      /// This instruction grabs the address of the next argument
-      /// from a va_list. (reads and modifies the va_list in memory)
-      VAARG_64,
-
-      // Vector truncating store with unsigned/signed saturation
-      VTRUNCSTOREUS, VTRUNCSTORES,
-      // Vector truncating masked store with unsigned/signed saturation
-      VMTRUNCSTOREUS, VMTRUNCSTORES,
-
-      // X86 specific gather and scatter
-      MGATHER, MSCATTER,
-
-      // WARNING: Do not add anything in the end unless you want the node to
-      // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
-      // opcodes will be thought as target memory ops!
-    };
+    /// X86 conditional moves. Operand 0 and operand 1 are the two values
+    /// to select from. Operand 2 is the condition code, and operand 3 is the
+    /// flag operand produced by a CMP or TEST instruction.
+    CMOV,
+
+    /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+    /// is the block to branch if condition is true, operand 2 is the
+    /// condition code, and operand 3 is the flag operand produced by a CMP
+    /// or TEST instruction.
+    BRCOND,
+
+    /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+    /// operand 1 is the target address.
+    NT_BRIND,
+
+    /// Return with a flag operand. Operand 0 is the chain operand, operand
+    /// 1 is the number of bytes of stack to pop.
+    RET_FLAG,
+
+    /// Return from interrupt. Operand 0 is the number of bytes to pop.
+    IRET,
+
+    /// Repeat fill, corresponds to X86::REP_STOSx.
+    REP_STOS,
+
+    /// Repeat move, corresponds to X86::REP_MOVSx.
+    REP_MOVS,
+
+    /// On Darwin, this node represents the result of the popl
+    /// at function entry, used for PIC code.
+    GlobalBaseReg,
+
+    /// A wrapper node for TargetConstantPool, TargetJumpTable,
+    /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+    /// MCSymbol and TargetBlockAddress.
+    Wrapper,
+
+    /// Special wrapper used under X86-64 PIC mode for RIP
+    /// relative displacements.
+    WrapperRIP,
+
+    /// Copies a 64-bit value from an MMX vector to the low word
+    /// of an XMM vector, with the high word zero filled.
+    MOVQ2DQ,
+
+    /// Copies a 64-bit value from the low word of an XMM vector
+    /// to an MMX vector.
+    MOVDQ2Q,
+
+    /// Copies a 32-bit value from the low word of a MMX
+    /// vector to a GPR.
+    MMX_MOVD2W,
+
+    /// Copies a GPR into the low 32-bit word of a MMX vector
+    /// and zero out the high word.
+    MMX_MOVW2D,
+
+    /// Extract an 8-bit value from a vector and zero extend it to
+    /// i32, corresponds to X86::PEXTRB.
+    PEXTRB,
+
+    /// Extract a 16-bit value from a vector and zero extend it to
+    /// i32, corresponds to X86::PEXTRW.
+    PEXTRW,
+
+    /// Insert any element of a 4 x float vector into any element
+    /// of a destination 4 x floatvector.
+    INSERTPS,
+
+    /// Insert the lower 8-bits of a 32-bit value to a vector,
+    /// corresponds to X86::PINSRB.
+    PINSRB,
+
+    /// Insert the lower 16-bits of a 32-bit value to a vector,
+    /// corresponds to X86::PINSRW.
+    PINSRW,
+
+    /// Shuffle 16 8-bit values within a vector.
+    PSHUFB,
+
+    /// Compute Sum of Absolute Differences.
+    PSADBW,
+    /// Compute Double Block Packed Sum-Absolute-Differences
+    DBPSADBW,
+
+    /// Bitwise Logical AND NOT of Packed FP values.
+    ANDNP,
+
+    /// Blend where the selector is an immediate.
+    BLENDI,
+
+    /// Dynamic (non-constant condition) vector blend where only the sign bits
+    /// of the condition elements are used. This is used to enforce that the
+    /// condition mask is not valid for generic VSELECT optimizations. This
+    /// is also used to implement the intrinsics.
+    /// Operands are in VSELECT order: MASK, TRUE, FALSE
+    BLENDV,
+
+    /// Combined add and sub on an FP vector.
+    ADDSUB,
+
+    //  FP vector ops with rounding mode.
+    FADD_RND,
+    FADDS,
+    FADDS_RND,
+    FSUB_RND,
+    FSUBS,
+    FSUBS_RND,
+    FMUL_RND,
+    FMULS,
+    FMULS_RND,
+    FDIV_RND,
+    FDIVS,
+    FDIVS_RND,
+    FMAX_SAE,
+    FMAXS_SAE,
+    FMIN_SAE,
+    FMINS_SAE,
+    FSQRT_RND,
+    FSQRTS,
+    FSQRTS_RND,
+
+    // FP vector get exponent.
+    FGETEXP,
+    FGETEXP_SAE,
+    FGETEXPS,
+    FGETEXPS_SAE,
+    // Extract Normalized Mantissas.
+    VGETMANT,
+    VGETMANT_SAE,
+    VGETMANTS,
+    VGETMANTS_SAE,
+    // FP Scale.
+    SCALEF,
+    SCALEF_RND,
+    SCALEFS,
+    SCALEFS_RND,
+
+    // Unsigned Integer average.
+    AVG,
+
+    /// Integer horizontal add/sub.
+    HADD,
+    HSUB,
+
+    /// Floating point horizontal add/sub.
+    FHADD,
+    FHSUB,
+
+    // Detect Conflicts Within a Vector
+    CONFLICT,
+
+    /// Floating point max and min.
+    FMAX,
+    FMIN,
+
+    /// Commutative FMIN and FMAX.
+    FMAXC,
+    FMINC,
+
+    /// Scalar intrinsic floating point max and min.
+    FMAXS,
+    FMINS,
+
+    /// Floating point reciprocal-sqrt and reciprocal approximation.
+    /// Note that these typically require refinement
+    /// in order to obtain suitable precision.
+    FRSQRT,
+    FRCP,
+
+    // AVX-512 reciprocal approximations with a little more precision.
+    RSQRT14,
+    RSQRT14S,
+    RCP14,
+    RCP14S,
+
+    // Thread Local Storage.
+    TLSADDR,
+
+    // Thread Local Storage. A call to get the start address
+    // of the TLS block for the current module.
+    TLSBASEADDR,
+
+    // Thread Local Storage.  When calling to an OS provided
+    // thunk at the address from an earlier relocation.
+    TLSCALL,
+
+    // Exception Handling helpers.
+    EH_RETURN,
+
+    // SjLj exception handling setjmp.
+    EH_SJLJ_SETJMP,
+
+    // SjLj exception handling longjmp.
+    EH_SJLJ_LONGJMP,
+
+    // SjLj exception handling dispatch.
+    EH_SJLJ_SETUP_DISPATCH,
+
+    /// Tail call return. See X86TargetLowering::LowerCall for
+    /// the list of operands.
+    TC_RETURN,
+
+    // Vector move to low scalar and zero higher vector elements.
+    VZEXT_MOVL,
+
+    // Vector integer truncate.
+    VTRUNC,
+    // Vector integer truncate with unsigned/signed saturation.
+    VTRUNCUS,
+    VTRUNCS,
+
+    // Masked version of the above. Used when less than a 128-bit result is
+    // produced since the mask only applies to the lower elements and can't
+    // be represented by a select.
+    // SRC, PASSTHRU, MASK
+    VMTRUNC,
+    VMTRUNCUS,
+    VMTRUNCS,
+
+    // Vector FP extend.
+    VFPEXT,
+    VFPEXT_SAE,
+    VFPEXTS,
+    VFPEXTS_SAE,
+
+    // Vector FP round.
+    VFPROUND,
+    VFPROUND_RND,
+    VFPROUNDS,
+    VFPROUNDS_RND,
+
+    // Masked version of above. Used for v2f64->v4f32.
+    // SRC, PASSTHRU, MASK
+    VMFPROUND,
+
+    // 128-bit vector logical left / right shift
+    VSHLDQ,
+    VSRLDQ,
+
+    // Vector shift elements
+    VSHL,
+    VSRL,
+    VSRA,
+
+    // Vector variable shift
+    VSHLV,
+    VSRLV,
+    VSRAV,
+
+    // Vector shift elements by immediate
+    VSHLI,
+    VSRLI,
+    VSRAI,
+
+    // Shifts of mask registers.
+    KSHIFTL,
+    KSHIFTR,
+
+    // Bit rotate by immediate
+    VROTLI,
+    VROTRI,
+
+    // Vector packed double/float comparison.
+    CMPP,
+
+    // Vector integer comparisons.
+    PCMPEQ,
+    PCMPGT,
+
+    // v8i16 Horizontal minimum and position.
+    PHMINPOS,
+
+    MULTISHIFT,
+
+    /// Vector comparison generating mask bits for fp and
+    /// integer signed and unsigned data types.
+    CMPM,
+    // Vector comparison with SAE for FP values
+    CMPM_SAE,
+
+    // Arithmetic operations with FLAGS results.
+    ADD,
+    SUB,
+    ADC,
+    SBB,
+    SMUL,
+    UMUL,
+    OR,
+    XOR,
+    AND,
+
+    // Bit field extract.
+    BEXTR,
+
+    // Zero High Bits Starting with Specified Bit Position.
+    BZHI,
+
+    // Parallel extract and deposit.
+    PDEP,
+    PEXT,
+
+    // X86-specific multiply by immediate.
+    MUL_IMM,
+
+    // Vector sign bit extraction.
+    MOVMSK,
+
+    // Vector bitwise comparisons.
+    PTEST,
+
+    // Vector packed fp sign bitwise comparisons.
+    TESTP,
+
+    // OR/AND test for masks.
+    KORTEST,
+    KTEST,
+
+    // ADD for masks.
+    KADD,
+
+    // Several flavors of instructions with vector shuffle behaviors.
+    // Saturated signed/unnsigned packing.
+    PACKSS,
+    PACKUS,
+    // Intra-lane alignr.
+    PALIGNR,
+    // AVX512 inter-lane alignr.
+    VALIGN,
+    PSHUFD,
+    PSHUFHW,
+    PSHUFLW,
+    SHUFP,
+    // VBMI2 Concat & Shift.
+    VSHLD,
+    VSHRD,
+    VSHLDV,
+    VSHRDV,
+    // Shuffle Packed Values at 128-bit granularity.
+    SHUF128,
+    MOVDDUP,
+    MOVSHDUP,
+    MOVSLDUP,
+    MOVLHPS,
+    MOVHLPS,
+    MOVSD,
+    MOVSS,
+    UNPCKL,
+    UNPCKH,
+    VPERMILPV,
+    VPERMILPI,
+    VPERMI,
+    VPERM2X128,
+
+    // Variable Permute (VPERM).
+    // Res = VPERMV MaskV, V0
+    VPERMV,
+
+    // 3-op Variable Permute (VPERMT2).
+    // Res = VPERMV3 V0, MaskV, V1
+    VPERMV3,
+
+    // Bitwise ternary logic.
+    VPTERNLOG,
+    // Fix Up Special Packed Float32/64 values.
+    VFIXUPIMM,
+    VFIXUPIMM_SAE,
+    VFIXUPIMMS,
+    VFIXUPIMMS_SAE,
+    // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+    VRANGE,
+    VRANGE_SAE,
+    VRANGES,
+    VRANGES_SAE,
+    // Reduce - Perform Reduction Transformation on scalar\packed FP.
+    VREDUCE,
+    VREDUCE_SAE,
+    VREDUCES,
+    VREDUCES_SAE,
+    // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+    // Also used by the legacy (V)ROUND intrinsics where we mask out the
+    // scaling part of the immediate.
+    VRNDSCALE,
+    VRNDSCALE_SAE,
+    VRNDSCALES,
+    VRNDSCALES_SAE,
+    // Tests Types Of a FP Values for packed types.
+    VFPCLASS,
+    // Tests Types Of a FP Values for scalar types.
+    VFPCLASSS,
+
+    // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+    // a vector, this node may change the vector length as part of the splat.
+    VBROADCAST,
+    // Broadcast mask to vector.
+    VBROADCASTM,
+    // Broadcast subvector to vector.
+    SUBV_BROADCAST,
+
+    /// SSE4A Extraction and Insertion.
+    EXTRQI,
+    INSERTQI,
+
+    // XOP arithmetic/logical shifts.
+    VPSHA,
+    VPSHL,
+    // XOP signed/unsigned integer comparisons.
+    VPCOM,
+    VPCOMU,
+    // XOP packed permute bytes.
+    VPPERM,
+    // XOP two source permutation.
+    VPERMIL2,
+
+    // Vector multiply packed unsigned doubleword integers.
+    PMULUDQ,
+    // Vector multiply packed signed doubleword integers.
+    PMULDQ,
+    // Vector Multiply Packed UnsignedIntegers with Round and Scale.
+    MULHRS,
+
+    // Multiply and Add Packed Integers.
+    VPMADDUBSW,
+    VPMADDWD,
+
+    // AVX512IFMA multiply and add.
+    // NOTE: These are different than the instruction and perform
+    // op0 x op1 + op2.
+    VPMADD52L,
+    VPMADD52H,
+
+    // VNNI
+    VPDPBUSD,
+    VPDPBUSDS,
+    VPDPWSSD,
+    VPDPWSSDS,
+
+    // FMA nodes.
+    // We use the target independent ISD::FMA for the non-inverted case.
+    FNMADD,
+    FMSUB,
+    FNMSUB,
+    FMADDSUB,
+    FMSUBADD,
+
+    // FMA with rounding mode.
+    FMADD_RND,
+    FNMADD_RND,
+    FMSUB_RND,
+    FNMSUB_RND,
+    FMADDSUB_RND,
+    FMSUBADD_RND,
+
+    // Compress and expand.
+    COMPRESS,
+    EXPAND,
+
+    // Bits shuffle
+    VPSHUFBITQMB,
+
+    // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+    SINT_TO_FP_RND,
+    UINT_TO_FP_RND,
+    SCALAR_SINT_TO_FP,
+    SCALAR_UINT_TO_FP,
+    SCALAR_SINT_TO_FP_RND,
+    SCALAR_UINT_TO_FP_RND,
+
+    // Vector float/double to signed/unsigned integer.
+    CVTP2SI,
+    CVTP2UI,
+    CVTP2SI_RND,
+    CVTP2UI_RND,
+    // Scalar float/double to signed/unsigned integer.
+    CVTS2SI,
+    CVTS2UI,
+    CVTS2SI_RND,
+    CVTS2UI_RND,
+
+    // Vector float/double to signed/unsigned integer with truncation.
+    CVTTP2SI,
+    CVTTP2UI,
+    CVTTP2SI_SAE,
+    CVTTP2UI_SAE,
+    // Scalar float/double to signed/unsigned integer with truncation.
+    CVTTS2SI,
+    CVTTS2UI,
+    CVTTS2SI_SAE,
+    CVTTS2UI_SAE,
+
+    // Vector signed/unsigned integer to float/double.
+    CVTSI2P,
+    CVTUI2P,
+
+    // Masked versions of above. Used for v2f64->v4f32.
+    // SRC, PASSTHRU, MASK
+    MCVTP2SI,
+    MCVTP2UI,
+    MCVTTP2SI,
+    MCVTTP2UI,
+    MCVTSI2P,
+    MCVTUI2P,
+
+    // Vector float to bfloat16.
+    // Convert TWO packed single data to one packed BF16 data
+    CVTNE2PS2BF16,
+    // Convert packed single data to packed BF16 data
+    CVTNEPS2BF16,
+    // Masked version of above.
+    // SRC, PASSTHRU, MASK
+    MCVTNEPS2BF16,
+
+    // Dot product of BF16 pairs to accumulated into
+    // packed single precision.
+    DPBF16PS,
+
+    // Save xmm argument registers to the stack, according to %al. An operator
+    // is needed so that this can be expanded with control flow.
+    VASTART_SAVE_XMM_REGS,
+
+    // Windows's _chkstk call to do stack probing.
+    WIN_ALLOCA,
+
+    // For allocating variable amounts of stack space when using
+    // segmented stacks. Check if the current stacklet has enough space, and
+    // falls back to heap allocation if not.
+    SEG_ALLOCA,
+
+    // For allocating stack space when using stack clash protector.
+    // Allocation is performed by block, and each block is probed.
+    PROBED_ALLOCA,
+
+    // Memory barriers.
+    MEMBARRIER,
+    MFENCE,
+
+    // Get a random integer and indicate whether it is valid in CF.
+    RDRAND,
+
+    // Get a NIST SP800-90B & C compliant random integer and
+    // indicate whether it is valid in CF.
+    RDSEED,
+
+    // Protection keys
+    // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+    // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+    // value for ECX.
+    RDPKRU,
+    WRPKRU,
+
+    // SSE42 string comparisons.
+    // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
+    // will emit one or two instructions based on which results are used. If
+    // flags and index/mask this allows us to use a single instruction since
+    // we won't have to pick and opcode for flags. Instead we can rely on the
+    // DAG to CSE everything and decide at isel.
+    PCMPISTR,
+    PCMPESTR,
+
+    // Test if in transactional execution.
+    XTEST,
+
+    // ERI instructions.
+    RSQRT28,
+    RSQRT28_SAE,
+    RSQRT28S,
+    RSQRT28S_SAE,
+    RCP28,
+    RCP28_SAE,
+    RCP28S,
+    RCP28S_SAE,
+    EXP2,
+    EXP2_SAE,
+
+    // Conversions between float and half-float.
+    CVTPS2PH,
+    CVTPH2PS,
+    CVTPH2PS_SAE,
+
+    // Masked version of above.
+    // SRC, RND, PASSTHRU, MASK
+    MCVTPS2PH,
+
+    // Galois Field Arithmetic Instructions
+    GF2P8AFFINEINVQB,
+    GF2P8AFFINEQB,
+    GF2P8MULB,
+
+    // LWP insert record.
+    LWPINS,
+
+    // User level wait
+    UMWAIT,
+    TPAUSE,
+
+    // Enqueue Stores Instructions
+    ENQCMD,
+    ENQCMDS,
+
+    // For avx512-vp2intersect
+    VP2INTERSECT,
+
+    /// X86 strict FP compare instructions.
+    STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+    STRICT_FCMPS,
+
+    // Vector packed double/float comparison.
+    STRICT_CMPP,
+
+    /// Vector comparison generating mask bits for fp and
+    /// integer signed and unsigned data types.
+    STRICT_CMPM,
+
+    // Vector float/double to signed/unsigned integer with truncation.
+    STRICT_CVTTP2SI,
+    STRICT_CVTTP2UI,
+
+    // Vector FP extend.
+    STRICT_VFPEXT,
+
+    // Vector FP round.
+    STRICT_VFPROUND,
+
+    // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+    // Also used by the legacy (V)ROUND intrinsics where we mask out the
+    // scaling part of the immediate.
+    STRICT_VRNDSCALE,
+
+    // Vector signed/unsigned integer to float/double.
+    STRICT_CVTSI2P,
+    STRICT_CVTUI2P,
+
+    // Strict FMA nodes.
+    STRICT_FNMADD,
+    STRICT_FMSUB,
+    STRICT_FNMSUB,
+
+    // Conversions between float and half-float.
+    STRICT_CVTPS2PH,
+    STRICT_CVTPH2PS,
+
+    // Compare and swap.
+    LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+    LCMPXCHG8_DAG,
+    LCMPXCHG16_DAG,
+    LCMPXCHG8_SAVE_EBX_DAG,
+    LCMPXCHG16_SAVE_RBX_DAG,
+
+    /// LOCK-prefixed arithmetic read-modify-write instructions.
+    /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+    LADD,
+    LSUB,
+    LOR,
+    LXOR,
+    LAND,
+
+    // Load, scalar_to_vector, and zero extend.
+    VZEXT_LOAD,
+
+    // extract_vector_elt, store.
+    VEXTRACT_STORE,
+
+    // scalar broadcast from memory
+    VBROADCAST_LOAD,
+
+    // Store FP control world into i16 memory.
+    FNSTCW16m,
+
+    /// This instruction implements FP_TO_SINT with the
+    /// integer destination in memory and a FP reg source.  This corresponds
+    /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+    /// has two inputs (token chain and address) and two outputs (int value
+    /// and token chain). Memory VT specifies the type to store to.
+    FP_TO_INT_IN_MEM,
+
+    /// This instruction implements SINT_TO_FP with the
+    /// integer source in memory and FP reg result.  This corresponds to the
+    /// X86::FILD*m instructions. It has two inputs (token chain and address)
+    /// and two outputs (FP value and token chain). The integer source type is
+    /// specified by the memory VT.
+    FILD,
+
+    /// This instruction implements a fp->int store from FP stack
+    /// slots. This corresponds to the fist instruction. It takes a
+    /// chain operand, value to store, address, and glue. The memory VT
+    /// specifies the type to store as.
+    FIST,
+
+    /// This instruction implements an extending load to FP stack slots.
+    /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+    /// operand, and ptr to load from. The memory VT specifies the type to
+    /// load from.
+    FLD,
+
+    /// This instruction implements a truncating store from FP stack
+    /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+    /// chain operand, value to store, address, and glue. The memory VT
+    /// specifies the type to store as.
+    FST,
+
+    /// This instruction grabs the address of the next argument
+    /// from a va_list. (reads and modifies the va_list in memory)
+    VAARG_64,
+
+    // Vector truncating store with unsigned/signed saturation
+    VTRUNCSTOREUS,
+    VTRUNCSTORES,
+    // Vector truncating masked store with unsigned/signed saturation
+    VMTRUNCSTOREUS,
+    VMTRUNCSTORES,
+
+    // X86 specific gather and scatter
+    MGATHER,
+    MSCATTER,
+
+    // WARNING: Do not add anything in the end unless you want the node to
+    // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+    // opcodes will be thought as target memory ops!
+  };
   } // end namespace X86ISD
 
   /// Define some predicates that are used for node matching.
@@ -717,7 +844,10 @@ namespace llvm {
 
     /// If Op is a constant whose elements are all the same constant or
     /// undefined, return true and return the constant value in \p SplatVal.
-    bool isConstantSplat(SDValue Op, APInt &SplatVal);
+    /// If we have undef bits that don't cover an entire element, we treat these
+    /// as zero if AllowPartialUndefs is set, else we fail and return false.
+    bool isConstantSplat(SDValue Op, APInt &SplatVal,
+                         bool AllowPartialUndefs = true);
   } // end namespace X86
 
   //===--------------------------------------------------------------------===//
@@ -756,19 +886,7 @@ namespace llvm {
     unsigned getByValTypeAlignment(Type *Ty,
                                    const DataLayout &DL) const override;
 
-    /// Returns the target specific optimal type for load
-    /// and store operations as a result of memset, memcpy, and memmove
-    /// lowering. If DstAlign is zero that means it's safe to destination
-    /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
-    /// means there isn't a need to check it against alignment requirement,
-    /// probably because the source does not need to be loaded. If 'IsMemset' is
-    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
-    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
-    /// source is constant so it does not need to be loaded.
-    /// It returns EVT::Other if the type should be determined using generic
-    /// target-independent logic.
-    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
-                            bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+    EVT getOptimalMemOpType(const MemOp &Op,
                             const AttributeList &FuncAttributes) const override;
 
     /// Returns true if it's safe to use load / store of the
@@ -805,19 +923,6 @@ namespace llvm {
 
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-    // Return true if it is profitable to combine a BUILD_VECTOR with a
-    // stride-pattern to a shuffle and a truncate.
-    // Example of such a combine:
-    // v4i32 build_vector((extract_elt V, 1),
-    //                    (extract_elt V, 3),
-    //                    (extract_elt V, 5),
-    //                    (extract_elt V, 7))
-    //  -->
-    // v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to
-    // v4i64)
-    bool isDesirableToCombineBuildVectorToShuffleTruncate(
-        ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
-
     /// Return true if the target has native support for
     /// the specified value type and it is 'desirable' to use the type for the
     /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
@@ -830,15 +935,12 @@ namespace llvm {
     /// and some i16 instructions are slow.
     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
 
-    /// Return 1 if we can compute the negated form of the specified expression
-    /// for the same cost as the expression itself, or 2 if we can compute the
-    /// negated form more cheaply than the expression itself. Else return 0.
-    char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
-                            bool ForCodeSize, unsigned Depth) const override;
-
-    /// If isNegatibleForFree returns true, return the newly negated expression.
+    /// Return the newly negated expression if the cost is not expensive and
+    /// set the cost in \p Cost to indicate that if it is cheaper or neutral to
+    /// do the negation.
     SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                  bool LegalOperations, bool ForCodeSize,
+                                 NegatibleCost &Cost,
                                  unsigned Depth) const override;
 
     MachineBasicBlock *
@@ -934,7 +1036,8 @@ namespace llvm {
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
 
-    bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+    bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+                                      const APInt &DemandedElts,
                                       TargetLoweringOpt &TLO) const override;
 
     /// Determine which of the bits specified in Mask are known to be either
@@ -958,6 +1061,12 @@ namespace llvm {
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth) const override;
 
+    bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
+                                                    const APInt &DemandedElts,
+                                                    unsigned MaskIndex,
+                                                    TargetLoweringOpt &TLO,
+                                                    unsigned Depth) const;
+
     bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                            const APInt &DemandedBits,
                                            const APInt &DemandedElts,
@@ -1047,6 +1156,8 @@ namespace llvm {
     int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS) const override;
 
+    /// This is used to enable splatted operand transforms for vector shifts
+    /// and vector funnel shifts.
     bool isVectorShiftByScalarCheap(Type *Ty) const override;
 
     /// Add x86-specific opcodes to the default list.
@@ -1075,6 +1186,10 @@ namespace llvm {
     bool isZExtFree(EVT VT1, EVT VT2) const override;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
 
+    bool shouldSinkOperands(Instruction *I,
+                            SmallVectorImpl<Use *> &Ops) const override;
+    bool shouldConvertPhiType(Type *From, Type *To) const override;
+
     /// Return true if folding a vector load into ExtVal (a sign, zero, or any
     /// extend node) is profitable.
     bool isVectorLoadExtDesirable(SDValue) const override;
@@ -1171,7 +1286,8 @@ namespace llvm {
     /// Overflow nodes should get combined/lowered to optimal instructions
     /// (they should allow eliminating explicit compares by getting flags from
     /// math ops).
-    bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
+    bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+                              bool MathUsed) const override;
 
     bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
                                       unsigned AddrSpace) const override {
@@ -1194,12 +1310,12 @@ namespace llvm {
 
     /// If a physical register, this returns the register that receives the
     /// exception address on entry to an EH pad.
-    unsigned
+    Register
     getExceptionPointerRegister(const Constant *PersonalityFn) const override;
 
     /// If a physical register, this returns the register that receives the
     /// exception typeid on entry to a landing pad.
-    unsigned
+    Register
     getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
 
     virtual bool needsFixedCatchObjects() const override;
@@ -1227,8 +1343,10 @@ namespace llvm {
     /// offset as appropriate.
     Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
 
-    std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
-                                          SDValue StackSlot,
+    std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
+                                          SDValue Chain, SDValue Pointer,
+                                          MachinePointerInfo PtrInfo,
+                                          Align Alignment,
                                           SelectionDAG &DAG) const;
 
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
@@ -1236,6 +1354,8 @@ namespace llvm {
     /// Customize the preferred legalization strategy for certain types.
     LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
 
+    bool softPromoteHalfType() const override { return true; }
+
     MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                       EVT VT) const override;
 
@@ -1251,6 +1371,8 @@ namespace llvm {
 
     bool supportSwiftError() const override;
 
+    bool hasStackProbeSymbol(MachineFunction &MF) const override;
+    bool hasInlineStackProbe(MachineFunction &MF) const override;
     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
 
     unsigned getStackProbeSize(MachineFunction &MF) const;
@@ -1314,7 +1436,7 @@ namespace llvm {
     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
                              const SDLoc &dl, SelectionDAG &DAG,
                              const CCValAssign &VA,
-                             ISD::ArgFlagsTy Flags) const;
+                             ISD::ArgFlagsTy Flags, bool isByval) const;
 
     // Call lowering helpers.
 
@@ -1340,8 +1462,9 @@ namespace llvm {
 
     unsigned getAddressSpace(void) const;
 
-    SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned,
+    SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
                             SDValue &Chain) const;
+    SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
 
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1365,8 +1488,8 @@ namespace llvm {
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -1431,7 +1554,7 @@ namespace llvm {
     const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
 
     TargetLoweringBase::AtomicExpansionKind
-    shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+    shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
     TargetLoweringBase::AtomicExpansionKind
     shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
@@ -1464,26 +1587,23 @@ namespace llvm {
     MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
                                          MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
-                                           MachineBasicBlock *BB) const;
-
     MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
-                                           MachineBasicBlock *BB) const;
-
     MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
                                             MachineBasicBlock *BB) const;
 
+    MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
+                                               MachineBasicBlock *BB) const;
+
     MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;
 
     MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
-                                            MachineBasicBlock *BB) const;
+    MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
+                                                MachineBasicBlock *BB) const;
 
     MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
                                         MachineBasicBlock *MBB) const;
@@ -1497,32 +1617,25 @@ namespace llvm {
     MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
                                                  MachineBasicBlock *MBB) const;
 
-    MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
-                                     MachineBasicBlock *MBB) const;
-
     MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const;
 
-    /// Convert a comparison if required by the subtarget.
-    SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
-
     /// Emit flags for the given setcc condition and operands. Also returns the
     /// corresponding X86 condition code constant in X86CC.
     SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                               const SDLoc &dl, SelectionDAG &DAG,
-                              SDValue &X86CC, SDValue &Chain,
-                              bool IsSignaling) const;
+                              SDValue &X86CC) const;
 
     /// Check if replacement of SQRT with RSQRT should be disabled.
-    bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+    bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
 
     /// Use rsqrt* to speed up sqrt calculations.
-    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+    SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                             int &RefinementSteps, bool &UseOneConstNR,
                             bool Reciprocal) const override;
 
     /// Use rcp* to speed up fdiv calculations.
-    SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+    SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                              int &RefinementSteps) const override;
 
     /// Reassociate floating point divisions into multiply by reciprocal.
@@ -1537,101 +1650,14 @@ namespace llvm {
                              const TargetLibraryInfo *libInfo);
   } // end namespace X86
 
-  // Base class for all X86 non-masked store operations.
-  class X86StoreSDNode : public MemSDNode {
-  public:
-    X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
-                   SDVTList VTs, EVT MemVT,
-                   MachineMemOperand *MMO)
-      :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
-    const SDValue &getValue() const { return getOperand(1); }
-    const SDValue &getBasePtr() const { return getOperand(2); }
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VTRUNCSTORES ||
-        N->getOpcode() == X86ISD::VTRUNCSTOREUS;
-    }
-  };
-
-  // Base class for all X86 masked store operations.
-  // The class has the same order of operands as MaskedStoreSDNode for
-  // convenience.
-  class X86MaskedStoreSDNode : public MemSDNode {
-  public:
-    X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
-                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                         MachineMemOperand *MMO)
-      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
-
-    const SDValue &getValue()   const { return getOperand(1); }
-    const SDValue &getBasePtr() const { return getOperand(2); }
-    const SDValue &getMask()    const { return getOperand(3); }
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
-        N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
-    }
-  };
-
-  // X86 Truncating Store with Signed saturation.
-  class TruncSStoreSDNode : public X86StoreSDNode {
-  public:
-    TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
-                        SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
-      : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VTRUNCSTORES;
-    }
-  };
-
-  // X86 Truncating Store with Unsigned saturation.
-  class TruncUSStoreSDNode : public X86StoreSDNode {
-  public:
-    TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
-                      SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
-      : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
-    }
-  };
-
-  // X86 Truncating Masked Store with Signed saturation.
-  class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
-  public:
-    MaskedTruncSStoreSDNode(unsigned Order,
-                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                         MachineMemOperand *MMO)
-      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VMTRUNCSTORES;
-    }
-  };
-
-  // X86 Truncating Masked Store with Unsigned saturation.
-  class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
-  public:
-    MaskedTruncUSStoreSDNode(unsigned Order,
-                            const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                            MachineMemOperand *MMO)
-      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
-    }
-  };
-
   // X86 specific Gather/Scatter nodes.
   // The class has the same order of operands as MaskedGatherScatterSDNode for
   // convenience.
-  class X86MaskedGatherScatterSDNode : public MemSDNode {
+  class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
   public:
-    X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
-                                 const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                                 MachineMemOperand *MMO)
-        : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
+    // This is a intended as a utility and should never be directly created.
+    X86MaskedGatherScatterSDNode() = delete;
+    ~X86MaskedGatherScatterSDNode() = delete;
 
     const SDValue &getBasePtr() const { return getOperand(3); }
     const SDValue &getIndex()   const { return getOperand(4); }
@@ -1646,11 +1672,6 @@ namespace llvm {
 
   class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
   public:
-    X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
-                          EVT MemVT, MachineMemOperand *MMO)
-        : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
-                                       MMO) {}
-
     const SDValue &getPassThru() const { return getOperand(1); }
 
     static bool classof(const SDNode *N) {
@@ -1660,11 +1681,6 @@ namespace llvm {
 
   class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
   public:
-    X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
-                           EVT MemVT, MachineMemOperand *MMO)
-        : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
-                                       MMO) {}
-
     const SDValue &getValue() const { return getOperand(1); }
 
     static bool classof(const SDNode *N) {
@@ -1673,47 +1689,15 @@ namespace llvm {
   };
 
   /// Generate unpacklo/unpackhi shuffle mask.
-  template <typename T = int>
-  void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
-                               bool Unary) {
-    assert(Mask.empty() && "Expected an empty shuffle mask vector");
-    int NumElts = VT.getVectorNumElements();
-    int NumEltsInLane = 128 / VT.getScalarSizeInBits();
-    for (int i = 0; i < NumElts; ++i) {
-      unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
-      int Pos = (i % NumEltsInLane) / 2 + LaneStart;
-      Pos += (Unary ? 0 : NumElts * (i % 2));
-      Pos += (Lo ? 0 : NumEltsInLane / 2);
-      Mask.push_back(Pos);
-    }
-  }
-
-  /// Helper function to scale a shuffle or target shuffle mask, replacing each
-  /// mask index with the scaled sequential indices for an equivalent narrowed
-  /// mask. This is the reverse process to canWidenShuffleElements, but can
-  /// always succeed.
-  template <typename T>
-  void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask,
-                        SmallVectorImpl<T> &ScaledMask) {
-    assert(0 < Scale && "Unexpected scaling factor");
-    size_t NumElts = Mask.size();
-    ScaledMask.assign(NumElts * Scale, -1);
-
-    for (size_t i = 0; i != NumElts; ++i) {
-      int M = Mask[i];
-
-      // Repeat sentinel values in every mask element.
-      if (M < 0) {
-        for (size_t s = 0; s != Scale; ++s)
-          ScaledMask[(Scale * i) + s] = M;
-        continue;
-      }
-
-      // Scale mask element and increment across each mask element.
-      for (size_t s = 0; s != Scale; ++s)
-        ScaledMask[(Scale * i) + s] = (Scale * M) + s;
-    }
-  }
+  void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+                               bool Unary);
+
+  /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+  /// imposed by AVX and specific to the unary pattern. Example:
+  /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+  /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+  void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H