Diffstat (limited to 'contrib/llvm/lib/Target/X86')
-rw-r--r--  contrib/llvm/lib/Target/X86/X86.td                       |  41
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp          |  10
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.cpp          | 387
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrAVX512.td            |  39
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrSSE.td               |  74
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Subtarget.h               |   2
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp   |  57
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h     |   3
8 files changed, 514 insertions, 99 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
index dc18a59a30ba..83a23d4ad680 100644
--- a/contrib/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
-def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
- "Use 16-bit divide for positive values less than 65536">;
+ "Use 32-bit divide for positive values less than 2^32">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
@@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
+ FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
@@ -760,6 +761,42 @@ def : Proc<"bdver4", [
FeatureMWAITX
]>;
+// TODO: The scheduler model falls back to the BTVER2 model for now;
+// a dedicated znver1 scheduler model still has to be put in place.
+// Zen
+def: ProcessorModel<"znver1", BtVer2Model, [
+ FeatureADX,
+ FeatureAES,
+ FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureCLFLUSHOPT,
+ FeatureCMPXCHG16B,
+ FeatureF16C,
+ FeatureFMA,
+ FeatureFSGSBase,
+ FeatureFXSR,
+ FeatureFastLZCNT,
+ FeatureLAHFSAHF,
+ FeatureLZCNT,
+ FeatureMMX,
+ FeatureMOVBE,
+ FeatureMWAITX,
+ FeaturePCLMUL,
+ FeaturePOPCNT,
+ FeaturePRFCHW,
+ FeatureRDRAND,
+ FeatureRDSEED,
+ FeatureSHA,
+ FeatureSMAP,
+ FeatureSSE4A,
+ FeatureSlowSHLD,
+ FeatureX87,
+ FeatureXSAVE,
+ FeatureXSAVEC,
+ FeatureXSAVEOPT,
+ FeatureXSAVES]>;
+
def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8b66790679d9..8ab4c0616880 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -183,16 +183,6 @@ namespace {
void PreprocessISelDAG() override;
- inline bool immSext8(SDNode *N) const {
- return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
- }
-
- // True if the 64-bit immediate fits in a 32-bit sign-extended field.
- inline bool i64immSExt32(SDNode *N) const {
- uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
- return (int64_t)v == (int32_t)v;
- }
-
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index db76ddf04c06..787dff99367e 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
- // Bypass expensive divides on Atom when compiling with O2.
+ // Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
- addBypassSlowDiv(64, 16);
+ addBypassSlowDiv(64, 32);
}
if (Subtarget.isTargetKnownWindowsMSVC() ||
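The hunk above, together with the FeatureSlowDivide64 change in X86.td, switches the 64-bit slow-divide bypass from 16-bit to 32-bit divides. As a rough source-level sketch of what the bypass amounts to (illustrative only, not code from this patch; the helper name is made up):

    // Conceptual model of addBypassSlowDiv(64, 32): when both operands of a
    // 64-bit unsigned divide fit in 32 bits, a cheaper 32-bit DIV is used
    // behind a runtime check.
    #include <cstdint>
    uint64_t div_bypass(uint64_t a, uint64_t b) {
      if (((a | b) >> 32) == 0)             // both values fit in 32 bits
        return (uint32_t)a / (uint32_t)b;   // fast 32-bit divide
      return a / b;                         // full 64-bit divide
    }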
@@ -1280,6 +1280,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
// FIXME. These commands are available on SSE/AVX2, add relevant patterns.
setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
@@ -1306,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
- }
+
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
@@ -8090,6 +8089,37 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
return Zeroable;
}
+// The shuffle result has the form:
+// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
+// ascending order. Each element of Zeroable corresponds to a particular
+// element of Mask, as described in computeZeroableShuffleElements.
+//
+// The function looks for a sub-mask whose nonzero elements are in
+// increasing order and returns true if such a sub-mask exists.
+static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
+ ArrayRef<int> Mask,const EVT &VectorType,
+ bool &IsZeroSideLeft) {
+ int NextElement = -1;
+ // Check if the Mask's nonzero elements are in increasing order.
+ for (int i = 0, e = Zeroable.size(); i < e; i++) {
+    // Check that the mask's zero elements are built from only zeros.
+ if (Mask[i] == -1)
+ return false;
+ if (Zeroable[i])
+ continue;
+ // Find the lowest non zero element
+ if (NextElement == -1) {
+ NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+ IsZeroSideLeft = NextElement != 0;
+ }
+ // Exit if the mask's non zero elements are not in increasing order.
+ if (NextElement != Mask[i])
+ return false;
+ NextElement++;
+ }
+ return true;
+}
+
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
@@ -8145,6 +8175,46 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl);
+
+// Function convertBitVectorToUnsigned - Takes a SmallBitVector as an
+// argument and converts it to an unsigned value.
+// The output of the function is not(Zeroable).
+static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
+ unsigned convertBit = 0;
+ for (int i = 0, e = Zeroable.size(); i < e; i++)
+ convertBit |= !(Zeroable[i]) << i;
+ return convertBit;
+}
+
+// X86 has a dedicated shuffle that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsLeftZeroSide = true;
+ if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+ IsLeftZeroSide))
+ return SDValue();
+ unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+ "Unexpected number of vector elements");
+ SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+ Subtarget, DAG, DL);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+ return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
+}
+
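For reference, a scalar model of the zero-masking VEXPAND semantics this lowering relies on (illustrative sketch, not part of the patch): for a v4i64 shuffle whose result is {0, a[0], 0, a[1]}, Zeroable is {1,0,1,0}, the computed expand mask is 0b1010, and the source vector is expanded into lanes 1 and 3 while the remaining lanes are zeroed.

    // Consecutive source elements are written to the destination lanes whose
    // mask bit is set; lanes with a clear bit become zero (zero-masking form).
    #include <cstdint>
    void expand_v4i64(const int64_t *Src, int64_t *Dst, unsigned Mask) {
      unsigned SrcIdx = 0;
      for (unsigned i = 0; i < 4; ++i)
        Dst[i] = (Mask & (1u << i)) ? Src[SrcIdx++] : 0;
    }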
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -12159,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
@@ -12222,12 +12297,17 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Shift;
- // If we have VLX support, we can use VALIGN.
- if (Subtarget.hasVLX())
+ // If we have VLX support, we can use VALIGN or VEXPAND.
+ if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+ }
+
// Try to use PALIGNR.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
@@ -12328,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
@@ -12392,12 +12477,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Shift;
- // If we have VLX support, we can use VALIGN.
- if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+ if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+ }
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12754,6 +12844,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12796,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
+ V2, DAG, Subtarget))
+ return V;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12832,6 +12928,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
@@ -12889,6 +12989,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
+ V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
@@ -12953,6 +13057,10 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
@@ -13089,9 +13197,9 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
@@ -15187,13 +15295,13 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
- return SDValue();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
+ (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
- assert(InVT.getVectorElementType() == MVT::i1);
+ if (InVT.getVectorElementType() != MVT::i1)
+ return SDValue();
// Extend VT if the target is 256 or 128bit vector and VLX is not supported.
MVT ExtVT = VT;
@@ -15910,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
}
}
+ // Sometimes flags can be set either with an AND or with an SRL/SHL
+  // instruction. The SRL/SHL variant should be preferred for masks longer
+  // than this number of bits.
+ const int ShiftToAndMaxMaskWidth = 32;
+ const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
+
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
@@ -15958,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
- if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+ if (ZeroCheck && Op->hasOneUse() &&
isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
@@ -15968,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
APInt Mask = ArithOp.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
- if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
break;
Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
DAG.getConstant(Mask, dl, VT));
@@ -15977,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
- // because a TEST instruction will be better.
+ // because a TEST instruction will be better. However, AND should be
+ // preferred if the instruction can be combined into ANDN.
if (!hasNonFlagsUse(Op)) {
SDValue Op0 = ArithOp->getOperand(0);
SDValue Op1 = ArithOp->getOperand(1);
EVT VT = ArithOp.getValueType();
bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+ bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
+
+ // If we cannot select an ANDN instruction, check if we can replace
+ // AND+IMM64 with a shift before giving up. This is possible for masks
+ // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
+ if (!isProperAndn) {
+ if (!ZeroCheck)
+ break;
+
+ assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
+ auto *CN = dyn_cast<ConstantSDNode>(Op1);
+ if (!CN)
+ break;
+
+ const APInt &Mask = CN->getAPIntValue();
+ if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
+ break; // Prefer TEST instruction.
+
+ unsigned BitWidth = Mask.getBitWidth();
+ unsigned LeadingOnes = Mask.countLeadingOnes();
+ unsigned TrailingZeros = Mask.countTrailingZeros();
+
+ if (LeadingOnes + TrailingZeros == BitWidth) {
+ assert(TrailingZeros < VT.getSizeInBits() &&
+ "Shift amount should be less than the type width");
+ MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+ SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
+ Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
+ break;
+ }
+
+ unsigned LeadingZeros = Mask.countLeadingZeros();
+ unsigned TrailingOnes = Mask.countTrailingOnes();
+
+ if (LeadingZeros + TrailingOnes == BitWidth) {
+ assert(LeadingZeros < VT.getSizeInBits() &&
+ "Shift amount should be less than the type width");
+ MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+ SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
+ Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
+ break;
+ }
- // But if we can combine this into an ANDN operation, then create an AND
- // now and allow it to be pattern matched into an ANDN.
- if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
break;
+ }
}
LLVM_FALLTHROUGH;
case ISD::SUB:
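The new fallback above rewrites a TEST against a wide contiguous mask as a shift when only the zero flag is consumed, which avoids materializing a 64-bit immediate. The identities it relies on, shown on hypothetical masks (illustrative only, not code from this patch):

    // A mask of contiguous high bits: "x & M != 0" equals "(x >> tz(M)) != 0".
    #include <cstdint>
    bool and_high(uint64_t x) { return (x & 0xFFFF000000000000ULL) != 0; }
    bool srl_high(uint64_t x) { return (x >> 48) != 0; }   // same predicate
    // A mask of contiguous low bits: "x & M != 0" equals "(x << lz(M)) != 0".
    bool and_low(uint64_t x)  { return (x & 0x0000FFFFFFFFFFFFULL) != 0; }
    bool shl_low(uint64_t x)  { return (x << 16) != 0; }   // same predicate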
@@ -16008,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
- if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ if (!NeedTruncation && ZeroCheck) {
if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
return EFLAGS;
}
@@ -17283,17 +17438,20 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
unsigned NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
- return SDValue();
-
- if (VT.is512BitVector() && InVTElt != MVT::i1) {
+ if (VT.is512BitVector() && InVTElt != MVT::i1 &&
+ (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
}
- assert (InVTElt == MVT::i1 && "Unexpected vector type");
- MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ if (InVTElt != MVT::i1)
+ return SDValue();
+
+ MVT ExtVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+
SDValue V;
if (Subtarget.hasDQI()) {
V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
@@ -17302,7 +17460,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
- if (VT.is512BitVector())
+ if (ExtVT == VT)
return V;
}
@@ -18418,13 +18576,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
- SDValue Op0 = ShAmt.getOperand(0);
- Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
- ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64);
+ ShAmt = ShAmt.getOperand(0);
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
+ ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
- ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+ ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
} else {
SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
@@ -21643,14 +21801,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
if (VT == MVT::v16i8 ||
- (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
- if (Subtarget.hasSSE41()) {
+ if (VT.is512BitVector()) {
+ // On AVX512BW targets we make use of the fact that VSELECT lowers
+ // to a masked blend which selects bytes based just on the sign bit
+ // extracted to a mask.
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ } else if (Subtarget.hasSSE41()) {
+ // On SSE41 targets we make use of the fact that VSELECT lowers
+ // to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
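As context for the masked-blend selection above: the byte-element shift is decomposed into conditional shifts by 4, 2 and 1, with each step's condition taken from one bit of the shift amount (read off the sign bit of the selector vector in the actual lowering). A scalar sketch of the idea, not code from this patch:

    // Per-byte variable shift built from three conditional fixed shifts; the
    // vector code performs each "if" as a sign-bit blend (PBLENDVB on
    // SSE4.1/AVX2, a vXi1-masked VSELECT on AVX-512BW).
    #include <cstdint>
    uint8_t shl_var(uint8_t v, uint8_t amt) {
      if (amt & 4) v = (uint8_t)(v << 4);
      if (amt & 2) v = (uint8_t)(v << 2);
      if (amt & 1) v = (uint8_t)(v << 1);
      return v;
    }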
@@ -28633,17 +28803,20 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
+ assert(CondVT.isVector() && "Vector select expects a vector selector!");
+
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
- // Check if the first operand is all zeros.This situation only
- // applies to avx512.
- if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // This situation only applies to avx512.
+ if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
+ CondVT.getVectorElementType() == MVT::i1) {
//Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
+ DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
+ DL, CondVT));
//Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
}
- assert(CondVT.isVector() && "Vector select expects a vector selector!");
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. Ie, the condition operand must
@@ -29282,11 +29455,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Combine:
+/// Combine brcond/cmov/setcc/.. based on comparing the result of
+/// atomic_load_add to use EFLAGS produced by the addition
+/// directly if possible. For example:
+///
+/// (setcc (cmp (atomic_load_add x, -C) C), COND_E)
+/// becomes:
+/// (setcc (LADD x, -C), COND_E)
+///
+/// and
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
-/// to:
+/// becomes:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
-/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+///
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG) {
@@ -29295,7 +29476,13 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
- // This only applies to variations of the common case:
+ // Can't replace the cmp if it has more uses than the one we're looking at.
+ // FIXME: We would like to be able to handle this, but would need to make sure
+ // all uses were updated.
+ if (!Cmp.hasOneUse())
+ return SDValue();
+
+ // This applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
@@ -29314,8 +29501,9 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
return SDValue();
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
- if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
+ if (!CmpRHSC)
return SDValue();
+ APInt Comparand = CmpRHSC->getAPIntValue();
const unsigned Opc = CmpLHS.getOpcode();
@@ -29331,16 +29519,19 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
- if (CC == X86::COND_S && Addend == 1)
+ if (Comparand == -Addend) {
+ // No change to CC.
+ } else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) {
CC = X86::COND_LE;
- else if (CC == X86::COND_NS && Addend == 1)
+ } else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) {
CC = X86::COND_G;
- else if (CC == X86::COND_G && Addend == -1)
+ } else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) {
CC = X86::COND_GE;
- else if (CC == X86::COND_LE && Addend == -1)
+ } else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) {
CC = X86::COND_L;
- else
+ } else {
return SDValue();
+ }
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
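A source-level illustration of the more general case now handled above (Comparand == -Addend), assuming a made-up counter rather than anything from the patch:

    // fetch_add returns the old value, so "old == 5" after adding -5 is the
    // same as "new value == 0"; with this combine the comparison can reuse
    // the ZF produced by the lock-prefixed add instead of a separate CMP.
    #include <atomic>
    bool hits_zero(std::atomic<int> &Counter) {
      return Counter.fetch_add(-5) == 5;
    }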
@@ -31083,10 +31274,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
/// Check if truncation with saturation form type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
-static bool
-isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
+static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
+ const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX512())
return false;
+
+ // FIXME: Scalar type may be supported if we move it to vector register.
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ return false;
+
EVT SrcElVT = SrcVT.getScalarType();
EVT DstElVT = DstVT.getScalarType();
if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
@@ -31098,40 +31294,69 @@ isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
return false;
}
+/// Return true if a VPACK* instruction can be used for the given types
+/// and it is available on \p Subtarget.
+static bool
+isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
+ if (Subtarget.hasSSE2())
+ // v16i16 -> v16i8
+ if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8)
+ return true;
+ if (Subtarget.hasSSE41())
+ // v8i32 -> v8i16
+ if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16)
+ return true;
+ return false;
+}
+
/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched or the unsupported on the current target.
-static SDValue
-detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) {
+/// matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT) {
if (In.getOpcode() != ISD::UMIN)
return SDValue();
- EVT InVT = In.getValueType();
- // FIXME: Scalar type may be supported if we move it to vector register.
- if (!InVT.isVector() || !InVT.isSimple())
- return SDValue();
-
- if (!isSATValidOnSubtarget(InVT, VT, Subtarget))
- return SDValue();
-
//Saturation with truncation. We truncate from InVT to VT.
- assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
+ assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
"Unexpected types for truncate operation");
- SDValue SrcVal;
APInt C;
- if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C))
- SrcVal = In.getOperand(1);
- else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C))
- SrcVal = In.getOperand(0);
- else
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
+    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+    // the element size of the destination type.
+ return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) :
+ SDValue();
+ }
+ return SDValue();
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// The types should allow to use VPMOVUS* instruction on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
+ return detectUSatPattern(In, VT);
+}
- // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
- // the element size of the destination type.
- return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ?
- SrcVal : SDValue();
+static SDValue
+combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue USatVal = detectUSatPattern(In, VT);
+ if (USatVal) {
+ if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL);
+ return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi);
+ }
+ }
+ return SDValue();
}
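The pattern detectUSatPattern matches corresponds to an ordinary clamp-then-truncate in source code; with the new isSATValidOnSSESubtarget path it can be lowered with PACKUSWB/PACKUSDW on pre-AVX-512 targets as well. A scalar sketch (illustrative only, not from the patch):

    // trunc(umin(x, 0xFFFF)) from i32 to i16 is an unsigned saturating
    // truncate; over v8i32 -> v8i16 it maps to VPMOVUSDW on AVX-512 or to a
    // split plus PACKUSDW on SSE4.1.
    #include <cstdint>
    uint16_t usat_trunc(uint32_t x) {
      return (uint16_t)(x < 0xFFFFu ? x : 0xFFFFu);
    }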
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
@@ -31701,7 +31926,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getMemOperand()->getFlags());
if (SDValue Val =
- detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
@@ -32326,9 +32551,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
- // Try the truncation with unsigned saturation.
- if (SDValue Val = detectUSatPattern(Src, VT, Subtarget))
- return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val);
+ // Try to combine truncation with unsigned saturation.
+ if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
index d44d1395f243..230d1700b8d2 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5957,6 +5957,30 @@ let Predicates = [HasAVX512] in {
(VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
} // Predicates = [HasAVX512]
+// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang
+// which produce unnecessary vmovs{s,d} instructions
+let Predicates = [HasAVX512] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+} // Predicates = [HasAVX512]
+
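These patterns (and the analogous AVX/SSE ones added to X86InstrSSE.td below) target the DAG shape clang emits for the scalar conversion intrinsics. A small example of the source involved (illustrative, not taken from the patch):

    // _mm_cvtsi32_ss lowers to scalar_to_vector + X86Movss around the
    // int-to-float convert; the new patterns select a single (v)cvtsi2ss
    // into the destination register instead of a convert plus an extra movss.
    #include <immintrin.h>
    __m128 set_low(__m128 v, int x) {
      return _mm_cvtsi32_ss(v, x);
    }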
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
@@ -6136,6 +6160,21 @@ def : Pat<(f32 (fpround FR64X:$src)),
(COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
(COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
Requires<[HasAVX512]>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
//===----------------------------------------------------------------------===//
// AVX-512 Vector convert from signed/unsigned integer to float/double
// and from float/double to signed/unsigned integer
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
index 09971d586a41..1812d01711d1 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
InstrItinClass ri = arg_ri;
}
-
// scalar
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
@@ -1923,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
}
} // isCodeGenOnly = 1
+// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
+// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
+// vmovs{s,d} instructions
+let Predicates = [UseAVX] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE2]
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE1]
+
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
index 92c16214aa4a..d80dc4a9b5e8 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -216,7 +216,7 @@ protected:
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
- /// True if 16-bit divides are significantly faster than
+ /// True if 32-bit divides are significantly faster than
/// 64-bit divisions and should be used when possible.
bool HasSlowDivide64;
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 107ed9359376..5715d826862e 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -114,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry SLMCostTable[] = {
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ // v2i64/v4i64 mul is custom lowered as a series of long
+ // multiplies(3), shifts(3) and adds(2).
+ // slm muldq version throughput is 2
+ { ISD::MUL, MVT::v2i64, 11 },
+ };
+
+ if (ST->isSLM()) {
+ if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
+      // Check if the operands can be shrunk into a smaller datatype.
+ bool Op1Signed = false;
+ unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
+ bool Op2Signed = false;
+ unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
+
+ bool signedMode = Op1Signed | Op2Signed;
+ unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
+
+ if (OpMinSize <= 7)
+ return LT.first * 3; // pmullw/sext
+ if (!signedMode && OpMinSize <= 8)
+ return LT.first * 3; // pmullw/zext
+ if (OpMinSize <= 15)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ if (!signedMode && OpMinSize <= 16)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ }
+ if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
+ LT.second)) {
+ return LT.first * Entry->Cost;
+ }
+ }
+
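The Silvermont special case above reduces the reported cost of a v4i32 multiply when both operands are known to need at most 8 or 16 bits, since the multiply can then be done with pmullw (or pmullw/pmulhw/pshuf) instead of the 11-cycle pmulld. The kind of loop it is aimed at (illustrative only, hypothetical helper):

    // Both operands are zero-extended from 16 bits, so minRequiredElementSize
    // reports at most 16 unsigned bits and the vectorized multiply is costed
    // as a pmullw/pmulhw/pshuf sequence rather than pmulld on SLM.
    #include <cstdint>
    void mul_narrow(const uint16_t *A, const uint16_t *B, uint32_t *Out, int N) {
      for (int i = 0; i < N; ++i)
        Out[i] = (uint32_t)A[i] * (uint32_t)B[i];
    }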
if (ISD == ISD::SDIV &&
Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
@@ -276,6 +323,10 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
+
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
index c013805f4321..ecaaf951cff7 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -60,7 +60,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);