Diffstat (limited to 'contrib/llvm/lib/Target/X86')
 contrib/llvm/lib/Target/X86/X86.td                      |  41
 contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp         |  10
 contrib/llvm/lib/Target/X86/X86ISelLowering.cpp         | 387
 contrib/llvm/lib/Target/X86/X86InstrAVX512.td           |  39
 contrib/llvm/lib/Target/X86/X86InstrSSE.td              |  74
 contrib/llvm/lib/Target/X86/X86Subtarget.h              |   2
 contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp  |  57
 contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h    |   3
 8 files changed, 514 insertions(+), 99 deletions(-)
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index dc18a59a30ba..83a23d4ad680 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", "Use 8-bit divide for positive values less than 256">; -def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", "HasSlowDivide64", "true", - "Use 16-bit divide for positive values less than 65536">; + "Use 32-bit divide for positive values less than 2^32">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; @@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, + FeatureSlowDivide64, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, @@ -760,6 +761,42 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; +// TODO: The scheduler model falls to BTVER2 model. +// The znver1 model has to be put in place. +// Zen +def: ProcessorModel<"znver1", BtVer2Model, [ + FeatureADX, + FeatureAES, + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureCLFLUSHOPT, + FeatureCMPXCHG16B, + FeatureF16C, + FeatureFMA, + FeatureFSGSBase, + FeatureFXSR, + FeatureFastLZCNT, + FeatureLAHFSAHF, + FeatureLZCNT, + FeatureMMX, + FeatureMOVBE, + FeatureMWAITX, + FeaturePCLMUL, + FeaturePOPCNT, + FeaturePRFCHW, + FeatureRDRAND, + FeatureRDSEED, + FeatureSHA, + FeatureSMAP, + FeatureSSE4A, + FeatureSlowSHLD, + FeatureX87, + FeatureXSAVE, + FeatureXSAVEC, + FeatureXSAVEOPT, + FeatureXSAVES]>; + def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 8b66790679d9..8ab4c0616880 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -183,16 +183,6 @@ namespace { void PreprocessISelDAG() override; - inline bool immSext8(SDNode *N) const { - return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); - } - - // True if the 64-bit immediate fits in a 32-bit sign-extended field. - inline bool i64immSExt32(SDNode *N) const { - uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); - return (int64_t)v == (int32_t)v; - } - // Include the pieces autogenerated from the target description. #include "X86GenDAGISel.inc" diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index db76ddf04c06..787dff99367e 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2. + // Bypass expensive divides and use cheaper ones. 
if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) - addBypassSlowDiv(64, 16); + addBypassSlowDiv(64, 32); } if (Subtarget.isTargetKnownWindowsMSVC() || @@ -1280,6 +1280,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); // FIXME. This commands are available on SSE/AVX2, add relevant patterns. setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal); @@ -1306,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - if (Subtarget.hasDQI()) { - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); - } + for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); @@ -8090,6 +8089,37 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, return Zeroable; } +// The Shuffle result is as follow: +// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. +// Each Zeroable's element correspond to a particular Mask's element. +// As described in computeZeroableShuffleElements function. +// +// The function looks for a sub-mask that the nonzero elements are in +// increasing order. If such sub-mask exist. The function returns true. +static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, + ArrayRef<int> Mask,const EVT &VectorType, + bool &IsZeroSideLeft) { + int NextElement = -1; + // Check if the Mask's nonzero elements are in increasing order. + for (int i = 0, e = Zeroable.size(); i < e; i++) { + // Checks if the mask's zeros elements are built from only zeros. + if (Mask[i] == -1) + return false; + if (Zeroable[i]) + continue; + // Find the lowest non zero element + if (NextElement == -1) { + NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; + IsZeroSideLeft = NextElement != 0; + } + // Exit if the mask's non zero elements are not in increasing order. + if (NextElement != Mask[i]) + return false; + NextElement++; + } + return true; +} + /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, @@ -8145,6 +8175,46 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl); + +// Function convertBitVectorToUnsigned - The function gets SmallBitVector +// as argument and convert him to unsigned. 
+// The output of the function is not(zeroable) +static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) { + unsigned convertBit = 0; + for (int i = 0, e = Zeroable.size(); i < e; i++) + convertBit |= !(Zeroable[i]) << i; + return convertBit; +} + +// X86 has dedicated shuffle that can be lowered to VEXPAND +static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, + const SmallBitVector &Zeroable, + ArrayRef<int> Mask, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsLeftZeroSide = true; + if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), + IsLeftZeroSide)) + return SDValue(); + unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable); + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); + unsigned NumElts = VT.getVectorNumElements(); + assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && + "Unexpected number of vector elements"); + SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), + Subtarget, DAG, DL); + SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); + SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; + return DAG.getNode(ISD::VSELECT, DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); +} + // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, @@ -12159,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. @@ -12222,12 +12297,17 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Shift; - // If we have VLX support, we can use VALIGN. - if (Subtarget.hasVLX()) + // If we have VLX support, we can use VALIGN or VEXPAND. + if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) @@ -12328,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. @@ -12392,12 +12477,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Shift; - // If we have VLX support, we can use VALIGN. - if (Subtarget.hasVLX()) + // If we have VLX support, we can use VALIGN or EXPAND. 
+ if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) @@ -12754,6 +12844,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12796,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12832,6 +12928,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } @@ -12889,6 +12989,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } @@ -12953,6 +13057,10 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } @@ -13089,9 +13197,9 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the requisite ISA extensions for that element type are available. 
switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: @@ -15187,13 +15295,13 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - assert(InVT.getVectorElementType() == MVT::i1); + if (InVT.getVectorElementType() != MVT::i1) + return SDValue(); // Extend VT if the target is 256 or 128bit vector and VLX is not supported. MVT ExtVT = VT; @@ -15910,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } } + // Sometimes flags can be set either with an AND or with an SRL/SHL + // instruction. SRL/SHL variant should be preferred for masks longer than this + // number of bits. + const int ShiftToAndMaxMaskWidth = 32; + const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. @@ -15958,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. - if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && + if (ZeroCheck && Op->hasOneUse() && isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); @@ -15968,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); - if (!Mask.isSignedIntN(32)) // Avoid large immediates. + if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); @@ -15977,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. + // because a TEST instruction will be better. However, AND should be + // preferred if the instruction can be combined into ANDN. 
if (!hasNonFlagsUse(Op)) { SDValue Op0 = ArithOp->getOperand(0); SDValue Op1 = ArithOp->getOperand(1); EVT VT = ArithOp.getValueType(); bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; + bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); + + // If we cannot select an ANDN instruction, check if we can replace + // AND+IMM64 with a shift before giving up. This is possible for masks + // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. + if (!isProperAndn) { + if (!ZeroCheck) + break; + + assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized"); + auto *CN = dyn_cast<ConstantSDNode>(Op1); + if (!CN) + break; + + const APInt &Mask = CN->getAPIntValue(); + if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) + break; // Prefer TEST instruction. + + unsigned BitWidth = Mask.getBitWidth(); + unsigned LeadingOnes = Mask.countLeadingOnes(); + unsigned TrailingZeros = Mask.countTrailingZeros(); + + if (LeadingOnes + TrailingZeros == BitWidth) { + assert(TrailingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); + break; + } + + unsigned LeadingZeros = Mask.countLeadingZeros(); + unsigned TrailingOnes = Mask.countTrailingOnes(); + + if (LeadingZeros + TrailingOnes == BitWidth) { + assert(LeadingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); + break; + } - // But if we can combine this into an ANDN operation, then create an AND - // now and allow it to be pattern matched into an ANDN. 
- if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) break; + } } LLVM_FALLTHROUGH; case ISD::SUB: @@ -16008,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { - if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + if (!NeedTruncation && ZeroCheck) { if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } @@ -17283,17 +17438,20 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - - if (VT.is512BitVector() && InVTElt != MVT::i1) { + if (VT.is512BitVector() && InVTElt != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } - assert (InVTElt == MVT::i1 && "Unexpected vector type"); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + if (InVTElt != MVT::i1) + return SDValue(); + + MVT ExtVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + SDValue V; if (Subtarget.hasDQI()) { V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); @@ -17302,7 +17460,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); - if (VT.is512BitVector()) + if (ExtVT == VT) return V; } @@ -18418,13 +18576,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - SDValue Op0 = ShAmt.getOperand(0); - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64); + ShAmt = ShAmt.getOperand(0); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else { SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -21643,14 +21801,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } if (VT == MVT::v16i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { + (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. 
- if (Subtarget.hasSSE41()) { + if (VT.is512BitVector()) { + // On AVX512BW targets we make use of the fact that VSELECT lowers + // to a masked blend which selects bytes based just on the sign bit + // extracted to a mask. + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } else if (Subtarget.hasSSE41()) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); @@ -28633,17 +28803,20 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() != ISD::VSELECT) return SDValue(); + assert(CondVT.isVector() && "Vector select expects a vector selector!"); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); - // Check if the first operand is all zeros.This situation only - // applies to avx512. - if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) { + // Check if the first operand is all zeros and Cond type is vXi1. + // This situation only applies to avx512. + if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && + CondVT.getVectorElementType() == MVT::i1) { //Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); + DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), + DL, CondVT)); //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); } - assert(CondVT.isVector() && "Vector select expects a vector selector!"); // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must @@ -29282,11 +29455,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Combine: +/// Combine brcond/cmov/setcc/.. based on comparing the result of +/// atomic_load_add to use EFLAGS produced by the addition +/// directly if possible. For example: +/// +/// (setcc (cmp (atomic_load_add x, -C) C), COND_E) +/// becomes: +/// (setcc (LADD x, -C), COND_E) +/// +/// and /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) -/// to: +/// becomes: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) -/// i.e., reusing the EFLAGS produced by the LOCKed instruction. +/// /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG) { @@ -29295,7 +29476,13 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); - // This only applies to variations of the common case: + // Can't replace the cmp if it has more uses than the one we're looking at. + // FIXME: We would like to be able to handle this, but would need to make sure + // all uses were updated. 
+ if (!Cmp.hasOneUse()) + return SDValue(); + + // This applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) @@ -29314,8 +29501,9 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, return SDValue(); auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); - if (!CmpRHSC || CmpRHSC->getZExtValue() != 0) + if (!CmpRHSC) return SDValue(); + APInt Comparand = CmpRHSC->getAPIntValue(); const unsigned Opc = CmpLHS.getOpcode(); @@ -29331,16 +29519,19 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; - if (CC == X86::COND_S && Addend == 1) + if (Comparand == -Addend) { + // No change to CC. + } else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) { CC = X86::COND_LE; - else if (CC == X86::COND_NS && Addend == 1) + } else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) { CC = X86::COND_G; - else if (CC == X86::COND_G && Addend == -1) + } else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) { CC = X86::COND_GE; - else if (CC == X86::COND_LE && Addend == -1) + } else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) { CC = X86::COND_L; - else + } else { return SDValue(); + } SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), @@ -31083,10 +31274,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, /// Check if truncation with saturation form type \p SrcVT to \p DstVT /// is valid for the given \p Subtarget. -static bool -isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { +static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, + const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return false; + + // FIXME: Scalar type may be supported if we move it to vector register. + if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) + return false; + EVT SrcElVT = SrcVT.getScalarType(); EVT DstElVT = DstVT.getScalarType(); if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) @@ -31098,40 +31294,69 @@ isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { return false; } +/// Return true if VPACK* instruction can be used for the given types +/// and it is avalable on \p Subtarget. +static bool +isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { + if (Subtarget.hasSSE2()) + // v16i16 -> v16i8 + if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8) + return true; + if (Subtarget.hasSSE41()) + // v8i32 -> v8i16 + if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16) + return true; + return false; +} + /// Detect a pattern of truncation with saturation: /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// Return the source value to be truncated or SDValue() if the pattern was not -/// matched or the unsupported on the current target. -static SDValue -detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) { +/// matched. +static SDValue detectUSatPattern(SDValue In, EVT VT) { if (In.getOpcode() != ISD::UMIN) return SDValue(); - EVT InVT = In.getValueType(); - // FIXME: Scalar type may be supported if we move it to vector register. - if (!InVT.isVector() || !InVT.isSimple()) - return SDValue(); - - if (!isSATValidOnSubtarget(InVT, VT, Subtarget)) - return SDValue(); - //Saturation with truncation. We truncate from InVT to VT. 
- assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && + assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && "Unexpected types for truncate operation"); - SDValue SrcVal; APInt C; - if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C)) - SrcVal = In.getOperand(1); - else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) - SrcVal = In.getOperand(0); - else + if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. + return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) : + SDValue(); + } + return SDValue(); +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// The types should allow to use VPMOVUS* instruction on AVX512. +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectAVX512USatPattern(SDValue In, EVT VT, + const X86Subtarget &Subtarget) { + if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return SDValue(); + return detectUSatPattern(In, VT); +} - // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according - // the element size of the destination type. - return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ? - SrcVal : SDValue(); +static SDValue +combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue USatVal = detectUSatPattern(In, VT); + if (USatVal) { + if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL); + return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi); + } + } + return SDValue(); } /// This function detects the AVG pattern between vectors of unsigned i8/i16, @@ -31701,7 +31926,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); if (SDValue Val = - detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); @@ -32326,9 +32551,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; - // Try the truncation with unsigned saturation. - if (SDValue Val = detectUSatPattern(Src, VT, Subtarget)) - return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val); + // Try to combine truncation with unsigned saturation. + if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) + return Val; // The bitcast source is a direct mmx result. 
// Detect bitcasts between i32 to x86mmx diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index d44d1395f243..230d1700b8d2 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5957,6 +5957,30 @@ let Predicates = [HasAVX512] in { (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>; } // Predicates = [HasAVX512] +// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang +// which produce unnecessary vmovs{s,d} instructions +let Predicates = [HasAVX512] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; +} // Predicates = [HasAVX512] + // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, @@ -6136,6 +6160,21 @@ def : Pat<(f32 (fpround FR64X:$src)), (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 09971d586a41..1812d01711d1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, InstrItinClass ri = arg_ri; } - // scalar let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< @@ -1923,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, } } // isCodeGenOnly = 1 +// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and +// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : 
Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseAVX] + +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE2] + +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE1] + // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index 92c16214aa4a..d80dc4a9b5e8 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -216,7 +216,7 @@ protected: /// 32-bit divisions and should be used when possible. bool HasSlowDivide32; - /// True if 16-bit divides are significantly faster than + /// True if 32-bit divides are significantly faster than /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 107ed9359376..5715d826862e 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -114,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. 
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd + { ISD::FMUL, MVT::v2f64, 4 }, // mulpd + { ISD::FMUL, MVT::v4f32, 2 }, // mulps + { ISD::FDIV, MVT::f32, 17 }, // divss + { ISD::FDIV, MVT::v4f32, 39 }, // divps + { ISD::FDIV, MVT::f64, 32 }, // divsd + { ISD::FDIV, MVT::v2f64, 69 }, // divpd + { ISD::FADD, MVT::v2f64, 2 }, // addpd + { ISD::FSUB, MVT::v2f64, 2 }, // subpd + // v2i64/v4i64 mul is custom lowered as a series of long + // multiplies(3), shifts(3) and adds(2). + // slm muldq version throughput is 2 + { ISD::MUL, MVT::v2i64, 11 }, + }; + + if (ST->isSLM()) { + if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { + // Check if the operands can be shrinked into a smaller datatype. + bool Op1Signed = false; + unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); + bool Op2Signed = false; + unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); + + bool signedMode = Op1Signed | Op2Signed; + unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); + + if (OpMinSize <= 7) + return LT.first * 3; // pmullw/sext + if (!signedMode && OpMinSize <= 8) + return LT.first * 3; // pmullw/zext + if (OpMinSize <= 15) + return LT.first * 5; // pmullw/pmulhw/pshuf + if (!signedMode && OpMinSize <= 16) + return LT.first * 5; // pmullw/pmulhw/pshuf + } + if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, + LT.second)) { + return LT.first * Entry->Cost; + } + } + if (ISD == ISD::SDIV && Op2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -276,6 +323,10 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. + { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h index c013805f4321..ecaaf951cff7 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -60,7 +60,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); |
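The FeatureSlowDivide64 change above (feature string "idivq-to-divl" together with addBypassSlowDiv(64, 32)) guards 64-bit divisions with a runtime check and falls back to the 32-bit divider when both operands fit in 32 bits. A minimal scalar sketch of the resulting control flow; the function name is illustrative, not part of the patch:

#include <cstdint>

// Rough model of the code shape produced by addBypassSlowDiv(64, 32) at -O2
// on subtargets with FeatureSlowDivide64 (e.g. the SNB-derived and znver1
// models above).
uint64_t udiv64_bypassed(uint64_t a, uint64_t b) {
  if (((a | b) >> 32) == 0) {
    // Both operands fit in 32 bits: use the much cheaper 32-bit divide (divl).
    return static_cast<uint32_t>(a) / static_cast<uint32_t>(b);
  }
  return a / b; // Full 64-bit divide (divq).
}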
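lowerVectorShuffleToEXPAND above accepts shuffles whose non-zero lanes take consecutive source elements in increasing order while the remaining lanes are zeroable, and lowers them to a masked VEXPAND from a zero vector. A scalar model of that semantics, assuming an 8-lane single-precision vector; the helper name is illustrative:

#include <array>
#include <cstdint>

// Scalar model of a masked expand (VEXPANDPS-style): successive source
// elements are scattered to the set bits of the compressed mask and the
// other lanes are zeroed - the shuffle shape isNonZeroElementsInOrder accepts.
std::array<float, 8> expand_ps(const std::array<float, 8> &src, uint8_t mask) {
  std::array<float, 8> out{}; // zero-initialized, like the ZeroVector operand
  int next = 0;               // next consecutive source element to place
  for (int lane = 0; lane < 8; ++lane)
    if (mask & (1u << lane))
      out[lane] = src[next++];
  return out;
}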
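The EmitTest change replaces an AND with a wide contiguous mask by a plain shift when only the zero flag is consumed, avoiding an immediate that TEST cannot encode in 32 bits. The equivalences it relies on, sketched with hypothetical helper names:

#include <cstdint>

// When only ZF matters (COND_E / COND_NE), a test against a mask made of
// leading ones + trailing zeros can be replaced by a single SRL...
bool top_byte_zero_via_and(uint64_t x) {
  return (x & 0xFF00000000000000ULL) == 0; // needs movabs + and/test
}
bool top_byte_zero_via_shift(uint64_t x) {
  return (x >> 56) == 0;                   // same ZF from one shr
}
// ...and symmetrically, leading zeros + trailing ones can use SHL:
bool low_half_zero_via_shift(uint64_t x) {
  return (x << 32) == 0;                   // same as (x & 0xFFFFFFFFULL) == 0
}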
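combineSetCCAtomicArith is generalized above from comparing the atomic result against zero to the case where the comparand equals the negated addend, so the flags produced by the LOCK'ed add/sub can be reused without a separate compare. A typical source pattern that fits this shape is a reference-count release; the function name is illustrative:

#include <atomic>

// fetch_sub returns the previous value, so "previous == 1" means the counter
// has just reached zero - exactly what ZF of the lock'ed sub reports, letting
// the combine drop the explicit cmp.
bool release_ref(std::atomic<int> &refcount) {
  return refcount.fetch_sub(1, std::memory_order_acq_rel) == 1;
}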
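detectUSatPattern matches a truncate of umin(x, unsigned-max-of-destination); combineTruncateWithUSat then selects VPMOVUS* on suitable AVX512 types or a split plus PACKUS on SSE4.1 for v8i32 -> v8i16. A scalar version of the pattern being matched; the loop form is only an assumption about how such IR typically arises (e.g. after vectorization):

#include <algorithm>
#include <cstdint>

// Unsigned saturating truncate: clamp to the destination's maximum, then
// narrow.  Once vectorized to v8i32 -> v8i16, this is the shape that can now
// lower to PACKUSDW (SSE4.1) or VPMOVUSDW (AVX512).
void trunc_usat_u32_to_u16(const uint32_t *src, uint16_t *dst, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = static_cast<uint16_t>(std::min<uint32_t>(src[i], 0xFFFFu));
}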