diff options
Diffstat (limited to 'contrib/llvm/lib/Target')
53 files changed, 1991 insertions, 585 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index e927d58ad612..d472a54d9543 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -18,9 +18,132 @@ namespace llvm { namespace AArch64 { -RegisterBank GPRRegBank; -RegisterBank FPRRegBank; -RegisterBank CCRRegBank; +const uint32_t GPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) | + (1u << AArch64::GPR32spRegClassID) | + (1u << AArch64::GPR32commonRegClassID) | + (1u << AArch64::GPR32sponlyRegClassID) | + (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) | + (1u << AArch64::GPR64spRegClassID) | + (1u << AArch64::GPR64commonRegClassID) | + (1u << AArch64::tcGPR64RegClassID) | + (1u << AArch64::GPR64sponlyRegClassID), + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this is + // tablegenerated. It's only needed because of the hardcoded register class + // limit. + // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t FPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) | + (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) | + (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) | + (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) | + (1u << AArch64::DDDDRegClassID), + // Classes 32-63 + (1u << (AArch64::QQRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQQRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID - + 32)), + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. + // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t CCRCoverageData[] = { + // Classes 0-31 + 1u << AArch64::CCRRegClassID, + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. + // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData); +RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData); +RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData); RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank}; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 74a01835171b..7b581a706fa2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -159,6 +159,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c5b95f282ea8..2244baacca17 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -951,10 +951,7 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; defm CLS : OneOperandData<0b101, "cls">; defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; - -def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; -def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; +defm RBIT : OneOperandData<0b000, "rbit", bitreverse>; def REV16Wr : OneWRegData<0b001, "rev16", UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index a5fd2fbdde19..b292c9c87dcd 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -41,28 +41,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) if (AlreadyInit) return; AlreadyInit = true; - // Initialize the GPR bank. - createRegisterBank(AArch64::GPRRegBankID, "GPR"); - // The GPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI); + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); (void)RBGPR; assert(&AArch64::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(&AArch64::FPRRegBank == &RBFPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); + (void)RBCCR; + assert(&AArch64::CCRRegBank == &RBCCR && + "The order in RegBanks is messed up"); + + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && "Subclass not added?"); assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); - // Initialize the FPR bank. - createRegisterBank(AArch64::FPRRegBankID, "FPR"); // The FPR register bank is fully defined by all the registers in // GR64all + its subclasses. - addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI); - const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); - (void)RBFPR; - assert(&AArch64::FPRRegBank == &RBFPR && - "The order in RegBanks is messed up"); assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && "Subclass not added?"); assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && @@ -70,13 +72,6 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) assert(RBFPR.getSize() == 512 && "FPRs should hold up to 512-bit via QQQQ sequence"); - // Initialize the CCR bank. - createRegisterBank(AArch64::CCRRegBankID, "CCR"); - addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI); - const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); - (void)RBCCR; - assert(&AArch64::CCRRegBank == &RBCCR && - "The order in RegBanks is messed up"); assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && "Class not added?"); assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 1a17691fc584..b8833e5a5552 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -374,7 +374,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -466,28 +466,27 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, unsigned Alignment, unsigned AddressSpace) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && - Src->isVectorTy() && Alignment != 16 && - Src->getVectorElementType()->isIntegerTy(64)) { - // Unaligned stores are extremely inefficient. We don't split - // unaligned v2i64 stores because the negative impact that has shown in - // practice on inlined memcpy code. - // We make v2i64 stores expensive so that we will only vectorize if there + LT.second.is128BitVector() && Alignment < 16) { + // Unaligned stores are extremely inefficient. We don't split all + // unaligned 128-bit stores because the negative impact that has shown in + // practice on inlined block copy code. + // We make such stores expensive so that we will only vectorize if there // are 6 other instructions getting vectorized. - int AmortizationCost = 6; + const int AmortizationCost = 6; return LT.first * 2 * AmortizationCost; } - if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && - Src->getVectorNumElements() < 8) { + if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) && + Ty->getVectorNumElements() < 8) { // We scalarize the loads/stores because there is not v.4b register and we // have to promote the elements to v.4h. - unsigned NumVecElts = Src->getVectorNumElements(); + unsigned NumVecElts = Ty->getVectorNumElements(); unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; // We generate 2 instructions per vector element. return NumVectorizableInstsToAmortize * NumVecElts * 2; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 849fd3d9b44a..18287ed6653f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -102,7 +102,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 730bcdcf7afa..e48c1943cb01 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -434,6 +434,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + + // FIXME: This is only partially true. If we have to do vector compares, any + // SGPR pair can be a condition register. If we have a uniform condition, we + // are better off doing SALU operations, where there is only one SCC. For now, + // we don't have a way of knowing during instruction selection if a condition + // will be uniform and we always use vector compares. Assume we are using + // vector compares until that is fixed. setHasMultipleConditionRegisters(true); // SI at least has hardware support for floating point exceptions, but no way @@ -470,12 +477,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); } //===----------------------------------------------------------------------===// // Target Information //===----------------------------------------------------------------------===// +static bool fnegFoldsIntoOp(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FMA: + case ISD::FMAD: + case ISD::FSIN: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMUL_LEGACY: + return true; + default: + return false; + } +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -2679,8 +2705,93 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, return SDValue(); } +static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, + unsigned Op, + const SDLoc &SL, + SDValue Cond, + SDValue N1, + SDValue N2) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N1.getValueType(); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, + N1.getOperand(0), N2.getOperand(0)); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(Op, SL, VT, NewSelect); +} + +// Pull a free FP operation out of a select so it may fold into uses. +// +// select c, (fneg x), (fneg y) -> fneg (select c, x, y) +// select c, (fneg x), k -> fneg (select c, x, (fneg k)) +// +// select c, (fabs x), (fabs y) -> fabs (select c, x, y) +// select c, (fabs x), +k -> fabs (select c, x, k) +static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) { + SelectionDAG &DAG = DCI.DAG; + SDValue Cond = N.getOperand(0); + SDValue LHS = N.getOperand(1); + SDValue RHS = N.getOperand(2); + + EVT VT = N.getValueType(); + if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || + (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + return distributeOpThroughSelect(DCI, LHS.getOpcode(), + SDLoc(N), Cond, LHS, RHS); + } + + bool Inv = false; + if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { + std::swap(LHS, RHS); + Inv = true; + } + + // TODO: Support vector constants. + ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) { + SDLoc SL(N); + // If one side is an fneg/fabs and the other is a constant, we can push the + // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. + SDValue NewLHS = LHS.getOperand(0); + SDValue NewRHS = RHS; + + // Careful: if the neg can be folded up, don't try to pull it back down. + bool ShouldFoldNeg = true; + + if (NewLHS.hasOneUse()) { + unsigned Opc = NewLHS.getOpcode(); + if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc)) + ShouldFoldNeg = false; + if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) + ShouldFoldNeg = false; + } + + if (ShouldFoldNeg) { + if (LHS.getOpcode() == ISD::FNEG) + NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else if (CRHS->isNegative()) + return SDValue(); + + if (Inv) + std::swap(NewLHS, NewRHS); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, + Cond, NewLHS, NewRHS); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); + } + } + + return SDValue(); +} + + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -2724,6 +2835,129 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); } +SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Opc = N0.getOpcode(); + + // If the input has multiple uses and we can either fold the negate down, or + // the other uses cannot, give up. This both prevents unprofitable + // transformations and infinite loops: we won't repeatedly try to fold around + // a negate that has no 'good' form. + // + // TODO: Check users can fold + if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) + return SDValue(); + + SDLoc SL(N); + switch (Opc) { + case ISD::FADD: { + // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() != ISD::FNEG) + LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + else + LHS = LHS.getOperand(0); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMUL: + case AMDGPUISD::FMUL_LEGACY: { + // (fneg (fmul x, y)) -> (fmul x, (fneg y)) + // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (RHS.getOpcode() == ISD::FNEG) + RHS = RHS.getOperand(0); + else + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMA: + case ISD::FMAD: { + // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) + SDValue LHS = N0.getOperand(0); + SDValue MHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (MHS.getOpcode() == ISD::FNEG) + MHS = MHS.getOperand(0); + else + MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FP_EXTEND: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case ISD::FSIN: + case AMDGPUISD::SIN_HW: { + SDValue CvtSrc = N0.getOperand(0); + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_extend (fneg x))) -> (fp_extend x) + // (fneg (rcp (fneg x))) -> (rcp x) + return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_extend x)) -> (fp_extend (fneg x)) + // (fneg (rcp x)) -> (rcp (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(Opc, SL, VT, Neg); + } + case ISD::FP_ROUND: { + SDValue CvtSrc = N0.getOperand(0); + + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_round (fneg x))) -> (fp_round x) + return DAG.getNode(ISD::FP_ROUND, SL, VT, + CvtSrc.getOperand(0), N0.getOperand(1)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_round x)) -> (fp_round (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); + } + default: + return SDValue(); + } +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2829,6 +3063,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performMulLoHi24Combine(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); + case ISD::FNEG: + return performFNegCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 745c9923de2e..69567aa5f713 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -84,6 +84,7 @@ protected: SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 513df3a9cdf3..59cba636c586 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -629,9 +629,10 @@ def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; def umax_oneuse : HasOneUseBinOp<umax>; def umin_oneuse : HasOneUseBinOp<umin>; -def sub_oneuse : HasOneUseBinOp<sub>; } // Properties = [SDNPCommutative, SDNPAssociative] +def sub_oneuse : HasOneUseBinOp<sub>; + def select_oneuse : HasOneUseTernaryOp<select>; // Special conversion patterns diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a1a352642242..e90487065992 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 1177007644ff..0d83b2a585bf 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -83,7 +83,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); unsigned getCFInstrCost(unsigned Opcode); diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index da9d009c542b..3cf9a1d92469 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -214,7 +214,7 @@ public: } bool isReg() const override { - return isRegKind() && !Reg.Mods.hasModifiers(); + return isRegKind() && !hasModifiers(); } bool isRegOrImmWithInputMods(MVT type) const { @@ -245,6 +245,15 @@ public: return isRegOrImmWithInputMods(MVT::f64); } + bool isVReg() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + isRegClass(AMDGPU::VReg_64RegClassID) || + isRegClass(AMDGPU::VReg_96RegClassID) || + isRegClass(AMDGPU::VReg_128RegClassID) || + isRegClass(AMDGPU::VReg_256RegClassID) || + isRegClass(AMDGPU::VReg_512RegClassID); + } + bool isVReg32OrOff() const { return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } @@ -299,28 +308,32 @@ public: bool isRegClass(unsigned RCID) const; + bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { + return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); + } + bool isSCSrcB16() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16); } bool isSCSrcB32() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32); } bool isSCSrcB64() const { - return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64); + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64); } bool isSCSrcF16() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } bool isSCSrcF32() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32); } bool isSCSrcF64() const { - return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::f64); + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::f64); } bool isSSrcB32() const { @@ -350,27 +363,27 @@ public: } bool isVCSrcB32() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } bool isVCSrcB64() const { - return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64); + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64); } bool isVCSrcB16() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16); } bool isVCSrcF32() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); } bool isVCSrcF64() const { - return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64); + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } bool isVCSrcF16() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16); } bool isVSrcB32() const { @@ -534,6 +547,23 @@ public: addRegOrImmWithInputModsOperands(Inst, N); } + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Modifiers Mods = getModifiers(); + Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand())); + assert(isRegKind()); + addRegOperands(Inst, N); + } + + void addRegWithFPInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasIntModifiers()); + addRegWithInputModsOperands(Inst, N); + } + + void addRegWithIntInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasFPModifiers()); + addRegWithInputModsOperands(Inst, N); + } + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { if (isImm()) addImmOperands(Inst, N); @@ -852,9 +882,12 @@ public: StringRef &Value); OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseReg(OperandVector &Operands); OperandMatchResultTy parseRegOrImm(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands); OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); @@ -1057,7 +1090,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { } bool AMDGPUOperand::isRegClass(unsigned RCID) const { - return isReg() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); + return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { @@ -1468,23 +1501,28 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { - auto res = parseImm(Operands); - if (res != MatchOperand_NoMatch) { - return res; - } - +AMDGPUAsmParser::parseReg(OperandVector &Operands) { if (auto R = parseRegister()) { assert(R->isReg()); R->Reg.IsForcedVOP3 = isForcedVOP3(); Operands.push_back(std::move(R)); return MatchOperand_Success; } - return MatchOperand_ParseFail; + return MatchOperand_NoMatch; } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { + auto res = parseImm(Operands); + if (res != MatchOperand_NoMatch) { + return res; + } + + return parseReg(Operands); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { // XXX: During parsing we can't determine if minus sign means // negate-modifier or negative immediate value. // By default we suppose it is modifier. @@ -1514,7 +1552,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { Abs = true; } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } @@ -1548,7 +1591,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { bool Sext = false; if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { @@ -1561,7 +1604,12 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { Parser.Lex(); } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } @@ -1584,6 +1632,16 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) { + return parseRegOrImmWithFPInputMods(Operands, false); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { + return parseRegOrImmWithIntInputMods(Operands, false); +} + OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { std::unique_ptr<AMDGPUOperand> Reg = parseRegister(); if (Reg) { @@ -3382,7 +3440,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { // Skip it. continue; } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + Op.addRegWithFPInputModsOperands(Inst, 2); } else if (Op.isDPPCtrl()) { Op.addImmOperands(Inst, 1); } else if (Op.isImm()) { @@ -3508,7 +3566,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, // Skip it. continue; } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithInputModsOperands(Inst, 2); + Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments OptionalIdx[Op.getImmTy()] = I; diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 4112ad100584..48c6592ca5b2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -333,11 +333,13 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; -def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; +def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>; +def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 831ac5948a68..a5c0d4923d6b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -25,25 +25,6 @@ using namespace llvm; namespace { -class SIFoldOperands : public MachineFunctionPass { -public: - static char ID; - -public: - SIFoldOperands() : MachineFunctionPass(ID) { - initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Fold Operands"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - struct FoldCandidate { MachineInstr *UseMI; union { @@ -79,6 +60,36 @@ struct FoldCandidate { } }; +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + + void foldOperand(MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; + + void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Fold Operands"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // End anonymous namespace. INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, @@ -88,6 +99,34 @@ char SIFoldOperands::ID = 0; char &llvm::SIFoldOperandsID = SIFoldOperands::ID; +// Wrapper around isInlineConstant that understands special cases when +// instruction types are replaced during operand folding. +static bool isInlineConstantIfFolded(const SIInstrInfo *TII, + const MachineInstr &UseMI, + unsigned OpNo, + const MachineOperand &OpToFold) { + if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) + return true; + + unsigned Opc = UseMI.getOpcode(); + switch (Opc) { + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_F16_e64: { + // Special case for mac. Since this is replaced with mad when folded into + // src2, we need to check the legality for the final instruction. + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (static_cast<int>(OpNo) == Src2Idx) { + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + const MCInstrDesc &MadDesc + = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); + } + } + default: + return false; + } +} + FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } @@ -141,7 +180,7 @@ static bool updateOperand(FoldCandidate &Fold, return false; } -static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, +static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList, const MachineInstr *MI) { for (auto Candidate : FoldList) { if (Candidate.UseMI == MI) @@ -150,7 +189,7 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, return false; } -static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, +static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { @@ -160,7 +199,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. @@ -227,12 +266,12 @@ static bool isUseSafeToFold(const MachineInstr &MI, //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } -static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, - std::vector<FoldCandidate> &FoldList, - SmallVectorImpl<MachineInstr *> &CopiesToReplace, - const SIInstrInfo *TII, const SIRegisterInfo &TRI, - MachineRegisterInfo &MRI) { +void SIFoldOperands::foldOperand( + MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (!isUseSafeToFold(*UseMI, UseOp)) @@ -264,7 +303,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); for (MachineRegisterInfo::use_iterator - RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end(); + RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); RSUse != RSE; ++RSUse) { MachineInstr *RSUseMI = RSUse->getParent(); @@ -272,7 +311,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, continue; foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); + CopiesToReplace); } return; @@ -287,8 +326,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned DestReg = UseMI->getOperand(0).getReg(); const TargetRegisterClass *DestRC = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); + MRI->getRegClass(DestReg) : + TRI->getPhysRegClass(DestReg); unsigned MovOp = TII->getMovOpcode(DestRC); if (MovOp == AMDGPU::COPY) @@ -318,7 +357,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc(); const TargetRegisterClass *FoldRC = - TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), OpToFold.getImm()); @@ -328,8 +367,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); + MRI->getRegClass(UseReg) : + TRI->getPhysRegClass(UseReg); assert(Imm.getBitWidth() == 64); @@ -349,20 +388,51 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, } static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, - int32_t LHS, int32_t RHS) { + uint32_t LHS, uint32_t RHS) { switch (Opcode) { case AMDGPU::V_AND_B32_e64: + case AMDGPU::V_AND_B32_e32: case AMDGPU::S_AND_B32: Result = LHS & RHS; return true; case AMDGPU::V_OR_B32_e64: + case AMDGPU::V_OR_B32_e32: case AMDGPU::S_OR_B32: Result = LHS | RHS; return true; case AMDGPU::V_XOR_B32_e64: + case AMDGPU::V_XOR_B32_e32: case AMDGPU::S_XOR_B32: Result = LHS ^ RHS; return true; + case AMDGPU::V_LSHL_B32_e64: + case AMDGPU::V_LSHL_B32_e32: + case AMDGPU::S_LSHL_B32: + // The instruction ignores the high bits for out of bounds shifts. + Result = LHS << (RHS & 31); + return true; + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32: + Result = RHS << (LHS & 31); + return true; + case AMDGPU::V_LSHR_B32_e64: + case AMDGPU::V_LSHR_B32_e32: + case AMDGPU::S_LSHR_B32: + Result = LHS >> (RHS & 31); + return true; + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32: + Result = RHS >> (LHS & 31); + return true; + case AMDGPU::V_ASHR_I32_e64: + case AMDGPU::V_ASHR_I32_e32: + case AMDGPU::S_ASHR_I32: + Result = static_cast<int32_t>(LHS) >> (RHS & 31); + return true; + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32: + Result = static_cast<int32_t>(RHS) >> (LHS & 31); + return true; default: return false; } @@ -390,33 +460,47 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { stripExtraCopyOperands(MI); } +static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, + MachineOperand &Op) { + if (Op.isReg()) { + // If this has a subregister, it obviously is a register source. + if (Op.getSubReg() != AMDGPU::NoSubRegister) + return &Op; + + MachineInstr *Def = MRI.getVRegDef(Op.getReg()); + if (Def->isMoveImmediate()) { + MachineOperand &ImmSrc = Def->getOperand(1); + if (ImmSrc.isImm()) + return &ImmSrc; + } + } + + return &Op; +} + // Try to simplify operations with a constant that may appear after instruction // selection. +// TODO: See if a frame index with a fixed offset can fold. static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, - MachineInstr *MI) { + MachineInstr *MI, + MachineOperand *ImmOp) { unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) { - MachineOperand &Src0 = MI->getOperand(1); - if (Src0.isImm()) { - Src0.setImm(~Src0.getImm()); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); - return true; - } - - return false; + MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm()); + mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + return true; } - if (!MI->isCommutable()) + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) return false; int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); + MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx)); - MachineOperand *Src0 = &MI->getOperand(Src0Idx); - MachineOperand *Src1 = &MI->getOperand(Src1Idx); if (!Src0->isImm() && !Src1->isImm()) return false; @@ -431,19 +515,26 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI = TII->getRegisterInfo(); bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg()); - Src0->setImm(NewImm); + // Be careful to change the right operand, src0 may belong to a different + // instruction. + MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); return true; } + if (!MI->isCommutable()) + return false; + if (Src0->isImm() && !Src1->isImm()) { std::swap(Src0, Src1); std::swap(Src0Idx, Src1Idx); } int32_t Src1Val = static_cast<int32_t>(Src1->getImm()); - if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) { + if (Opc == AMDGPU::V_OR_B32_e64 || + Opc == AMDGPU::V_OR_B32_e32 || + Opc == AMDGPU::S_OR_B32) { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->RemoveOperand(Src1Idx); @@ -459,6 +550,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, } if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 || + MI->getOpcode() == AMDGPU::V_AND_B32_e32 || MI->getOpcode() == AMDGPU::S_AND_B32) { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 @@ -476,29 +568,136 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, } if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 || + MI->getOpcode() == AMDGPU::V_XOR_B32_e32 || MI->getOpcode() == AMDGPU::S_XOR_B32) { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + return true; } } return false; } +void SIFoldOperands::foldInstOperand(MachineInstr &MI, + MachineOperand &OpToFold) const { + // We need mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + SmallVector<FoldCandidate, 4> FoldList; + MachineOperand &Dst = MI.getOperand(0); + + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + if (FoldingImm) { + unsigned NumLiteralUses = 0; + MachineOperand *NonInlineUse = nullptr; + int NonInlineUseOpNo = -1; + + MachineRegisterInfo::use_iterator NextUse, NextInstUse; + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; Use = NextUse) { + NextUse = std::next(Use); + MachineInstr *UseMI = Use->getParent(); + unsigned OpNo = Use.getOperandNo(); + + // Folding the immediate may reveal operations that can be constant + // folded or replaced with a copy. This can happen for example after + // frame indices are lowered to constants or from splitting 64-bit + // constants. + // + // We may also encounter cases where one or both operands are + // immediates materialized into a register, which would ordinarily not + // be folded due to multiple uses or operand constraints. + + if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { + DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); + + // Some constant folding cases change the same immediate's use to a new + // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user + // again. The same constant folded instruction could also have a second + // use operand. + NextUse = MRI->use_begin(Dst.getReg()); + continue; + } + + // Try to fold any inline immediate uses, and then only fold other + // constants if they have one use. + // + // The legality of the inline immediate must be checked based on the use + // operand, not the defining instruction, because 32-bit instructions + // with 32-bit inline immediate sources may be used to materialize + // constants used in 16-bit operands. + // + // e.g. it is unsafe to fold: + // s_mov_b32 s0, 1.0 // materializes 0x3f800000 + // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else { + if (++NumLiteralUses == 1) { + NonInlineUse = &*Use; + NonInlineUseOpNo = OpNo; + } + } + } + + if (NumLiteralUses == 1) { + MachineInstr *UseMI = NonInlineUse->getParent(); + foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); + } + } else { + // Folding register. + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; ++Use) { + MachineInstr *UseMI = Use->getParent(); + + foldOperand(OpToFold, UseMI, Use.getOperandNo(), + FoldList, CopiesToReplace); + } + } + + MachineFunction *MF = MI.getParent()->getParent(); + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(*MF); + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, *TRI)) { + // Clear kill flags. + if (Fold.isReg()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI->clearKillFlags(Fold.OpToFold->getReg()); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + } + } +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; @@ -512,8 +711,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. + // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) continue; @@ -532,90 +730,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) continue; - // We need mutate the operands of new mov instructions to add implicit - // uses of EXEC, but adding them invalidates the use_iterator, so defer - // this. - SmallVector<MachineInstr *, 4> CopiesToReplace; - - std::vector<FoldCandidate> FoldList; - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - unsigned OpNo = Use.getOperandNo(); - - if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &*Use; - NonInlineUseOpNo = OpNo; - } - } - } - - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } else { - // Folding register. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - - foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } - - // Make sure we add EXEC uses to any new v_mov instructions created. - for (MachineInstr *Copy : CopiesToReplace) - Copy->addImplicitDefUseOperands(MF); - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (Fold.isReg()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - // FIXME: Probably shouldn't bother trying to fold if not an - // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR - // copies. - MRI.clearKillFlags(Fold.OpToFold->getReg()); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); - - // Folding the immediate may reveal operations that can be constant - // folded or replaced with a copy. This can happen for example after - // frame indices are lowered to constants or from splitting 64-bit - // constants. - tryConstantFoldOp(MRI, TII, Fold.UseMI); - } - } + foldInstOperand(MI, OpToFold); } } return false; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 34096e158039..ebaefae3bfef 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -557,6 +557,27 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +def FPVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isVReg"; +} + +def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { + let PrintMethod = "printOperandAndFPInputMods"; +} + +def IntVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isVReg"; +} + +def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { + let PrintMethod = "printOperandAndIntInputMods"; +} + + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// @@ -761,6 +782,15 @@ class getSrcMod <ValueType VT> { ); } +// Return type of input modifiers operand specified input operand for SDWA/DPP +class getSrcModExt <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -1001,6 +1031,11 @@ class VOPProfile <list<ValueType> _ArgVT> { field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; + field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; + field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; + field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; @@ -1038,15 +1073,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; + field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod>.ret; + HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, DstVT>.ret; + HasModifiers, Src0ModSDWA, Src1ModSDWA, + DstVT>.ret; field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index bc35c2edc8d3..b86c04191189 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -871,6 +871,11 @@ def : Pat < >; def : Pat < + (i16 (sext_inreg i16:$src, i1)), + (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 +>; + +def : Pat < (i16 (sext_inreg i16:$src, i8)), (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index b27d7c691032..dd31dc690840 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -84,12 +84,17 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add // a special case for it. It can only be shrunk if the third operand // is vcc. We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrining + // a register allocation hint pre-regalloc and then do the shrinking // post-regalloc. if (Src2) { switch (MI.getOpcode()) { default: return false; + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + // Additional verification is needed for sdst/src2. + return true; + case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: if (!isVGPR(Src2, TRI, MRI) || @@ -174,7 +179,7 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig) { for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.getReg() == AMDGPU::VCC) { + if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { Use.setIsUndef(Orig.isUndef()); Use.setIsKill(Orig.isKill()); return; @@ -456,6 +461,31 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + // Check for the bool flag output for instructions like V_ADD_I32_e64. + const MachineOperand *SDst = TII->getNamedOperand(MI, + AMDGPU::OpName::sdst); + + // Check the carry-in operand for v_addc_u32_e64. + const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); + + if (SDst) { + if (SDst->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); + continue; + } + + // All of the instructions with carry outs also have an SGPR input in + // src2. + if (Src2 && Src2->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) + MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); + + continue; + } + } + // We can shrink this instruction DEBUG(dbgs() << "Shrinking " << MI); @@ -481,8 +511,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src1) Inst32.addOperand(*Src1); - const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td index bff706cdc1dc..a15b9ceff2f4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -232,7 +232,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins Src0RC32:$vdst, Int32InputMods:$src0_modifiers, VCSrc_b32:$src0, + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 20fb7f7bcab7..00e5ab3db0b7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -183,13 +183,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; - let InsDPP = (ins FP32InputMods:$src0_modifiers, Src0DPP:$src0, - FP32InputMods:$src1_modifiers, Src1DPP:$src1, + let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins FP32InputMods:$src0_modifiers, Src0SDWA:$src0, - FP32InputMods:$src1_modifiers, Src1SDWA:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td index c431d9db801e..16a456da3c67 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -517,8 +517,8 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; - let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, - Int32InputMods:$src1_modifiers, Src1RC64:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; let HasSrc1Mods = 0; diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index afba1587a743..32b7c87e61bb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -608,15 +608,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have // a __gnu_ prefix (which is the default). if (Subtarget->isTargetAEABI()) { - setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); - setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, + { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, + { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } } if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else addRegisterClass(MVT::i32, &ARM::GPRRegClass); + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); @@ -976,6 +988,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); + // Register based DivRem for AEABI (RTABI 4.2) if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || @@ -984,29 +997,49 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; - for (const auto &LC : - {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32}) - setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv" - : "__aeabi_idivmod"); - setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows() - ? "__rt_sdiv64" - : "__aeabi_ldivmod"); - for (const auto &LC : - {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32}) - setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv" - : "__aeabi_uidivmod"); - setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows() - ? "__rt_udiv64" - : "__aeabi_uldivmod"); - - setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); + if (Subtarget->isTargetWindows()) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, + + { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } + } else { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, + + { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } + } setOperationAction(ISD::SDIVREM, MVT::i32, Custom); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); @@ -3305,11 +3338,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::arm_rbit: { - assert(Op.getOperand(1).getValueType() == MVT::i32 && - "RBIT intrinsic must have i32 type!"); - return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); - } case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -9232,12 +9260,102 @@ SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, return SDValue(); } -// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction -// (only after legalization). -static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, +static bool IsVUZPShuffleNode(SDNode *N) { + // VUZP shuffle node. + if (N->getOpcode() == ARMISD::VUZP) + return true; + + // "VUZP" on i32 is an alias for VTRN. + if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) + return true; + + return false; +} + +static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { + // Look for ADD(VUZP.0, VUZP.1). + if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || + N0 == N1) + return SDValue(); + + // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. + if (!N->getValueType(0).is64BitVector()) + return SDValue(); + // Generate vpadd. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + SDNode *Unzip = N0.getNode(); + EVT VT = N->getValueType(0); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + Ops.push_back(Unzip->getOperand(0)); + Ops.push_back(Unzip->getOperand(1)); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); +} + +static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Check for two extended operands. + if (!(N0.getOpcode() == ISD::SIGN_EXTEND && + N1.getOpcode() == ISD::SIGN_EXTEND) && + !(N0.getOpcode() == ISD::ZERO_EXTEND && + N1.getOpcode() == ISD::ZERO_EXTEND)) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + + // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) + if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || + N00 == N10) + return SDValue(); + + // We only recognize Q register paddl here; this can't be reached until + // after type legalization. + if (!N00.getValueType().is64BitVector() || + !N0.getValueType().is128BitVector()) + return SDValue(); + + // Generate vpaddl. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + EVT VT = N->getValueType(0); + + SmallVector<SDValue, 8> Ops; + // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. + unsigned Opcode; + if (N0.getOpcode() == ISD::SIGN_EXTEND) + Opcode = Intrinsic::arm_neon_vpaddls; + else + Opcode = Intrinsic::arm_neon_vpaddlu; + Ops.push_back(DAG.getConstant(Opcode, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + EVT ElemTy = N00.getValueType().getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, + N00.getOperand(0), N00.getOperand(1)); + Ops.push_back(Concat); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); +} + +// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in +// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is +// much easier to match. +static SDValue +AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { // Only perform optimization if after legalize, and if NEON is available. We // also expected both operands to be BUILD_VECTORs. if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() @@ -9293,6 +9411,10 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } + // Don't generate vpaddl+vmovn; we'll match it to vpadd later. + if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) + return SDValue(); + // Create VPADDL node. SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -9564,9 +9686,15 @@ static SDValue PerformADDCCombine(SDNode *N, static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget){ + // Attempt to create vpadd for this add. + if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) + return Result; // Attempt to create vpaddl for this add. - if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget)) + if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) + return Result; + if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, + Subtarget)) return Result; // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index 5255d82d647a..7a7f91f4d3c4 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -16,16 +16,28 @@ #define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H #include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetLowering.h" -#include <vector> +#include <utility> namespace llvm { - class ARMConstantPoolValue; - class ARMSubtarget; + +class ARMSubtarget; +class InstrItineraryData; namespace ARMISD { + // ARM Specific DAG Nodes enum NodeType : unsigned { // Start the numbering where the builtin ops and target ops leave off. @@ -217,12 +229,15 @@ namespace llvm { VST3LN_UPD, VST4LN_UPD }; - } + + } // end namespace ARMISD /// Define some predicates that are used for node matching. namespace ARM { + bool isBitFieldInvertedMask(unsigned v); - } + + } // end namespace ARM //===--------------------------------------------------------------------===// // ARMTargetLowering - ARM Implementation of the TargetLowering interface @@ -531,6 +546,7 @@ namespace llvm { std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const; typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; + void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, @@ -623,6 +639,7 @@ namespace llvm { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( MachineBasicBlock *Entry, @@ -644,9 +661,8 @@ namespace llvm { unsigned ArgOffset, unsigned TotalArgRegsSaveSize, bool ForceMutable = false) const; - SDValue - LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; /// HandleByVal - Target-specific cleanup for ByVal support. void HandleByVal(CCState *, unsigned &, unsigned) const override; @@ -712,9 +728,12 @@ namespace llvm { }; namespace ARM { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); - } -} -#endif // ARMISELLOWERING_H + } // end namespace ARM + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp index 9bd036a1eace..324087d670b5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -29,7 +29,33 @@ using namespace llvm; // into an ARMGenRegisterBankInfo.def (similar to AArch64). namespace llvm { namespace ARM { -RegisterBank GPRRegBank; +const uint32_t GPRCoverageData[] = { + // Classes 0-31 + (1u << ARM::GPRRegClassID) | (1u << ARM::GPRwithAPSRRegClassID) | + (1u << ARM::GPRnopcRegClassID) | (1u << ARM::rGPRRegClassID) | + (1u << ARM::hGPRRegClassID) | (1u << ARM::tGPRRegClassID) | + (1u << ARM::GPRnopc_and_hGPRRegClassID) | + (1u << ARM::hGPR_and_rGPRRegClassID) | (1u << ARM::tcGPRRegClassID) | + (1u << ARM::tGPR_and_tcGPRRegClassID) | (1u << ARM::GPRspRegClassID) | + (1u << ARM::hGPR_and_tcGPRRegClassID), + // Classes 32-63 + 0, + // Classes 64-96 + 0, + // FIXME: Some of the entries below this point can be safely removed once + // this is tablegenerated. It's only needed because of the hardcoded + // register class limit. + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +RegisterBank GPRRegBank(ARM::GPRRegBankID, "GPRB", 32, ARM::GPRCoverageData); RegisterBank *RegBanks[] = {&GPRRegBank}; RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank}; @@ -51,14 +77,11 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) return; AlreadyInit = true; - // Initialize the GPR bank. - createRegisterBank(ARM::GPRRegBankID, "GPRB"); - - addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRRegClassID, TRI); - addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRwithAPSRRegClassID, TRI); const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID); (void)RBGPR; assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + + // Initialize the GPR bank. assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) && "Subclass not added?"); assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) && diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index cc001b596785..2b6b36bc3e68 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -433,7 +433,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 731a5adf3d73..3c83cd92a61a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -114,7 +114,8 @@ public: TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h index 7fcb3ce45bbb..d95c16fc3caf 100644 --- a/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -54,7 +54,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()) { int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 26e0f9a94368..f28e8b36fdbc 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -14,11 +14,13 @@ #include "MipsMachineFunction.h" #include "MipsRegisterInfo.h" #include "MipsTargetMachine.h" +#include "llvm/ADT/APInt.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -1456,9 +1458,12 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { return Result; } -static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) { - return DAG.getConstant(Op->getConstantOperandVal(ImmOp), SDLoc(Op), - Op->getValueType(0)); +static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG, + bool IsSigned = false) { + return DAG.getConstant( + APInt(Op->getValueType(0).getScalarType().getSizeInBits(), + Op->getConstantOperandVal(ImmOp), IsSigned), + SDLoc(Op), Op->getValueType(0)); } static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue, @@ -1564,8 +1569,8 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) { SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - - switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) { + unsigned Intrinsic = cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue(); + switch (Intrinsic) { default: return SDValue(); case Intrinsic::mips_shilo: @@ -1635,6 +1640,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, // binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear) EVT VecTy = Op->getValueType(0); EVT EltTy = VecTy.getVectorElementType(); + if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits()) + report_fatal_error("Immediate out of range"); APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(), Op->getConstantOperandVal(3)); return DAG.getNode(ISD::VSELECT, DL, VecTy, @@ -1648,6 +1655,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, // binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear) EVT VecTy = Op->getValueType(0); EVT EltTy = VecTy.getVectorElementType(); + if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits()) + report_fatal_error("Immediate out of range"); APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(), Op->getConstantOperandVal(3)); return DAG.getNode(ISD::VSELECT, DL, VecTy, @@ -1741,7 +1750,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_ceqi_w: case Intrinsic::mips_ceqi_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETEQ); case Intrinsic::mips_cle_s_b: case Intrinsic::mips_cle_s_h: case Intrinsic::mips_cle_s_w: @@ -1753,7 +1762,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_clei_s_w: case Intrinsic::mips_clei_s_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETLE); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLE); case Intrinsic::mips_cle_u_b: case Intrinsic::mips_cle_u_h: case Intrinsic::mips_cle_u_w: @@ -1777,7 +1786,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_clti_s_w: case Intrinsic::mips_clti_s_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETLT); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLT); case Intrinsic::mips_clt_u_b: case Intrinsic::mips_clt_u_h: case Intrinsic::mips_clt_u_w: @@ -1990,15 +1999,28 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_insve_b: case Intrinsic::mips_insve_h: case Intrinsic::mips_insve_w: - case Intrinsic::mips_insve_d: + case Intrinsic::mips_insve_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_insve_b: Max = 15; break; + case Intrinsic::mips_insve_h: Max = 7; break; + case Intrinsic::mips_insve_w: Max = 3; break; + case Intrinsic::mips_insve_d: Max = 1; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), Op->getOperand(3), DAG.getConstant(0, DL, MVT::i32)); + } case Intrinsic::mips_ldi_b: case Intrinsic::mips_ldi_h: case Intrinsic::mips_ldi_w: case Intrinsic::mips_ldi_d: - return lowerMSASplatImm(Op, 1, DAG); + return lowerMSASplatImm(Op, 1, DAG, true); case Intrinsic::mips_lsa: case Intrinsic::mips_dlsa: { EVT ResTy = Op->getValueType(0); @@ -2032,7 +2054,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_maxi_s_w: case Intrinsic::mips_maxi_s_d: return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0), - Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true)); case Intrinsic::mips_maxi_u_b: case Intrinsic::mips_maxi_u_h: case Intrinsic::mips_maxi_u_w: @@ -2056,7 +2078,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_mini_s_w: case Intrinsic::mips_mini_s_d: return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0), - Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true)); case Intrinsic::mips_mini_u_b: case Intrinsic::mips_mini_u_h: case Intrinsic::mips_mini_u_w: @@ -2129,11 +2151,59 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_pcnt_w: case Intrinsic::mips_pcnt_d: return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1)); + case Intrinsic::mips_sat_s_b: + case Intrinsic::mips_sat_s_h: + case Intrinsic::mips_sat_s_w: + case Intrinsic::mips_sat_s_d: + case Intrinsic::mips_sat_u_b: + case Intrinsic::mips_sat_u_h: + case Intrinsic::mips_sat_u_w: + case Intrinsic::mips_sat_u_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_sat_s_b: + case Intrinsic::mips_sat_u_b: Max = 7; break; + case Intrinsic::mips_sat_s_h: + case Intrinsic::mips_sat_u_h: Max = 15; break; + case Intrinsic::mips_sat_s_w: + case Intrinsic::mips_sat_u_w: Max = 31; break; + case Intrinsic::mips_sat_s_d: + case Intrinsic::mips_sat_u_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_shf_b: case Intrinsic::mips_shf_h: - case Intrinsic::mips_shf_w: + case Intrinsic::mips_shf_w: { + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > 255) + report_fatal_error("Immediate out of range"); return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0), Op->getOperand(2), Op->getOperand(1)); + } + case Intrinsic::mips_sldi_b: + case Intrinsic::mips_sldi_h: + case Intrinsic::mips_sldi_w: + case Intrinsic::mips_sldi_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_sldi_b: Max = 15; break; + case Intrinsic::mips_sldi_h: Max = 7; break; + case Intrinsic::mips_sldi_w: Max = 3; break; + case Intrinsic::mips_sldi_d: Max = 1; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(3))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_sll_b: case Intrinsic::mips_sll_h: case Intrinsic::mips_sll_w: @@ -2176,6 +2246,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_srai_d: return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + case Intrinsic::mips_srari_b: + case Intrinsic::mips_srari_h: + case Intrinsic::mips_srari_w: + case Intrinsic::mips_srari_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_srari_b: Max = 7; break; + case Intrinsic::mips_srari_h: Max = 15; break; + case Intrinsic::mips_srari_w: Max = 31; break; + case Intrinsic::mips_srari_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_srl_b: case Intrinsic::mips_srl_h: case Intrinsic::mips_srl_w: @@ -2188,6 +2276,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_srli_d: return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + case Intrinsic::mips_srlri_b: + case Intrinsic::mips_srlri_h: + case Intrinsic::mips_srlri_w: + case Intrinsic::mips_srlri_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_srlri_b: Max = 7; break; + case Intrinsic::mips_srlri_h: Max = 15; break; + case Intrinsic::mips_srlri_w: Max = 31; break; + case Intrinsic::mips_srlri_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_subv_b: case Intrinsic::mips_subv_h: case Intrinsic::mips_subv_w: @@ -2219,7 +2325,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, } } -static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { +static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, + const MipsSubtarget &Subtarget) { SDLoc DL(Op); SDValue ChainIn = Op->getOperand(0); SDValue Address = Op->getOperand(2); @@ -2227,6 +2334,12 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { EVT ResTy = Op->getValueType(0); EVT PtrTy = Address->getValueType(0); + // For N64 addresses have the underlying type MVT::i64. This intrinsic + // however takes an i32 signed constant offset. The actual type of the + // intrinsic is a scaled signed i10. + if (Subtarget.isABI_N64()) + Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset); + Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), /* Alignment = */ 16); @@ -2282,11 +2395,12 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::mips_ld_h: case Intrinsic::mips_ld_w: case Intrinsic::mips_ld_d: - return lowerMSALoadIntr(Op, DAG, Intr); + return lowerMSALoadIntr(Op, DAG, Intr, Subtarget); } } -static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { +static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, + const MipsSubtarget &Subtarget) { SDLoc DL(Op); SDValue ChainIn = Op->getOperand(0); SDValue Value = Op->getOperand(2); @@ -2294,6 +2408,12 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { SDValue Offset = Op->getOperand(4); EVT PtrTy = Address->getValueType(0); + // For N64 addresses have the underlying type MVT::i64. This intrinsic + // however takes an i32 signed constant offset. The actual type of the + // intrinsic is a scaled signed i10. + if (Subtarget.isABI_N64()) + Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset); + Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), @@ -2310,7 +2430,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op, case Intrinsic::mips_st_h: case Intrinsic::mips_st_w: case Intrinsic::mips_st_d: - return lowerMSAStoreIntr(Op, DAG, Intr); + return lowerMSAStoreIntr(Op, DAG, Intr, Subtarget); } } @@ -3377,8 +3497,12 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); unsigned Wd = MI.getOperand(0).getReg(); unsigned Fs = MI.getOperand(1).getReg(); - unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); - unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + unsigned Wt1 = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); + unsigned Wt2 = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1); BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2) diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h index a2d670f8d39d..7fc0156216f5 100644 --- a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h +++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h @@ -27,7 +27,8 @@ class ManagedStringPool { SmallVector<std::string *, 8> Pool; public: - ManagedStringPool() {} + ManagedStringPool() = default; + ~ManagedStringPool() { SmallVectorImpl<std::string *>::iterator Current = Pool.begin(); while (Current != Pool.end()) { @@ -43,6 +44,6 @@ public: } }; -} +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 04c8d5c0443e..3c2594c77f45 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -12,42 +12,83 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXAsmPrinter.h" #include "InstPrinter/NVPTXInstPrinter.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" -#include "NVPTXInstrInfo.h" +#include "NVPTXAsmPrinter.h" #include "NVPTXMCExpr.h" #include "NVPTXMachineFunctionInfo.h" #include "NVPTXRegisterInfo.h" +#include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" #include "cl_common_defines.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Mangler.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cassert> +#include <cstdint> +#include <cstring> +#include <new> #include <sstream> +#include <string> +#include <utility> +#include <vector> + using namespace llvm; #define DEPOTNAME "__local_depot" @@ -62,11 +103,11 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden, cl::desc("NVPTX Specific: Emit source line in ptx file"), cl::init(false)); -namespace { /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V /// depends. -void DiscoverDependentGlobals(const Value *V, - DenseSet<const GlobalVariable *> &Globals) { +static void +DiscoverDependentGlobals(const Value *V, + DenseSet<const GlobalVariable *> &Globals) { if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) Globals.insert(GV); else { @@ -80,11 +121,12 @@ void DiscoverDependentGlobals(const Value *V, /// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable /// instances to be emitted, but only after any dependents have been added -/// first. -void VisitGlobalVariableForEmission( - const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order, - DenseSet<const GlobalVariable *> &Visited, - DenseSet<const GlobalVariable *> &Visiting) { +/// first.s +static void +VisitGlobalVariableForEmission(const GlobalVariable *GV, + SmallVectorImpl<const GlobalVariable *> &Order, + DenseSet<const GlobalVariable *> &Visited, + DenseSet<const GlobalVariable *> &Visiting) { // Have we already visited this one? if (Visited.count(GV)) return; @@ -108,7 +150,6 @@ void VisitGlobalVariableForEmission( Visited.insert(GV); Visiting.erase(GV); } -} void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) { if (!EmitLineNumbers) @@ -369,7 +410,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { } else if (Ty->isAggregateType() || Ty->isVectorTy()) { unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; - if (!llvm::getAlign(*F, 0, retAlignment)) + if (!getAlign(*F, 0, retAlignment)) retAlignment = DL.getABITypeAlignment(Ty); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; @@ -401,7 +442,6 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { } } O << ") "; - return; } void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF, @@ -459,7 +499,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { MRI = &MF->getRegInfo(); F = MF->getFunction(); emitLinkageDirective(F, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) O << ".entry "; else { O << ".func "; @@ -470,7 +510,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { emitFunctionParamList(*MF, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) emitKernelFunctionDirectives(*F, O); OutStreamer->EmitRawText(O.str()); @@ -513,15 +553,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of reqntid* is specified, don't output reqntid directive. unsigned reqntidx, reqntidy, reqntidz; bool specified = false; - if (!llvm::getReqNTIDx(F, reqntidx)) + if (!getReqNTIDx(F, reqntidx)) reqntidx = 1; else specified = true; - if (!llvm::getReqNTIDy(F, reqntidy)) + if (!getReqNTIDy(F, reqntidy)) reqntidy = 1; else specified = true; - if (!llvm::getReqNTIDz(F, reqntidz)) + if (!getReqNTIDz(F, reqntidz)) reqntidz = 1; else specified = true; @@ -535,15 +575,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of maxntid* is specified, don't output maxntid directive. unsigned maxntidx, maxntidy, maxntidz; specified = false; - if (!llvm::getMaxNTIDx(F, maxntidx)) + if (!getMaxNTIDx(F, maxntidx)) maxntidx = 1; else specified = true; - if (!llvm::getMaxNTIDy(F, maxntidy)) + if (!getMaxNTIDy(F, maxntidy)) maxntidy = 1; else specified = true; - if (!llvm::getMaxNTIDz(F, maxntidz)) + if (!getMaxNTIDz(F, maxntidz)) maxntidz = 1; else specified = true; @@ -553,11 +593,11 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, << "\n"; unsigned mincta; - if (llvm::getMinCTASm(F, mincta)) + if (getMinCTASm(F, mincta)) O << ".minnctapersm " << mincta << "\n"; unsigned maxnreg; - if (llvm::getMaxNReg(F, maxnreg)) + if (getMaxNReg(F, maxnreg)) O << ".maxnreg " << maxnreg << "\n"; } @@ -617,12 +657,9 @@ void NVPTXAsmPrinter::printVecModifiedImmediate( llvm_unreachable("Unknown Modifier on immediate operand"); } - - void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { - emitLinkageDirective(F, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) O << ".entry "; else O << ".func "; @@ -684,7 +721,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (!gv->hasInternalLinkage()) return false; PointerType *Pty = gv->getType(); - if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) + if (Pty->getAddressSpace() != ADDRESS_SPACE_SHARED) return false; const Function *oneFunc = nullptr; @@ -699,7 +736,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { } static bool useFuncSeen(const Constant *C, - llvm::DenseMap<const Function *, bool> &seenMap) { + DenseMap<const Function *, bool> &seenMap) { for (const User *U : C->users()) { if (const Constant *cu = dyn_cast<Constant>(U)) { if (useFuncSeen(cu, seenMap)) @@ -719,7 +756,7 @@ static bool useFuncSeen(const Constant *C, } void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { - llvm::DenseMap<const Function *, bool> seenMap; + DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { const Function *F = &*FI; @@ -1040,7 +1077,6 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool processDemoted) { - // Skip meta data if (GVar->hasSection()) { if (GVar->getSection() == "llvm.metadata") @@ -1069,13 +1105,13 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ".weak "; } - if (llvm::isTexture(*GVar)) { - O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n"; + if (isTexture(*GVar)) { + O << ".global .texref " << getTextureName(*GVar) << ";\n"; return; } - if (llvm::isSurface(*GVar)) { - O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n"; + if (isSurface(*GVar)) { + O << ".global .surfref " << getSurfaceName(*GVar) << ";\n"; return; } @@ -1088,8 +1124,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, return; } - if (llvm::isSampler(*GVar)) { - O << ".global .samplerref " << llvm::getSamplerName(*GVar); + if (isSampler(*GVar)) { + O << ".global .samplerref " << getSamplerName(*GVar); const Constant *Initializer = nullptr; if (GVar->hasInitializer()) @@ -1150,12 +1186,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } if (GVar->hasPrivateLinkage()) { - - if (!strncmp(GVar->getName().data(), "unrollpragma", 12)) + if (strncmp(GVar->getName().data(), "unrollpragma", 12) == 0) return; // FIXME - need better way (e.g. Metadata) to avoid generating this global - if (!strncmp(GVar->getName().data(), "filename", 8)) + if (strncmp(GVar->getName().data(), "filename", 8) == 0) return; if (GVar->use_empty()) return; @@ -1199,8 +1234,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Ptx allows variable initilization only for constant and global state // spaces. if (GVar->hasInitializer()) { - if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) { + if ((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) { const Constant *Initializer = GVar->getInitializer(); // 'undef' is treated as there is no value specified. if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) { @@ -1233,8 +1268,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, ElementSize = DL.getTypeStoreSize(ETy); // Ptx allows variable initilization only for constant and // global state spaces. - if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && + if (((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { const Constant *Initializer = GVar->getInitializer(); if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) { @@ -1285,7 +1320,6 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, default: llvm_unreachable("type not supported yet"); } - } O << ";\n"; } @@ -1305,16 +1339,16 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const { switch (AddressSpace) { - case llvm::ADDRESS_SPACE_LOCAL: + case ADDRESS_SPACE_LOCAL: O << "local"; break; - case llvm::ADDRESS_SPACE_GLOBAL: + case ADDRESS_SPACE_GLOBAL: O << "global"; break; - case llvm::ADDRESS_SPACE_CONST: + case ADDRESS_SPACE_CONST: O << "const"; break; - case llvm::ADDRESS_SPACE_SHARED: + case ADDRESS_SPACE_SHARED: O << "shared"; break; default: @@ -1363,7 +1397,6 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const { void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O) { - const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. @@ -1406,7 +1439,6 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, default: llvm_unreachable("type not supported yet"); } - return; } static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { @@ -1450,7 +1482,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; - bool isKernelFunc = llvm::isKernelFunction(*F); + bool isKernelFunc = isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); MVT thePointerTy = TLI->getPointerTy(DL); @@ -1533,13 +1565,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { default: O << ".ptr "; break; - case llvm::ADDRESS_SPACE_CONST: + case ADDRESS_SPACE_CONST: O << ".ptr .const "; break; - case llvm::ADDRESS_SPACE_SHARED: + case ADDRESS_SPACE_SHARED: O << ".ptr .shared "; break; - case llvm::ADDRESS_SPACE_GLOBAL: + case ADDRESS_SPACE_GLOBAL: O << ".ptr .global "; break; } @@ -1820,7 +1852,6 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) { void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { - const DataLayout &DL = getDataLayout(); if (isa<UndefValue>(CPV) || CPV->isNullValue()) { @@ -1985,7 +2016,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, // buildTypeNameMap - Run through symbol table looking for type names. // - bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -2100,7 +2130,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; CE->printAsOperand(OS, /*PrintType=*/ false, - !MF ? 0 : MF->getFunction()->getParent()); + !MF ? nullptr : MF->getFunction()->getParent()); report_fatal_error(OS.str()); } @@ -2330,7 +2360,7 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier) { printOperand(MI, opNum, O); - if (Modifier && !strcmp(Modifier, "add")) { + if (Modifier && strcmp(Modifier, "add") == 0) { O << ", "; printOperand(MI, opNum + 1, O); } else { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 3dcc0e358a14..8ec3476b8719 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -1,4 +1,4 @@ -//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===// +//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,17 +18,34 @@ #include "NVPTX.h" #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Value.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> #include <fstream> +#include <map> +#include <memory> +#include <string> +#include <vector> // The ptx syntax and format is very different from that usually seem in a .s // file, @@ -40,7 +57,8 @@ // (subclass of MCStreamer). namespace llvm { - class MCOperand; + +class MCOperand; class LineReader { private: @@ -49,14 +67,17 @@ private: char buff[512]; std::string theFileName; SmallVector<unsigned, 32> lineOffset; + public: LineReader(std::string filename) { theCurLine = 0; fstr.open(filename.c_str()); theFileName = filename; } - std::string fileName() { return theFileName; } + ~LineReader() { fstr.close(); } + + std::string fileName() { return theFileName; } std::string readLine(unsigned line); }; @@ -107,6 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { numSymbols = 0; EmitGeneric = AP.EmitGeneric; } + unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) { assert((curpos + Num) <= size); assert((curpos + Bytes) <= size); @@ -120,6 +142,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } + unsigned addZeros(int Num) { assert((curpos + Num) <= size); for (int i = 0; i < Num; ++i) { @@ -128,12 +151,14 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } + void addSymbol(const Value *GVar, const Value *GVarBeforeStripping) { symbolPosInBuffer.push_back(curpos); Symbols.push_back(GVar); SymbolsBeforeStripping.push_back(GVarBeforeStripping); numSymbols++; } + void print() { if (numSymbols == 0) { // print out in bytes @@ -267,7 +292,7 @@ private: std::map<Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. - std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; + std::map<const Function *, std::vector<const GlobalVariable *>> localDecls; // To record filename to ID mapping std::map<std::string, unsigned> filenameMap; @@ -292,7 +317,8 @@ private: bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const; - LineReader *reader; + LineReader *reader = nullptr; + LineReader *getReader(const std::string &); // Used to control the need to emit .generic() in the initializer of @@ -312,20 +338,17 @@ public: NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == - NVPTX::CUDA) { - CurrentBankselLabelInBasicBlock = ""; - reader = nullptr; - } + NVPTX::CUDA) {} - ~NVPTXAsmPrinter() { - if (!reader) - delete reader; + ~NVPTXAsmPrinter() override { + delete reader; } bool runOnMachineFunction(MachineFunction &F) override { nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>(); return AsmPrinter::runOnMachineFunction(F); } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); AsmPrinter::getAnalysisUsage(AU); @@ -338,6 +361,7 @@ public: DebugLoc prevDebugLoc; void emitLineNumberAsDotLoc(const MachineInstr &); }; -} // end of namespace -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2e4764feff11..7a760fd38d0f 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1,3 +1,4 @@ +//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// // // The LLVM Compiler Infrastructure // @@ -11,31 +12,55 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXISelLowering.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" +#include "NVPTXISelLowering.h" +#include "NVPTXSection.h" +#include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXUtilities.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/MC/MCSectionELF.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <sstream> +#include <string> +#include <utility> +#include <vector> #undef DEBUG_TYPE #define DEBUG_TYPE "nvptx-lower" @@ -109,7 +134,6 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI) : TargetLowering(TM), nvTM(&TM), STI(STI) { - // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. @@ -981,7 +1005,7 @@ std::string NVPTXTargetLowering::getPrototype( unsigned align = 0; const CallInst *CallI = cast<CallInst>(CS->getInstruction()); // +1 because index 0 is reserved for return type alignment - if (!llvm::getAlign(*CallI, i + 1, align)) + if (!getAlign(*CallI, i + 1, align)) align = DL.getABITypeAlignment(Ty); unsigned sz = DL.getTypeAllocSize(Ty); O << ".param .align " << align << " .b8 "; @@ -1047,7 +1071,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // With bitcast'd call targets, the instruction will be the call if (isa<CallInst>(CalleeI)) { // Check if we have call alignment metadata - if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align)) + if (getAlign(*cast<CallInst>(CalleeI), Idx, Align)) return Align; const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); @@ -1070,7 +1094,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Check for function alignment information if we found that the // ultimate target is a Function if (DirectCallee) - if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align)) + if (getAlign(*cast<Function>(DirectCallee), Idx, Align)) return Align; // Call is indirect or alignment information is not available, fall back to @@ -1747,7 +1771,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. // {dHi, dLo} = {aHi, aLo} >> Amt // dHi = aHi >> Amt @@ -1761,7 +1784,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } else { - // {dHi, dLo} = {aHi, aLo} >> Amt // - if (Amt>=size) then // dLo = aHi >> (Amt-size) @@ -1809,7 +1831,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShAmt = Op.getOperand(2); if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. // {dHi, dLo} = {aHi, aLo} << Amt // dHi = shf.l.clamp aLo, aHi, Amt @@ -1823,7 +1844,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } else { - // {dHi, dLo} = {aHi, aLo} << Amt // - if (Amt>=size) then // dLo = aLo << Amt (all 0) @@ -2002,11 +2022,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { case 2: Opcode = NVPTXISD::StoreV2; break; - case 4: { + case 4: Opcode = NVPTXISD::StoreV4; break; } - } SmallVector<SDValue, 8> Ops; @@ -2140,7 +2159,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( theArgs[i], (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() : nullptr))) { - assert(llvm::isKernelFunction(*F) && + assert(isKernelFunction(*F) && "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); continue; @@ -2193,7 +2212,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( 0); assert(vtparts.size() > 0 && "empty aggregate type not expected"); bool aggregateIsPacked = false; - if (StructType *STy = llvm::dyn_cast<StructType>(Ty)) + if (StructType *STy = dyn_cast<StructType>(Ty)) aggregateIsPacked = STy->isPacked(); SDValue Arg = getParamSymbol(DAG, idx, PtrVT); @@ -2202,7 +2221,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( EVT partVT = vtparts[parti]; Value *srcValue = Constant::getNullValue( PointerType::get(partVT.getTypeForEVT(F->getContext()), - llvm::ADDRESS_SPACE_PARAM)); + ADDRESS_SPACE_PARAM)); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(offsets[parti], dl, PtrVT)); @@ -2242,7 +2261,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (NumElts == 1) { // We only have one element, so just directly load it Value *SrcValue = Constant::getNullValue(PointerType::get( - EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())), @@ -2260,7 +2279,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // f32,f32 = load ... EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2); Value *SrcValue = Constant::getNullValue(PointerType::get( - VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), @@ -2301,7 +2320,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( for (unsigned i = 0; i < NumElts; i += VecSize) { Value *SrcValue = Constant::getNullValue( PointerType::get(VecVT.getTypeForEVT(F->getContext()), - llvm::ADDRESS_SPACE_PARAM)); + ADDRESS_SPACE_PARAM)); SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(Ofst, dl, PtrVT)); SDValue P = DAG.getLoad( @@ -2335,7 +2354,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // If ABI, load from the param symbol SDValue Arg = getParamSymbol(DAG, idx, PtrVT); Value *srcValue = Constant::getNullValue(PointerType::get( - ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue p; if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? @@ -2424,7 +2443,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, DAG.getVTList(MVT::Other), Ops, EltVT, MachinePointerInfo()); - } else if (NumElts == 2) { // V2 store SDValue StoreVal0 = OutVals[0]; @@ -2558,7 +2576,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); } - void NVPTXTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { @@ -3306,7 +3323,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = true; Info.align = 0; @@ -3326,7 +3343,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); @@ -3347,7 +3364,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); @@ -3410,17 +3427,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: { + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: Info.opc = getOpcForTextureInstr(Intrinsic); Info.memVT = MVT::v4f32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_tex_1d_v4s32_s32: case Intrinsic::nvvm_tex_1d_v4s32_f32: case Intrinsic::nvvm_tex_1d_level_v4s32_f32: @@ -3532,17 +3549,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: { + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: Info.opc = getOpcForTextureInstr(Intrinsic); Info.memVT = MVT::v4i32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i8_clamp: case Intrinsic::nvvm_suld_1d_v2i8_clamp: case Intrinsic::nvvm_suld_1d_v4i8_clamp: @@ -3587,17 +3604,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i8_zero: case Intrinsic::nvvm_suld_3d_i8_zero: case Intrinsic::nvvm_suld_3d_v2i8_zero: - case Intrinsic::nvvm_suld_3d_v4i8_zero: { + case Intrinsic::nvvm_suld_3d_v4i8_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i8; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i16_clamp: case Intrinsic::nvvm_suld_1d_v2i16_clamp: case Intrinsic::nvvm_suld_1d_v4i16_clamp: @@ -3642,17 +3659,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i16_zero: case Intrinsic::nvvm_suld_3d_i16_zero: case Intrinsic::nvvm_suld_3d_v2i16_zero: - case Intrinsic::nvvm_suld_3d_v4i16_zero: { + case Intrinsic::nvvm_suld_3d_v4i16_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i16; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i32_clamp: case Intrinsic::nvvm_suld_1d_v2i32_clamp: case Intrinsic::nvvm_suld_1d_v4i32_clamp: @@ -3697,17 +3714,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i32_zero: case Intrinsic::nvvm_suld_3d_i32_zero: case Intrinsic::nvvm_suld_3d_v2i32_zero: - case Intrinsic::nvvm_suld_3d_v4i32_zero: { + case Intrinsic::nvvm_suld_3d_v4i32_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i64_clamp: case Intrinsic::nvvm_suld_1d_v2i64_clamp: case Intrinsic::nvvm_suld_1d_array_i64_clamp: @@ -3737,18 +3754,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_i64_zero: case Intrinsic::nvvm_suld_2d_array_v2i64_zero: case Intrinsic::nvvm_suld_3d_i64_zero: - case Intrinsic::nvvm_suld_3d_v2i64_zero: { + case Intrinsic::nvvm_suld_3d_v2i64_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i64; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; } - } return false; } @@ -3760,7 +3776,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { - // AddrMode - This represents an addressing mode of: // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg // @@ -4059,7 +4074,7 @@ static SDValue PerformANDCombine(SDNode *N, } bool AddTo = false; - if (AExt.getNode() != 0) { + if (AExt.getNode() != nullptr) { // Re-insert the ext as a zext. Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), AExt.getValueType(), Val); @@ -4204,7 +4219,6 @@ static bool IsMulWideOperandDemotable(SDValue Op, static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned) { - OperandSignedness LHSSign; // The LHS operand must be a demotable op diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 92a88c7f2506..0fbb0448e4c4 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -144,7 +144,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; -def true : Predicate<"1">; +def true : Predicate<"true">; def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h index cad4f5668fdf..b0472de980fc 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h @@ -1,4 +1,4 @@ -//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===// +//===- NVPTXSection.h - NVPTX-specific section representation ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,18 +14,20 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H #define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H -#include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" namespace llvm { + /// Represents a section in PTX PTX does not have sections. We create this class /// in order to use the ASMPrint interface. /// class NVPTXSection final : public MCSection { virtual void anchor(); + public: NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {} - ~NVPTXSection() {} + ~NVPTXSection() = default; /// Override this as NVPTX has its own way of printing switching /// to a section. @@ -40,4 +42,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 6c68a2c9370d..eb357e0a4d50 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -11,41 +11,28 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXTargetMachine.h" -#include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" #include "NVPTXAllocaHoisting.h" #include "NVPTXLowerAggrCopies.h" +#include "NVPTXTargetMachine.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" +#include <cassert> +#include <string> using namespace llvm; @@ -57,6 +44,7 @@ static cl::opt<bool> cl::init(false), cl::Hidden); namespace llvm { + void initializeNVVMIntrRangePass(PassRegistry&); void initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); @@ -66,7 +54,8 @@ void initializeNVPTXInferAddressSpacesPass(PassRegistry &); void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); -} + +} // end namespace llvm extern "C" void LLVMInitializeNVPTXTarget() { // Register the target. @@ -109,7 +98,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, Reloc::PIC_, CM, OL), is64bit(is64bit), - TLOF(make_unique<NVPTXTargetObjectFile>()), + TLOF(llvm::make_unique<NVPTXTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { if (TT.getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; @@ -118,7 +107,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -NVPTXTargetMachine::~NVPTXTargetMachine() {} +NVPTXTargetMachine::~NVPTXTargetMachine() = default; void NVPTXTargetMachine32::anchor() {} @@ -141,6 +130,7 @@ NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} namespace { + class NVPTXPassConfig : public TargetPassConfig { public: NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) @@ -170,6 +160,7 @@ private: // Add passes that perform straight-line scalar optimizations. void addStraightLineScalarOptimizationPasses(); }; + } // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index dc367a90594a..69c59d0296ab 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -11,14 +11,13 @@ #define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H #include "NVPTXSection.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { -class GlobalVariable; -class Module; class NVPTXTargetObjectFile : public TargetLoweringObjectFile { - public: NVPTXTargetObjectFile() { TextSection = nullptr; @@ -43,7 +42,7 @@ public: DwarfMacinfoSection = nullptr; } - virtual ~NVPTXTargetObjectFile(); + ~NVPTXTargetObjectFile() override; void Initialize(MCContext &ctx, const TargetMachine &TM) override { TargetLoweringObjectFile::Initialize(ctx, TM); @@ -52,7 +51,6 @@ public: BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS()); ReadOnlySection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly()); - StaticCtorSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); StaticDtorSection = @@ -102,4 +100,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 48928ee2d540..dd7707084948 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -115,7 +115,7 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index d953aa8a7199..b6c271ae4cbc 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -54,7 +54,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index f7785342b364..f94d1eab097d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -281,7 +281,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 8308086ccfaa..30ee2814aba1 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -71,7 +71,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 2081809def70..2d0a06af18ae 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -547,8 +547,26 @@ bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I, assert (isa<LoadInst>(I) || isa<StoreInst>(I)); Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() : I->getOperand(0)->getType()); - if (!isUInt<12>(Offset) && - (MemAccessTy->isFloatingPointTy() || MemAccessTy->isVectorTy())) + bool IsFPAccess = MemAccessTy->isFloatingPointTy(); + bool IsVectorAccess = MemAccessTy->isVectorTy(); + + // A store of an extracted vector element will be combined into a VSTE type + // instruction. + if (!IsVectorAccess && isa<StoreInst>(I)) { + Value *DataOp = I->getOperand(0); + if (isa<ExtractElementInst>(DataOp)) + IsVectorAccess = true; + } + + // A load which gets inserted into a vector element will be combined into a + // VLE type instruction. + if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) { + User *LoadUser = *I->user_begin(); + if (isa<InsertElementInst>(LoadUser)) + IsVectorAccess = true; + } + + if (!isUInt<12>(Offset) && (IsFPAccess || IsVectorAccess)) return false; return true; diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp index e16ced1661a1..8a6d28490e8c 100644 --- a/contrib/llvm/lib/Target/TargetMachine.cpp +++ b/contrib/llvm/lib/Target/TargetMachine.cpp @@ -44,7 +44,7 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, const TargetOptions &Options) : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), - RequireStructuredCFG(false), Options(Options) { + RequireStructuredCFG(false), DefaultOptions(Options), Options(Options) { if (EnableIPRA.getNumOccurrences()) this->Options.EnableIPRA = EnableIPRA; } @@ -63,14 +63,15 @@ bool TargetMachine::isPositionIndependent() const { /// \brief Reset the target options based on the function's attributes. // FIXME: This function needs to go away for a number of reasons: // a) global state on the TargetMachine is terrible in general, -// b) there's no default state here to keep, -// c) these target options should be passed only on the function +// b) these target options should be passed only on the function // and not on the TargetMachine (via TargetOptions) at all. void TargetMachine::resetTargetOptions(const Function &F) const { #define RESET_OPTION(X, Y) \ do { \ if (F.hasFnAttribute(Y)) \ Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \ + else \ + Options.X = DefaultOptions.X; \ } while (0) RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad"); @@ -87,6 +88,8 @@ void TargetMachine::resetTargetOptions(const Function &F) const { Options.FPDenormalMode = FPDenormal::PreserveSign; else if (Denormal == "positive-zero") Options.FPDenormalMode = FPDenormal::PositiveZero; + else + Options.FPDenormalMode = DefaultOptions.FPDenormalMode; } /// Returns the code generation relocation model. The choices are static, PIC, diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 529540ea4ed2..bc7020fded8c 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -663,6 +663,9 @@ bool WebAssemblyFastISel::fastLowerArguments() { for (auto const &Arg : F->args()) MFI->addParam(getLegalType(getSimpleType(Arg.getType()))); + if (!F->getReturnType()->isVoidTy()) + MFI->addResult(getLegalType(getSimpleType(F->getReturnType()))); + return true; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index d5474a02ce01..adf904ee0269 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -62,12 +62,19 @@ ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() { // Recursively descend the def-use lists from V to find non-bitcast users of // bitcasts of V. static void FindUses(Value *V, Function &F, - SmallVectorImpl<std::pair<Use *, Function *>> &Uses) { + SmallVectorImpl<std::pair<Use *, Function *>> &Uses, + SmallPtrSetImpl<Constant *> &ConstantBCs) { for (Use &U : V->uses()) { if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) - FindUses(BC, F, Uses); - else if (U.get()->getType() != F.getType()) + FindUses(BC, F, Uses, ConstantBCs); + else if (U.get()->getType() != F.getType()) { + if (isa<Constant>(U.get())) { + // Only add constant bitcasts to the list once; they get RAUW'd + auto c = ConstantBCs.insert(cast<Constant>(U.get())); + if (!c.second) continue; + } Uses.push_back(std::make_pair(&U, &F)); + } } } @@ -122,10 +129,10 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) { bool FixFunctionBitcasts::runOnModule(Module &M) { SmallVector<std::pair<Use *, Function *>, 0> Uses; + SmallPtrSet<Constant *, 2> ConstantBCs; // Collect all the places that need wrappers. - for (Function &F : M) - FindUses(&F, F, Uses); + for (Function &F : M) FindUses(&F, F, Uses, ConstantBCs); DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index bf546dab5fbb..47aadf99e860 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) { unsigned WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost( Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2a2e3941f82d..f658609f8930 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -61,7 +61,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); /// @} diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index dc18a59a30ba..83a23d4ad680 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", "Use 8-bit divide for positive values less than 256">; -def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", "HasSlowDivide64", "true", - "Use 16-bit divide for positive values less than 65536">; + "Use 32-bit divide for positive values less than 2^32">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; @@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, + FeatureSlowDivide64, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, @@ -760,6 +761,42 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; +// TODO: The scheduler model falls to BTVER2 model. +// The znver1 model has to be put in place. +// Zen +def: ProcessorModel<"znver1", BtVer2Model, [ + FeatureADX, + FeatureAES, + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureCLFLUSHOPT, + FeatureCMPXCHG16B, + FeatureF16C, + FeatureFMA, + FeatureFSGSBase, + FeatureFXSR, + FeatureFastLZCNT, + FeatureLAHFSAHF, + FeatureLZCNT, + FeatureMMX, + FeatureMOVBE, + FeatureMWAITX, + FeaturePCLMUL, + FeaturePOPCNT, + FeaturePRFCHW, + FeatureRDRAND, + FeatureRDSEED, + FeatureSHA, + FeatureSMAP, + FeatureSSE4A, + FeatureSlowSHLD, + FeatureX87, + FeatureXSAVE, + FeatureXSAVEC, + FeatureXSAVEOPT, + FeatureXSAVES]>; + def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 8b66790679d9..8ab4c0616880 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -183,16 +183,6 @@ namespace { void PreprocessISelDAG() override; - inline bool immSext8(SDNode *N) const { - return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); - } - - // True if the 64-bit immediate fits in a 32-bit sign-extended field. - inline bool i64immSExt32(SDNode *N) const { - uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); - return (int64_t)v == (int32_t)v; - } - // Include the pieces autogenerated from the target description. #include "X86GenDAGISel.inc" diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index db76ddf04c06..787dff99367e 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2. + // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) - addBypassSlowDiv(64, 16); + addBypassSlowDiv(64, 32); } if (Subtarget.isTargetKnownWindowsMSVC() || @@ -1280,6 +1280,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); // FIXME. This commands are available on SSE/AVX2, add relevant patterns. setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal); @@ -1306,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - if (Subtarget.hasDQI()) { - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); - } + for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); @@ -8090,6 +8089,37 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, return Zeroable; } +// The Shuffle result is as follow: +// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. +// Each Zeroable's element correspond to a particular Mask's element. +// As described in computeZeroableShuffleElements function. +// +// The function looks for a sub-mask that the nonzero elements are in +// increasing order. If such sub-mask exist. The function returns true. +static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, + ArrayRef<int> Mask,const EVT &VectorType, + bool &IsZeroSideLeft) { + int NextElement = -1; + // Check if the Mask's nonzero elements are in increasing order. + for (int i = 0, e = Zeroable.size(); i < e; i++) { + // Checks if the mask's zeros elements are built from only zeros. + if (Mask[i] == -1) + return false; + if (Zeroable[i]) + continue; + // Find the lowest non zero element + if (NextElement == -1) { + NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; + IsZeroSideLeft = NextElement != 0; + } + // Exit if the mask's non zero elements are not in increasing order. + if (NextElement != Mask[i]) + return false; + NextElement++; + } + return true; +} + /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, @@ -8145,6 +8175,46 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl); + +// Function convertBitVectorToUnsigned - The function gets SmallBitVector +// as argument and convert him to unsigned. +// The output of the function is not(zeroable) +static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) { + unsigned convertBit = 0; + for (int i = 0, e = Zeroable.size(); i < e; i++) + convertBit |= !(Zeroable[i]) << i; + return convertBit; +} + +// X86 has dedicated shuffle that can be lowered to VEXPAND +static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, + const SmallBitVector &Zeroable, + ArrayRef<int> Mask, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsLeftZeroSide = true; + if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), + IsLeftZeroSide)) + return SDValue(); + unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable); + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); + unsigned NumElts = VT.getVectorNumElements(); + assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && + "Unexpected number of vector elements"); + SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), + Subtarget, DAG, DL); + SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); + SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; + return DAG.getNode(ISD::VSELECT, DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); +} + // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, @@ -12159,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. @@ -12222,12 +12297,17 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Shift; - // If we have VLX support, we can use VALIGN. - if (Subtarget.hasVLX()) + // If we have VLX support, we can use VALIGN or VEXPAND. + if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) @@ -12328,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. @@ -12392,12 +12477,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Shift; - // If we have VLX support, we can use VALIGN. - if (Subtarget.hasVLX()) + // If we have VLX support, we can use VALIGN or EXPAND. + if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) @@ -12754,6 +12844,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12796,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12832,6 +12928,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } @@ -12889,6 +12989,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } @@ -12953,6 +13057,10 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } @@ -13089,9 +13197,9 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: @@ -15187,13 +15295,13 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - assert(InVT.getVectorElementType() == MVT::i1); + if (InVT.getVectorElementType() != MVT::i1) + return SDValue(); // Extend VT if the target is 256 or 128bit vector and VLX is not supported. MVT ExtVT = VT; @@ -15910,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } } + // Sometimes flags can be set either with an AND or with an SRL/SHL + // instruction. SRL/SHL variant should be preferred for masks longer than this + // number of bits. + const int ShiftToAndMaxMaskWidth = 32; + const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. @@ -15958,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. - if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && + if (ZeroCheck && Op->hasOneUse() && isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); @@ -15968,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); - if (!Mask.isSignedIntN(32)) // Avoid large immediates. + if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); @@ -15977,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. + // because a TEST instruction will be better. However, AND should be + // preferred if the instruction can be combined into ANDN. if (!hasNonFlagsUse(Op)) { SDValue Op0 = ArithOp->getOperand(0); SDValue Op1 = ArithOp->getOperand(1); EVT VT = ArithOp.getValueType(); bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; + bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); + + // If we cannot select an ANDN instruction, check if we can replace + // AND+IMM64 with a shift before giving up. This is possible for masks + // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. + if (!isProperAndn) { + if (!ZeroCheck) + break; + + assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized"); + auto *CN = dyn_cast<ConstantSDNode>(Op1); + if (!CN) + break; + + const APInt &Mask = CN->getAPIntValue(); + if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) + break; // Prefer TEST instruction. + + unsigned BitWidth = Mask.getBitWidth(); + unsigned LeadingOnes = Mask.countLeadingOnes(); + unsigned TrailingZeros = Mask.countTrailingZeros(); + + if (LeadingOnes + TrailingZeros == BitWidth) { + assert(TrailingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); + break; + } + + unsigned LeadingZeros = Mask.countLeadingZeros(); + unsigned TrailingOnes = Mask.countTrailingOnes(); + + if (LeadingZeros + TrailingOnes == BitWidth) { + assert(LeadingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); + break; + } - // But if we can combine this into an ANDN operation, then create an AND - // now and allow it to be pattern matched into an ANDN. - if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) break; + } } LLVM_FALLTHROUGH; case ISD::SUB: @@ -16008,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { - if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + if (!NeedTruncation && ZeroCheck) { if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } @@ -17283,17 +17438,20 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - - if (VT.is512BitVector() && InVTElt != MVT::i1) { + if (VT.is512BitVector() && InVTElt != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } - assert (InVTElt == MVT::i1 && "Unexpected vector type"); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + if (InVTElt != MVT::i1) + return SDValue(); + + MVT ExtVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + SDValue V; if (Subtarget.hasDQI()) { V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); @@ -17302,7 +17460,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); - if (VT.is512BitVector()) + if (ExtVT == VT) return V; } @@ -18418,13 +18576,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - SDValue Op0 = ShAmt.getOperand(0); - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64); + ShAmt = ShAmt.getOperand(0); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else { SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -21643,14 +21801,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } if (VT == MVT::v16i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { + (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. - if (Subtarget.hasSSE41()) { + if (VT.is512BitVector()) { + // On AVX512BW targets we make use of the fact that VSELECT lowers + // to a masked blend which selects bytes based just on the sign bit + // extracted to a mask. + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } else if (Subtarget.hasSSE41()) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); @@ -28633,17 +28803,20 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() != ISD::VSELECT) return SDValue(); + assert(CondVT.isVector() && "Vector select expects a vector selector!"); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); - // Check if the first operand is all zeros.This situation only - // applies to avx512. - if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) { + // Check if the first operand is all zeros and Cond type is vXi1. + // This situation only applies to avx512. + if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && + CondVT.getVectorElementType() == MVT::i1) { //Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); + DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), + DL, CondVT)); //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); } - assert(CondVT.isVector() && "Vector select expects a vector selector!"); // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must @@ -29282,11 +29455,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Combine: +/// Combine brcond/cmov/setcc/.. based on comparing the result of +/// atomic_load_add to use EFLAGS produced by the addition +/// directly if possible. For example: +/// +/// (setcc (cmp (atomic_load_add x, -C) C), COND_E) +/// becomes: +/// (setcc (LADD x, -C), COND_E) +/// +/// and /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) -/// to: +/// becomes: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) -/// i.e., reusing the EFLAGS produced by the LOCKed instruction. +/// /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG) { @@ -29295,7 +29476,13 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); - // This only applies to variations of the common case: + // Can't replace the cmp if it has more uses than the one we're looking at. + // FIXME: We would like to be able to handle this, but would need to make sure + // all uses were updated. + if (!Cmp.hasOneUse()) + return SDValue(); + + // This applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) @@ -29314,8 +29501,9 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, return SDValue(); auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); - if (!CmpRHSC || CmpRHSC->getZExtValue() != 0) + if (!CmpRHSC) return SDValue(); + APInt Comparand = CmpRHSC->getAPIntValue(); const unsigned Opc = CmpLHS.getOpcode(); @@ -29331,16 +29519,19 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; - if (CC == X86::COND_S && Addend == 1) + if (Comparand == -Addend) { + // No change to CC. + } else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) { CC = X86::COND_LE; - else if (CC == X86::COND_NS && Addend == 1) + } else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) { CC = X86::COND_G; - else if (CC == X86::COND_G && Addend == -1) + } else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) { CC = X86::COND_GE; - else if (CC == X86::COND_LE && Addend == -1) + } else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) { CC = X86::COND_L; - else + } else { return SDValue(); + } SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), @@ -31083,10 +31274,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, /// Check if truncation with saturation form type \p SrcVT to \p DstVT /// is valid for the given \p Subtarget. -static bool -isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { +static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, + const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return false; + + // FIXME: Scalar type may be supported if we move it to vector register. + if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) + return false; + EVT SrcElVT = SrcVT.getScalarType(); EVT DstElVT = DstVT.getScalarType(); if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) @@ -31098,40 +31294,69 @@ isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { return false; } +/// Return true if VPACK* instruction can be used for the given types +/// and it is avalable on \p Subtarget. +static bool +isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { + if (Subtarget.hasSSE2()) + // v16i16 -> v16i8 + if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8) + return true; + if (Subtarget.hasSSE41()) + // v8i32 -> v8i16 + if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16) + return true; + return false; +} + /// Detect a pattern of truncation with saturation: /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// Return the source value to be truncated or SDValue() if the pattern was not -/// matched or the unsupported on the current target. -static SDValue -detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) { +/// matched. +static SDValue detectUSatPattern(SDValue In, EVT VT) { if (In.getOpcode() != ISD::UMIN) return SDValue(); - EVT InVT = In.getValueType(); - // FIXME: Scalar type may be supported if we move it to vector register. - if (!InVT.isVector() || !InVT.isSimple()) - return SDValue(); - - if (!isSATValidOnSubtarget(InVT, VT, Subtarget)) - return SDValue(); - //Saturation with truncation. We truncate from InVT to VT. - assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && + assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && "Unexpected types for truncate operation"); - SDValue SrcVal; APInt C; - if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C)) - SrcVal = In.getOperand(1); - else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) - SrcVal = In.getOperand(0); - else + if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. + return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) : + SDValue(); + } + return SDValue(); +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// The types should allow to use VPMOVUS* instruction on AVX512. +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectAVX512USatPattern(SDValue In, EVT VT, + const X86Subtarget &Subtarget) { + if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return SDValue(); + return detectUSatPattern(In, VT); +} - // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according - // the element size of the destination type. - return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ? - SrcVal : SDValue(); +static SDValue +combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue USatVal = detectUSatPattern(In, VT); + if (USatVal) { + if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL); + return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi); + } + } + return SDValue(); } /// This function detects the AVG pattern between vectors of unsigned i8/i16, @@ -31701,7 +31926,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); if (SDValue Val = - detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); @@ -32326,9 +32551,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; - // Try the truncation with unsigned saturation. - if (SDValue Val = detectUSatPattern(Src, VT, Subtarget)) - return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val); + // Try to combine truncation with unsigned saturation. + if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) + return Val; // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index d44d1395f243..230d1700b8d2 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5957,6 +5957,30 @@ let Predicates = [HasAVX512] in { (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>; } // Predicates = [HasAVX512] +// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang +// which produce unnecessary vmovs{s,d} instructions +let Predicates = [HasAVX512] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; +} // Predicates = [HasAVX512] + // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, @@ -6136,6 +6160,21 @@ def : Pat<(f32 (fpround FR64X:$src)), (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 09971d586a41..1812d01711d1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, InstrItinClass ri = arg_ri; } - // scalar let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< @@ -1923,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, } } // isCodeGenOnly = 1 +// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and +// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseAVX] + +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE2] + +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE1] + // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index 92c16214aa4a..d80dc4a9b5e8 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -216,7 +216,7 @@ protected: /// 32-bit divisions and should be used when possible. bool HasSlowDivide32; - /// True if 16-bit divides are significantly faster than + /// True if 32-bit divides are significantly faster than /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 107ed9359376..5715d826862e 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -114,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. + { ISD::FMUL, MVT::f64, 2 }, // mulsd + { ISD::FMUL, MVT::v2f64, 4 }, // mulpd + { ISD::FMUL, MVT::v4f32, 2 }, // mulps + { ISD::FDIV, MVT::f32, 17 }, // divss + { ISD::FDIV, MVT::v4f32, 39 }, // divps + { ISD::FDIV, MVT::f64, 32 }, // divsd + { ISD::FDIV, MVT::v2f64, 69 }, // divpd + { ISD::FADD, MVT::v2f64, 2 }, // addpd + { ISD::FSUB, MVT::v2f64, 2 }, // subpd + // v2i64/v4i64 mul is custom lowered as a series of long + // multiplies(3), shifts(3) and adds(2). + // slm muldq version throughput is 2 + { ISD::MUL, MVT::v2i64, 11 }, + }; + + if (ST->isSLM()) { + if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { + // Check if the operands can be shrinked into a smaller datatype. + bool Op1Signed = false; + unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); + bool Op2Signed = false; + unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); + + bool signedMode = Op1Signed | Op2Signed; + unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); + + if (OpMinSize <= 7) + return LT.first * 3; // pmullw/sext + if (!signedMode && OpMinSize <= 8) + return LT.first * 3; // pmullw/zext + if (OpMinSize <= 15) + return LT.first * 5; // pmullw/pmulhw/pshuf + if (!signedMode && OpMinSize <= 16) + return LT.first * 5; // pmullw/pmulhw/pshuf + } + if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, + LT.second)) { + return LT.first * Entry->Cost; + } + } + if (ISD == ISD::SDIV && Op2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -276,6 +323,10 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. + { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h index c013805f4321..ecaaf951cff7 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -60,7 +60,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); |